
Commit 5b5b21d

feat: CPU-only build (#51)
1 parent 3f19246 · commit 5b5b21d

10 files changed: +77 −60 lines changed

CHANGELOG.md (+1)

```diff
@@ -11,6 +11,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0

 ### Added

+- CPU-only build without `nvcc` requirement by [@XuehaiPan](https://github.com/XuehaiPan) in [#51](https://github.com/metaopt/TorchOpt/pull/51).
 - Use [`cibuildwheel`](https://github.com/pypa/cibuildwheel) to build wheels by [@XuehaiPan](https://github.com/XuehaiPan) in [#45](https://github.com/metaopt/TorchOpt/pull/45).
 - Use dynamic process number in CPU kernels by [@JieRen98](https://github.com/JieRen98) in [#42](https://github.com/metaopt/TorchOpt/pull/42).
```

CMakeLists.txt (+24 −13)

```diff
@@ -13,33 +13,43 @@
 # limitations under the License.
 # ==============================================================================

-cmake_minimum_required(VERSION 3.4)
-project(torchopt LANGUAGES CXX CUDA)
+cmake_minimum_required(VERSION 3.8)
+project(torchopt LANGUAGES CXX)

 if(NOT CMAKE_BUILD_TYPE)
   set(CMAKE_BUILD_TYPE Release)
 endif()

-find_package(CUDA REQUIRED)
-cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS All)
-list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})
-
 set(CMAKE_CXX_STANDARD 14)
-set(CMAKE_CUDA_STANDARD 14)
 set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -pthread -fPIC -fopenmp")
 set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -O3")
-set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -O3")
+
+find_package(CUDA)
+
+if(CUDA_FOUND)
+  message(STATUS "Found CUDA, enabling CUDA support.")
+  enable_language(CUDA)
+
+  cuda_select_nvcc_arch_flags(CUDA_ARCH_FLAGS All)
+  list(APPEND CUDA_NVCC_FLAGS ${CUDA_ARCH_FLAGS})
+  set(CMAKE_CUDA_STANDARD 14)
+  set(CMAKE_CUDA_FLAGS_RELEASE "${CMAKE_CUDA_FLAGS_RELEASE} -O3")
+else()
+  message(STATUS "CUDA not found, build for CPU-only.")
+endif()

 function(system)
   set(options STRIP)
   set(oneValueArgs OUTPUT_VARIABLE ERROR_VARIABLE WORKING_DIRECTORY)
   set(multiValueArgs COMMAND)
-  cmake_parse_arguments(SYSTEM
-                        "${options}"
-                        "${oneValueArgs}"
-                        "${multiValueArgs}"
-                        "${ARGN}")
+  cmake_parse_arguments(
+    SYSTEM
+    "${options}"
+    "${oneValueArgs}"
+    "${multiValueArgs}"
+    "${ARGN}"
+  )
@@ -51,6 +61,7 @@ function(system)
     ERROR_VARIABLE STDERR
     WORKING_DIRECTORY "${SYSTEM_WORKING_DIRECTORY}"
   )
+
   if("${SYSTEM_STRIP}")
     string(STRIP "${STDOUT}" STDOUT)
     string(STRIP "${STDERR}" STDERR)
```
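The top-level change distills to a small, reusable pattern: declare only `CXX` in `project()`, probe for a CUDA toolchain, and enable the CUDA language on demand. A minimal standalone sketch of that pattern (illustrative project name, not the full file above):

```cmake
# Minimal sketch: optional CUDA in an otherwise C++-only project.
cmake_minimum_required(VERSION 3.8)  # 3.8 introduced first-class CUDA language support
project(example LANGUAGES CXX)       # CUDA is no longer declared unconditionally

find_package(CUDA)  # classic FindCUDA module; sets CUDA_FOUND without aborting if absent

if(CUDA_FOUND)
  enable_language(CUDA)              # activate the CUDA toolchain only when nvcc exists
  set(CMAKE_CUDA_STANDARD 14)
else()
  message(STATUS "CUDA not found, building CPU-only.")
endif()
```

Worth knowing as background: the `FindCUDA` module used for the probe was deprecated in later CMake releases in favor of `enable_language(CUDA)` with `check_language(CUDA)`/`FindCUDAToolkit`; here it serves only as a cheap existence check.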

README.md (−1)

```diff
@@ -252,7 +252,6 @@ pip3 install --no-build-isolation --editable .

 - [ ] Support general implicit differentiation with functional programing.
 - [ ] Support more optimizers such as AdamW, RMSProp
-- [ ] CPU-accelerated optimizer

 ## Changelog
```
2 files renamed without changes.

pyproject.toml (+13 −7)

```diff
@@ -92,12 +92,18 @@ environment-pass = ["CUDA_VERSION", "TEST_TORCH_SPECS"]
 container-engine = "docker"

 before-all = """
-    CUDA_VERSION="$(echo "${CUDA_VERSION:-"${DEFAULT_CUDA_VERSION}"}" | cut -d"." -f-2)"
-    CUDA_PKG_SUFFIX="$(echo "${CUDA_VERSION}" | tr "." "-")"
-    echo "CUDA_VERSION=${CUDA_VERSION}"
-    yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
-    yum clean all
-    yum install -y nvidia-driver-latest-libs "cuda-minimal-build-${CUDA_PKG_SUFFIX}"
+    CUDA_VERSION="${CUDA_VERSION:-"${DEFAULT_CUDA_VERSION}"}"
+    if [[ "${CUDA_VERSION}" == "None" || "${CUDA_VERSION}" == "none" ]]; then
+        sed -i -E "s/__version__\\s*=\\s*.*$/\\0 + '+cpu'/" torchopt/version.py
+    else
+        CUDA_VERSION="$(echo "${CUDA_VERSION}" | cut -d"." -f-2)"
+        CUDA_PKG_SUFFIX="$(echo "${CUDA_VERSION}" | tr "." "-")"
+        echo "CUDA_VERSION=${CUDA_VERSION}"
+        yum-config-manager --add-repo https://developer.download.nvidia.com/compute/cuda/repos/rhel7/x86_64/cuda-rhel7.repo
+        yum clean all
+        yum install -y nvidia-driver-latest-libs "cuda-minimal-build-${CUDA_PKG_SUFFIX}"
+    fi
+    echo "cat torchopt/version.py"; cat torchopt/version.py
 """
 test-extras = ["test"]
 test-command = """
@@ -130,7 +136,7 @@ repair-wheel-command = """
     echo "ls ${TORCH_LIB_PATH}"; ls -lh "${TORCH_LIB_PATH}"
     python -m pip install --force-reinstall git+https://github.com/XuehaiPan/auditwheel.git@torchopt
     python -m auditwheel lddtree "{wheel}"
-    python -m auditwheel repair --wheel-dir="{dest_dir}" "{wheel}"
+    python -m auditwheel repair --no-copy-site-libs --wheel-dir="{dest_dir}" "{wheel}"
 """

 # Linter tools #################################################################
```
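The interesting branch is the new `None`/`none` case: rather than installing the CUDA toolkit into the build container, it stamps the wheel as a CPU build by rewriting `torchopt/version.py`. A sketch of what the `sed` line does once TOML unescapes the doubled backslashes (the version number is illustrative):

```sh
# As executed by the shell (TOML's "\\s"/"\\0" arrive as "\s"/"\0").
# GNU sed expands "\0" to the entire match, so the original assignment is
# kept and " + '+cpu'" is appended, yielding a PEP 440 local version label.
sed -i -E "s/__version__\s*=\s*.*$/\0 + '+cpu'/" torchopt/version.py

# Illustrative effect on torchopt/version.py:
#   before: __version__ = '0.5.0'
#   after:  __version__ = '0.5.0' + '+cpu'   # evaluates to '0.5.0+cpu'
```

The trailing `cat torchopt/version.py` in `before-all` then echoes the (possibly rewritten) file into the build log for verification.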

src/adam_op/CMakeLists.txt (+5 −28)

```diff
@@ -13,36 +13,13 @@
 # limitations under the License.
 # ==============================================================================

-# add_library(
-#     adam_op_CUDA SHARED
-#     adam_op_impl.cu
-# )
+set(adam_op_src adam_op.cpp adam_op_impl_cpu.cpp)

-# target_link_libraries(
-#     adam_op_CUDA
-#     ${TORCH_LIBRARIES}
-# )
+if(CUDA_FOUND)
+  list(APPEND adam_op_src adam_op_impl_cuda.cu)
+endif()

-# add_library(
-#     adam_op_CPU SHARED
-#     adam_op_impl.cpp
-# )
-
-# target_link_libraries(
-#     adam_op_CPU
-#     ${TORCH_LIBRARIES}
-# )
-
-# pybind11_add_module(adam_op adam_op.cpp)
-
-# target_link_libraries(
-#     adam_op PRIVATE
-#     adam_op_CPU
-#     adam_op_CUDA
-#     ${TORCH_LIBRARIES}
-# )
-
-pybind11_add_module(adam_op adam_op.cpp adam_op_impl.cpp adam_op_impl.cu)
+pybind11_add_module(adam_op "${adam_op_src}")

 target_link_libraries(
   adam_op PRIVATE
```
src/adam_op/adam_op.cpp (+32 −9)

```diff
@@ -18,29 +18,37 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>

-#include "include/adam_op/adam_op_impl.cuh"
-#include "include/adam_op/adam_op_impl.h"
+#include "include/adam_op/adam_op_impl_cpu.h"
+#if defined(__CUDACC__)
+#include "include/adam_op/adam_op_impl_cuda.cuh"
+#endif

 namespace torchopt {
 TensorArray<3> adamForwardInplace(const torch::Tensor& updates,
                                   const torch::Tensor& mu,
                                   const torch::Tensor& nu, const float b1,
                                   const float b2, const float eps,
                                   const float eps_root, const int count) {
+#if defined(__CUDACC__)
   if (updates.device().is_cuda()) {
     return adamForwardInplaceCUDA(updates, mu, nu, b1, b2, eps, eps_root,
                                   count);
-  } else if (updates.device().is_cpu()) {
+  }
+#endif
+  if (updates.device().is_cpu()) {
     return adamForwardInplaceCPU(updates, mu, nu, b1, b2, eps, eps_root, count);
   } else {
     throw std::runtime_error("Not implemented");
   }
 }
 torch::Tensor adamForwardMu(const torch::Tensor& updates,
                             const torch::Tensor& mu, const float b1) {
+#if defined(__CUDACC__)
   if (updates.device().is_cuda()) {
     return adamForwardMuCUDA(updates, mu, b1);
-  } else if (updates.device().is_cpu()) {
+  }
+#endif
+  if (updates.device().is_cpu()) {
     return adamForwardMuCPU(updates, mu, b1);
   } else {
     throw std::runtime_error("Not implemented");
@@ -49,9 +57,12 @@ torch::Tensor adamForwardMu(const torch::Tensor& updates,

 torch::Tensor adamForwardNu(const torch::Tensor& updates,
                             const torch::Tensor& nu, const float b2) {
+#if defined(__CUDACC__)
   if (updates.device().is_cuda()) {
     return adamForwardNuCUDA(updates, nu, b2);
-  } else if (updates.device().is_cpu()) {
+  }
+#endif
+  if (updates.device().is_cpu()) {
     return adamForwardNuCPU(updates, nu, b2);
   } else {
     throw std::runtime_error("Not implemented");
@@ -62,9 +73,12 @@ torch::Tensor adamForwardUpdates(const torch::Tensor& new_mu,
                                  const torch::Tensor& new_nu, const float b1,
                                  const float b2, const float eps,
                                  const float eps_root, const int count) {
+#if defined(__CUDACC__)
   if (new_mu.device().is_cuda()) {
     return adamForwardUpdatesCUDA(new_mu, new_nu, b1, b2, eps, eps_root, count);
-  } else if (new_mu.device().is_cpu()) {
+  }
+#endif
+  if (new_mu.device().is_cpu()) {
     return adamForwardUpdatesCPU(new_mu, new_nu, b1, b2, eps, eps_root, count);
   } else {
     throw std::runtime_error("Not implemented");
@@ -74,9 +88,12 @@ torch::Tensor adamForwardUpdates(const torch::Tensor& new_mu,
 TensorArray<2> adamBackwardMu(const torch::Tensor& dmu,
                               const torch::Tensor& updates,
                               const torch::Tensor& mu, const float b1) {
+#if defined(__CUDACC__)
   if (dmu.device().is_cuda()) {
     return adamBackwardMuCUDA(dmu, updates, mu, b1);
-  } else if (dmu.device().is_cpu()) {
+  }
+#endif
+  if (dmu.device().is_cpu()) {
     return adamBackwardMuCPU(dmu, updates, mu, b1);
   } else {
     throw std::runtime_error("Not implemented");
@@ -86,9 +103,12 @@ TensorArray<2> adamBackwardMu(const torch::Tensor& dmu,
 TensorArray<2> adamBackwardNu(const torch::Tensor& dnu,
                               const torch::Tensor& updates,
                               const torch::Tensor& nu, const float b2) {
+#if defined(__CUDACC__)
   if (dnu.device().is_cuda()) {
     return adamBackwardNuCUDA(dnu, updates, nu, b2);
-  } else if (dnu.device().is_cpu()) {
+  }
+#endif
+  if (dnu.device().is_cpu()) {
     return adamBackwardNuCPU(dnu, updates, nu, b2);
   } else {
     throw std::runtime_error("Not implemented");
@@ -100,10 +120,13 @@ TensorArray<2> adamBackwardUpdates(const torch::Tensor& dupdates,
                                    const torch::Tensor& new_mu,
                                    const torch::Tensor& new_nu, const float b1,
                                    const float b2, const int count) {
+#if defined(__CUDACC__)
   if (dupdates.device().is_cuda()) {
     return adamBackwardUpdatesCUDA(dupdates, updates, new_mu, new_nu, b1, b2,
                                    count);
-  } else if (dupdates.device().is_cpu()) {
+  }
+#endif
+  if (dupdates.device().is_cpu()) {
     return adamBackwardUpdatesCPU(dupdates, updates, new_mu, new_nu, b1, b2,
                                   count);
   } else {
```
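Every operator in this file now follows the same dispatch shape: a CUDA branch that exists only when the translation unit is compiled with CUDA support, then a CPU branch, then a fallback error. A self-contained sketch of that shape (hypothetical `scale*` functions, not TorchOpt's API):

```cpp
#include <stdexcept>

#include <torch/extension.h>

#if defined(__CUDACC__)
// Declared here, defined in a .cu file; visible only under nvcc.
torch::Tensor scaleCUDA(const torch::Tensor& x, float alpha);
#endif

torch::Tensor scaleCPU(const torch::Tensor& x, float alpha) {
  return x * alpha;  // plain ATen CPU path
}

torch::Tensor scale(const torch::Tensor& x, float alpha) {
#if defined(__CUDACC__)
  if (x.device().is_cuda()) {
    return scaleCUDA(x, alpha);  // compiled out entirely in CPU-only builds
  }
#endif
  if (x.device().is_cpu()) {
    return scaleCPU(x, alpha);
  }
  throw std::runtime_error("Not implemented");
}
```

Note that `__CUDACC__` is defined only while nvcc compiles a translation unit, so the guard reflects how each file is compiled rather than whether the package as a whole was configured with CUDA.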

src/adam_op/adam_op_impl.cpp → src/adam_op/adam_op_impl_cpu.cpp (+1 −1)

```diff
@@ -13,7 +13,7 @@
 // limitations under the License.
 // ==============================================================================

-#include "include/adam_op/adam_op_impl.h"
+#include "include/adam_op/adam_op_impl_cpu.h"

 #include <omp.h>
 #include <torch/extension.h>
```

src/adam_op/adam_op_impl.cu → src/adam_op/adam_op_impl_cuda.cu (+1 −1)

```diff
@@ -17,7 +17,7 @@

 #include <vector>

-#include "include/adam_op/adam_op_impl.cuh"
+#include "include/adam_op/adam_op_impl_cuda.cuh"
 #include "include/utils.h"

 namespace torchopt {
```
