From 46e7d63df32da62dd419a59ae157b2728b491169 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 13:24:43 -0800
Subject: [PATCH 01/22] Switched from using OpenMP for parallelism to using
 oneAPI::TBB (though not using the oneapi scope). Untested but building.

---
 .../LegacyReconstructionUtil.h                | 115 ++++--
 cpp/open3d/core/ParallelFor.h                 |  17 +-
 .../core/hashmap/CPU/CPUHashBackendBuffer.cpp |  13 +-
 cpp/open3d/core/hashmap/CPU/TBBHashBackend.h  |  77 ++--
 cpp/open3d/core/kernel/NonZeroCPU.cpp         |  24 +-
 cpp/open3d/core/kernel/ReductionCPU.cpp       | 102 ++---
 cpp/open3d/geometry/EstimateNormals.cpp       |  82 ++--
 cpp/open3d/geometry/ISSKeypoints.cpp          |  81 ++--
 cpp/open3d/geometry/Image.cpp                 | 218 +++++------
 cpp/open3d/geometry/PointCloud.cpp            | 172 +++++----
 cpp/open3d/geometry/PointCloudCluster.cpp     |  28 +-
 .../PointCloudPlanarPatchDetection.cpp        |  19 +-
 .../geometry/PointCloudSegmentation.cpp       | 108 +++---
 cpp/open3d/geometry/TriangleMesh.cpp          | 167 ++++----
 .../geometry/TriangleMeshDeformation.cpp      | 127 ++++---
 .../sensor/azure_kinect/AzureKinectSensor.cpp |  26 +-
 .../pipelines/color_map/ColorMapUtils.cpp     | 235 ++++++------
 .../pipelines/color_map/NonRigidOptimizer.cpp | 237 +++++++-----
 .../pipelines/color_map/RigidOptimizer.cpp    |  94 ++---
 .../integration/UniformTSDFVolume.cpp         | 279 +++++++-------
 cpp/open3d/pipelines/odometry/Odometry.cpp    | 327 ++++++++--------
 .../registration/FastGlobalRegistration.cpp   |  59 +--
 cpp/open3d/pipelines/registration/Feature.cpp | 161 ++++----
 .../pipelines/registration/GeneralizedICP.cpp |  16 +-
 .../pipelines/registration/Registration.cpp   | 357 +++++++++++-------
 .../pipelines/registration/Registration.h     |   5 +
 cpp/open3d/t/geometry/PointCloud.cpp          |   4 +-
 .../t/geometry/kernel/PointCloudCPU.cpp       |  13 +-
 cpp/open3d/t/geometry/kernel/UVUnwrapping.cpp |   3 +
 .../t/geometry/kernel/VoxelBlockGridImpl.h    |  20 +-
 cpp/open3d/t/io/sensor/RGBDVideoReader.cpp    |  15 +-
 .../pipelines/kernel/FillInLinearSystemImpl.h |  56 ++-
 .../t/pipelines/kernel/RGBDOdometryCPU.cpp    | 338 ++++++++---------
 .../t/pipelines/kernel/RegistrationCPU.cpp    | 232 ++++++------
 .../t/pipelines/registration/Feature.cpp      |  26 +-
 cpp/open3d/utility/Eigen.cpp                  | 172 +++++----
 cpp/open3d/utility/Parallel.cpp               |  15 +-
 cpp/open3d/utility/Parallel.h                 |  12 +-
 cpp/open3d/utility/ProgressBar.cpp            |  25 ++
 cpp/open3d/utility/ProgressBar.h              |  14 +
 cpp/open3d/utility/Random.cpp                 |  14 +-
 cpp/open3d/utility/Random.h                   |  18 +-
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/utility/ProgressBar.cpp             |  17 +
 examples/cpp/CMakeLists.txt                   |  12 +-
 examples/cpp/OpenMP.cpp                       |   2 +-
 examples/cpp/RealSenseBagReader.cpp           |  15 +-
 examples/cpp/TrimMeshBasedOnPointCloud.cpp    |  28 +-
 48 files changed, 2320 insertions(+), 1878 deletions(-)

diff --git a/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h b/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h
index 26d30bfe182..4070e808e39 100644
--- a/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h
+++ b/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h
@@ -8,6 +8,8 @@
 #pragma once
 
 #include <json/json.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
 
 #include "DebugUtil.h"
 #include "FileSystemUtil.h"
@@ -547,12 +549,15 @@ class ReconstructionPipeline {
 
         const size_t num_pairs = fragment_matching_results.size();
         if (config_["multi_threading"].asBool()) {
-#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-            for (int i = 0; i < (int)num_pairs; i++) {
-                RegisterFragmentPair(ply_files, fragment_matching_results[i].s_,
-                                     fragment_matching_results[i].t_,
-                                     fragment_matching_results[i]);
-            }
+            tbb::parallel_for(tbb::blocked_range<std::size_t>(0, num_pairs),
+                    [&](const tbb::blocked_range<std::size_t>& range){
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    RegisterFragmentPair(ply_files,
+                                         fragment_matching_results[i].s_,
+                                         fragment_matching_results[i].t_,
+                                         fragment_matching_results[i]);
+                }
+            });
         } else {
             for (size_t i = 0; i < num_pairs; i++) {
                 RegisterFragmentPair(ply_files, fragment_matching_results[i].s_,
@@ -635,13 +640,16 @@ class ReconstructionPipeline {
         }
 
         if (config_["multi_threading"].asBool()) {
-#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-            for (int i = 0; i < (int)fragment_matching_results.size(); i++) {
-                const int s = fragment_matching_results[i].s_;
-                const int t = fragment_matching_results[i].t_;
-                RefineFragmentPair(ply_files, s, t,
-                                   fragment_matching_results[i]);
-            }
+            tbb::parallel_for(tbb::blocked_range<std::size_t>(0,
+                    fragment_matching_results.size()),
+                    [&](const tbb::blocked_range<std::size_t>& range){
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    const int s = fragment_matching_results[i].s_;
+                    const int t = fragment_matching_results[i].t_;
+                    RefineFragmentPair(ply_files, s, t,
+                                       fragment_matching_results[i]);
+                }
+            });
         } else {
             for (size_t i = 0; i < fragment_matching_results.size(); i++) {
                 const int s = fragment_matching_results[i].s_;
@@ -722,37 +730,70 @@ class ReconstructionPipeline {
         matched_result.information_ = std::get<1>(result);
     }
 
+    struct PointCloudIntegrator {
+        // Globals
+        ReconstructionPipeline& pipeline;
+        const int fragment_id;
+        const std::vector<std::string>& color_files;
+        const std::vector<std::string>& depth_files;
+        const camera::PinholeCameraIntrinsic& intrinsic;
+        using PoseGraphT = pipelines::registration::PoseGraph;
+        const PoseGraphT& pose_graph;
+        // Locals
+        geometry::PointCloud fragment;
+
+        PointCloudIntegrator(ReconstructionPipeline& pipeline_,
+                             const int fragment_id_,
+                             const std::vector<std::string>& color_files_,
+                             const std::vector<std::string>& depth_files_,
+                             const camera::PinholeCameraIntrinsic& intrinsic_,
+                             const PoseGraphT& pose_graph_)
+                : pipeline(pipeline_), fragment_id(fragment_id_),
+              color_files(color_files_), depth_files(depth_files_),
+              intrinsic(intrinsic_), pose_graph(pose_graph_) {}
+
+        PointCloudIntegrator(PointCloudIntegrator& o, tbb::split)
+                : pipeline(o.pipeline), fragment_id(o.fragment_id),
+              color_files(o.color_files), depth_files(o.depth_files),
+              intrinsic(o.intrinsic), pose_graph(o.pose_graph) {}
+
+        void operator()(const tbb::blocked_range<std::size_t>& range) {
+            for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                const int i_abs = fragment_id *
+                    pipeline.config_["n_frames_per_fragment"].asInt() + i;
+                utility::LogInfo("Fragment {:03d} / {:03d} :: "
+                        "Integrate rgbd frame {:d} ({:d} of {:d}).",
+                        fragment_id, pipeline.n_fragments_ - 1, i_abs,
+                        i + 1, pose_graph.nodes_.size());
+                const geometry::RGBDImage rgbd = pipeline.ReadRGBDImage(
+                        color_files[i_abs], depth_files[i_abs], false);
+                auto pcd = geometry::PointCloud::CreateFromRGBDImage(
+                        rgbd, intrinsic, Eigen::Matrix4d::Identity(), true);
+                pcd->Transform(pose_graph.nodes_[i].pose_);
+            }
+        }
+
+        void join(PointCloudIntegrator& rhs) {
+            fragment += rhs.fragment;
+        }
+    };
+
     void IntegrateFragmentRGBD(
             int fragment_id,
             const std::vector<std::string>& color_files,
             const std::vector<std::string>& depth_files,
             const camera::PinholeCameraIntrinsic& intrinsic,
             const pipelines::registration::PoseGraph& pose_graph) {
-        geometry::PointCloud fragment;
-        const size_t graph_num = pose_graph.nodes_.size();
 
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int i = 0; i < int(graph_num); ++i) {
-            const int i_abs =
-                    fragment_id * config_["n_frames_per_fragment"].asInt() + i;
-            utility::LogInfo(
-                    "Fragment {:03d} / {:03d} :: Integrate rgbd frame {:d} "
-                    "({:d} "
-                    "of "
-                    "{:d}).",
-                    fragment_id, n_fragments_ - 1, i_abs, i + 1, graph_num);
-            const geometry::RGBDImage rgbd = ReadRGBDImage(
-                    color_files[i_abs], depth_files[i_abs], false);
-            auto pcd = geometry::PointCloud::CreateFromRGBDImage(
-                    rgbd, intrinsic, Eigen::Matrix4d::Identity(), true);
-            pcd->Transform(pose_graph.nodes_[i].pose_);
-#pragma omp critical
-            { fragment += *pcd; }
-        }
-
-        const geometry::PointCloud fragment_down = *fragment.VoxelDownSample(
-                config_["tsdf_cubic_size"].asDouble() / 512.0);
+        const size_t graph_num = pose_graph.nodes_.size();
+        PointCloudIntegrator reducer(*this, fragment_id, color_files,
+                                     depth_files, intrinsic, pose_graph);
+        tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
+                0, graph_num), reducer);
+
+        const geometry::PointCloud fragment_down =
+                *reducer.fragment.VoxelDownSample(
+                    config_["tsdf_cubic_size"].asDouble() / 512.0);
         io::WritePointCloud(
                 utility::filesystem::JoinPath(
                         config_["path_dataset"].asString(),
diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h
index d9bbe376684..ba8104fda89 100644
--- a/cpp/open3d/core/ParallelFor.h
+++ b/cpp/open3d/core/ParallelFor.h
@@ -21,6 +21,8 @@
 #include <cuda_runtime.h>
 
 #include "open3d/core/CUDAUtils.h"
+#else
+#include <tbb/parallel_for.h>
 #endif
 
 namespace open3d {
@@ -79,10 +81,17 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
         return;
     }
 
-#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-    for (int64_t i = 0; i < n; ++i) {
-        func(i);
-    }
+//    #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
+//    for (int64_t i = 0; i < n; ++i) {
+//        func(i);
+//    }
+
+    tbb::parallel_for(tbb::blocked_range<int64_t>(0, n, 32),
+            [&func](const tbb::blocked_range<int64_t>& range){
+        for (int64_t i = range.begin(); i < range.end(); ++i) {
+            func(i);
+        }
+    });
 }
 
 #endif
diff --git a/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp b/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp
index 831b65fd2cc..86a23c7e285 100644
--- a/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp
+++ b/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp
@@ -8,16 +8,21 @@
 #include "open3d/core/hashmap/HashBackendBuffer.h"
 #include "open3d/utility/Parallel.h"
 
+#include <tbb/parallel_for.h>
+
 namespace open3d {
 namespace core {
 void CPUResetHeap(Tensor& heap) {
     uint32_t* heap_ptr = heap.GetDataPtr<uint32_t>();
     int64_t capacity = heap.GetLength();
 
-#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-    for (int64_t i = 0; i < capacity; ++i) {
-        heap_ptr[i] = i;
-    }
+    tbb::parallel_for(tbb::blocked_range<int64_t>(
+            0, capacity, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<int64_t>& range){
+        for (int64_t i = range.begin(); i < range.end(); ++i) {
+            heap_ptr[i] = i;
+        }
+    });
 };
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h b/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h
index f562dc2cc9d..b72ed98bf3d 100644
--- a/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h
+++ b/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <tbb/concurrent_unordered_map.h>
+#include <tbb/parallel_for.h>
 
 #include <limits>
 #include <unordered_map>
@@ -93,15 +94,17 @@ void TBBHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
                                          int64_t count) {
     const Key* input_keys_templated = static_cast<const Key*>(input_keys);
 
-#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-    for (int64_t i = 0; i < count; ++i) {
-        const Key& key = input_keys_templated[i];
+    tbb::parallel_for(tbb::blocked_range<int64_t>(0, count, 64),
+            [=, &impl=impl_](const tbb::blocked_range<int64_t>& range){
+        for (int64_t i = range.begin(); i < range.end(); ++i) {
+            const Key& key = input_keys_templated[i];
 
-        auto iter = impl_->find(key);
-        bool flag = (iter != impl_->end());
-        output_masks[i] = flag;
-        output_buf_indices[i] = flag ? iter->second : 0;
-    }
+            auto iter = impl->find(key);
+            bool flag = (iter != impl->end());
+            output_masks[i] = flag;
+            output_buf_indices[i] = flag ? iter->second : 0;
+        }
+    });
 }
 
 template <typename Key, typename Hash, typename Eq>
@@ -177,43 +180,45 @@ void TBBHashBackend<Key, Hash, Eq>::Insert(
 
     size_t n_values = input_values_soa.size();
 
-#pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-    for (int64_t i = 0; i < count; ++i) {
-        output_buf_indices[i] = 0;
-        output_masks[i] = false;
+    tbb::parallel_for(tbb::blocked_range<int64_t>(0, count, 64),
+            [&](const tbb::blocked_range<int64_t>& range){
+        for (int64_t i = range.begin(); i < range.end(); ++i) {
+            output_buf_indices[i] = 0;
+            output_masks[i] = false;
 
-        const Key& key = input_keys_templated[i];
+            const Key& key = input_keys_templated[i];
 
-        // Try to insert a dummy buffer index.
-        auto res = impl_->insert({key, 0});
+            // Try to insert a dummy buffer index.
+            auto res = impl_->insert({key, 0});
 
-        // Lazy copy key value pair to buffer only if succeeded
-        if (res.second) {
-            buf_index_t buf_index = buffer_accessor_->DeviceAllocate();
-            void* key_ptr = buffer_accessor_->GetKeyPtr(buf_index);
+            // Lazy copy key value pair to buffer only if succeeded
+            if (res.second) {
+                buf_index_t buf_index = buffer_accessor_->DeviceAllocate();
+                void* key_ptr = buffer_accessor_->GetKeyPtr(buf_index);
 
-            // Copy templated key to buffer
-            *static_cast<Key*>(key_ptr) = key;
+                // Copy templated key to buffer
+                *static_cast<Key*>(key_ptr) = key;
 
-            // Copy/reset non-templated value in buffer
-            for (size_t j = 0; j < n_values; ++j) {
-                uint8_t* dst_value = static_cast<uint8_t*>(
-                        buffer_accessor_->GetValuePtr(buf_index, j));
+                // Copy/reset non-templated value in buffer
+                for (size_t j = 0; j < n_values; ++j) {
+                    uint8_t* dst_value = static_cast<uint8_t*>(
+                            buffer_accessor_->GetValuePtr(buf_index, j));
 
-                const uint8_t* src_value =
-                        static_cast<const uint8_t*>(input_values_soa[j]) +
-                        this->value_dsizes_[j] * i;
-                std::memcpy(dst_value, src_value, this->value_dsizes_[j]);
-            }
+                    const uint8_t* src_value =
+                            static_cast<const uint8_t*>(input_values_soa[j]) +
+                            this->value_dsizes_[j] * i;
+                    std::memcpy(dst_value, src_value, this->value_dsizes_[j]);
+                }
 
-            // Update from dummy 0
-            res.first->second = buf_index;
+                // Update from dummy 0
+                res.first->second = buf_index;
 
-            // Write to return variables
-            output_buf_indices[i] = buf_index;
-            output_masks[i] = true;
+                // Write to return variables
+                output_buf_indices[i] = buf_index;
+                output_masks[i] = true;
+            }
         }
-    }
+    });
 }
 
 template <typename Key, typename Hash, typename Eq>
diff --git a/cpp/open3d/core/kernel/NonZeroCPU.cpp b/cpp/open3d/core/kernel/NonZeroCPU.cpp
index 4bea5919154..67618aca5d0 100644
--- a/cpp/open3d/core/kernel/NonZeroCPU.cpp
+++ b/cpp/open3d/core/kernel/NonZeroCPU.cpp
@@ -12,6 +12,8 @@
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Parallel.h"
 
+#include <tbb/parallel_for.h>
+
 namespace open3d {
 namespace core {
 namespace kernel {
@@ -46,17 +48,19 @@ Tensor NonZeroCPU(const Tensor& src) {
 
     std::vector<std::vector<int64_t>> non_zero_indices_by_dimensions(
             num_dims, std::vector<int64_t>(num_non_zeros, 0));
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int64_t i = 0; i < static_cast<int64_t>(num_non_zeros); i++) {
-        int64_t non_zero_index = non_zero_indices[i];
-        for (int64_t dim = num_dims - 1; dim >= 0; dim--) {
-            void* result_ptr = result_iter.GetPtr(dim * num_non_zeros + i);
-            OPEN3D_ASSERT(result_ptr != nullptr && "Internal error.");
-            *static_cast<int64_t*>(result_ptr) = non_zero_index % shape[dim];
-            non_zero_index = non_zero_index / shape[dim];
+    tbb::parallel_for(tbb::blocked_range<int64_t>(
+            0, num_non_zeros, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<int64_t>& range){
+        for (int64_t i = range.begin(); i < range.end(); i++) {
+            int64_t non_zero_index = non_zero_indices[i];
+            for (int64_t dim = num_dims - 1; dim >= 0; dim--) {
+                void* result_ptr = result_iter.GetPtr(dim * num_non_zeros + i);
+                OPEN3D_ASSERT(result_ptr != nullptr && "Internal error.");
+                *static_cast<int64_t*>(result_ptr) = non_zero_index % shape[dim];
+                non_zero_index = non_zero_index / shape[dim];
+            }
         }
-    }
+    });
 
     return result;
 }
diff --git a/cpp/open3d/core/kernel/ReductionCPU.cpp b/cpp/open3d/core/kernel/ReductionCPU.cpp
index 7caa60b34f7..cf4c7f12e85 100644
--- a/cpp/open3d/core/kernel/ReductionCPU.cpp
+++ b/cpp/open3d/core/kernel/ReductionCPU.cpp
@@ -14,6 +14,9 @@
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Parallel.h"
 
+#include <tbb/parallel_reduce.h>
+#include <tbb/parallel_for.h>
+
 namespace open3d {
 namespace core {
 namespace kernel {
@@ -76,9 +79,7 @@ class CPUReductionEngine {
     void Run(const func_t& reduce_func, scalar_t identity) {
         // See: PyTorch's TensorIterator::parallel_reduce for the reference
         // design of reduction strategy.
-        if (utility::EstimateMaxThreads() == 1 || utility::InParallel()) {
-            LaunchReductionKernelSerial<scalar_t>(indexer_, reduce_func);
-        } else if (indexer_.NumOutputElements() <= 1) {
+        if (indexer_.NumOutputElements() <= 1) {
             LaunchReductionKernelTwoPass<scalar_t>(indexer_, reduce_func,
                                                    identity);
         } else {
@@ -111,29 +112,19 @@ class CPUReductionEngine {
                     "Internal error: two-pass reduction only works for "
                     "single-output reduction ops.");
         }
-        int64_t num_workloads = indexer.NumWorkloads();
-        int64_t num_threads = utility::EstimateMaxThreads();
-        int64_t workload_per_thread =
-                (num_workloads + num_threads - 1) / num_threads;
-        std::vector<scalar_t> thread_results(num_threads, identity);
-
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
-            int64_t start = thread_idx * workload_per_thread;
-            int64_t end = std::min(start + workload_per_thread, num_workloads);
-            for (int64_t workload_idx = start; workload_idx < end;
-                 ++workload_idx) {
+        scalar_t& dst = *reinterpret_cast<scalar_t*>(indexer.GetOutputPtr(0));
+        dst = tbb::parallel_reduce(tbb::blocked_range<int64_t>(
+                0, indexer.NumWorkloads(), utility::DefaultGrainSizeTBB()),
+                identity, [&](const tbb::blocked_range<int64_t>& range,
+                                           scalar_t so_far){
+            for (int64_t workload_idx = range.begin();
+                    workload_idx < range.end(); ++workload_idx) {
                 scalar_t* src = reinterpret_cast<scalar_t*>(
                         indexer.GetInputPtr(0, workload_idx));
-                thread_results[thread_idx] =
-                        element_kernel(*src, thread_results[thread_idx]);
+                so_far = element_kernel(*src, so_far);
             }
-        }
-        scalar_t* dst = reinterpret_cast<scalar_t*>(indexer.GetOutputPtr(0));
-        for (int64_t thread_idx = 0; thread_idx < num_threads; ++thread_idx) {
-            *dst = element_kernel(thread_results[thread_idx], *dst);
-        }
+            return so_far;
+        }, element_kernel);
     }
 
     template <typename scalar_t, typename func_t>
@@ -164,13 +155,16 @@ class CPUReductionEngine {
                     "LaunchReductionKernelTwoPass instead.");
         }
 
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int64_t i = 0; i < indexer_shape[best_dim]; ++i) {
-            Indexer sub_indexer(indexer);
-            sub_indexer.ShrinkDim(best_dim, i, 1);
-            LaunchReductionKernelSerial<scalar_t>(sub_indexer, element_kernel);
-        }
+        // TODO: could theoretically do inner reductions in parallel too with TBB
+        tbb::parallel_for(tbb::blocked_range<int64_t>(
+                0, indexer_shape[best_dim], 1),
+                [&](const tbb::blocked_range<int64_t>& range){
+            for (int64_t i = range.begin(); i < range.end(); ++i) {
+                Indexer sub_indexer(indexer);
+                sub_indexer.ShrinkDim(best_dim, i, 1);
+                LaunchReductionKernelSerial<scalar_t>(sub_indexer, element_kernel);
+            }
+        });
     }
 
 private:
@@ -191,25 +185,37 @@ class CPUArgReductionEngine {
         // sub-iteration.
         int64_t num_output_elements = indexer_.NumOutputElements();
 
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int64_t output_idx = 0; output_idx < num_output_elements;
-             output_idx++) {
-            // sub_indexer.NumWorkloads() == ipo.
-            // sub_indexer's workload_idx is indexer_'s ipo_idx.
-            Indexer sub_indexer = indexer_.GetPerOutputIndexer(output_idx);
-            scalar_t dst_val = identity;
-            for (int64_t workload_idx = 0;
-                 workload_idx < sub_indexer.NumWorkloads(); workload_idx++) {
-                int64_t src_idx = workload_idx;
-                scalar_t* src_val = reinterpret_cast<scalar_t*>(
-                        sub_indexer.GetInputPtr(0, workload_idx));
-                int64_t* dst_idx = reinterpret_cast<int64_t*>(
-                        sub_indexer.GetOutputPtr(0, workload_idx));
-                std::tie(*dst_idx, dst_val) =
-                        reduce_func(src_idx, *src_val, *dst_idx, dst_val);
+        tbb::parallel_for(tbb::blocked_range<int64_t>(
+                0, num_output_elements, 1),
+                [&](const tbb::blocked_range<int64_t>& range){
+            for (int64_t output_idx = range.begin();
+                    output_idx < range.end(); ++output_idx) {
+                // sub_indexer.NumWorkloads() == ipo.
+                // sub_indexer's workload_idx is indexer_'s ipo_idx.
+                Indexer sub_indexer = indexer_.GetPerOutputIndexer(output_idx);
+                using result_t = std::pair<int64_t, scalar_t>;
+                result_t val_idx{identity, identity};
+                val_idx = tbb::parallel_deterministic_reduce(
+                        tbb::blocked_range<int64_t>(0, sub_indexer.NumWorkloads(),
+                        utility::DefaultGrainSizeTBB()), val_idx,
+                        [&](const tbb::blocked_range<int64_t>& range,
+                            result_t so_far){
+                    for (int64_t workload_idx = range.begin();
+                            workload_idx < range.end(); ++workload_idx) {
+                        scalar_t& src_val = *reinterpret_cast<scalar_t*>(
+                            sub_indexer.GetInputPtr(0, workload_idx));
+                        so_far = reduce_func(workload_idx, src_val,
+                            std::get<0>(so_far), std::get<1>(so_far));
+                    }
+                    return so_far;
+                }, [&reduce_func](result_t a, result_t b) {
+                    return reduce_func(std::get<0>(a), std::get<1>(a),
+                            std::get<0>(b), std::get<1>(b));
+                });
+                *reinterpret_cast<int64_t *>(sub_indexer.GetOutputPtr(0)) =
+                        std::get<1>(val_idx);
             }
-        }
+        });
     }
 
 private:
diff --git a/cpp/open3d/geometry/EstimateNormals.cpp b/cpp/open3d/geometry/EstimateNormals.cpp
index 5bc9abc0ba3..3f1ddd6bda2 100644
--- a/cpp/open3d/geometry/EstimateNormals.cpp
+++ b/cpp/open3d/geometry/EstimateNormals.cpp
@@ -8,6 +8,7 @@
 #include <Eigen/Eigenvalues>
 #include <queue>
 #include <tuple>
+#include <tbb/parallel_for.h>
 
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
@@ -298,22 +299,25 @@ void PointCloud::EstimateNormals(
     } else {
         covariances = covariances_;
     }
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)covariances.size(); i++) {
-        auto normal = ComputeNormal(covariances[i], fast_normal_computation);
-        if (normal.norm() == 0.0) {
-            if (has_normal) {
-                normal = normals_[i];
-            } else {
-                normal = Eigen::Vector3d(0.0, 0.0, 1.0);
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, covariances.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            auto normal = ComputeNormal(covariances[i], fast_normal_computation);
+            if (normal.norm() == 0.0) {
+                if (has_normal) {
+                    normal = normals_[i];
+                } else {
+                    normal = Eigen::Vector3d(0.0, 0.0, 1.0);
+                }
             }
+            if (has_normal && normal.dot(normals_[i]) < 0.0) {
+                normal *= -1.0;
+            }
+            normals_[i] = normal;
         }
-        if (has_normal && normal.dot(normals_[i]) < 0.0) {
-            normal *= -1.0;
-        }
-        normals_[i] = normal;
-    }
+    });
 }
 
 void PointCloud::OrientNormalsToAlignWithDirection(
@@ -323,16 +327,18 @@ void PointCloud::OrientNormalsToAlignWithDirection(
         utility::LogError(
                 "No normals in the PointCloud. Call EstimateNormals() first.");
     }
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)points_.size(); i++) {
-        auto &normal = normals_[i];
-        if (normal.norm() == 0.0) {
-            normal = orientation_reference;
-        } else if (normal.dot(orientation_reference) < 0.0) {
-            normal *= -1.0;
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            auto &normal = normals_[i];
+            if (normal.norm() == 0.0) {
+                normal = orientation_reference;
+            } else if (normal.dot(orientation_reference) < 0.0) {
+                normal *= -1.0;
+            }
         }
-    }
+    });
 }
 
 void PointCloud::OrientNormalsTowardsCameraLocation(
@@ -341,22 +347,26 @@ void PointCloud::OrientNormalsTowardsCameraLocation(
         utility::LogError(
                 "No normals in the PointCloud. Call EstimateNormals() first.");
     }
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)points_.size(); i++) {
-        Eigen::Vector3d orientation_reference = camera_location - points_[i];
-        auto &normal = normals_[i];
-        if (normal.norm() == 0.0) {
-            normal = orientation_reference;
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            Eigen::Vector3d orientation_reference =
+                    camera_location - points_[i];
+            auto &normal = normals_[i];
             if (normal.norm() == 0.0) {
-                normal = Eigen::Vector3d(0.0, 0.0, 1.0);
-            } else {
-                normal.normalize();
+                normal = orientation_reference;
+                if (normal.norm() == 0.0) {
+                    normal = Eigen::Vector3d(0.0, 0.0, 1.0);
+                } else {
+                    normal.normalize();
+                }
+            } else if (normal.dot(orientation_reference) < 0.0) {
+                normal *= -1.0;
             }
-        } else if (normal.dot(orientation_reference) < 0.0) {
-            normal *= -1.0;
         }
-    }
+    });
 }
 
 void PointCloud::OrientNormalsConsistentTangentPlane(
diff --git a/cpp/open3d/geometry/ISSKeypoints.cpp b/cpp/open3d/geometry/ISSKeypoints.cpp
index f47abae1ad2..db4f633fc1a 100644
--- a/cpp/open3d/geometry/ISSKeypoints.cpp
+++ b/cpp/open3d/geometry/ISSKeypoints.cpp
@@ -18,11 +18,15 @@
 #include <tuple>
 #include <vector>
 
+#include <tbb/concurrent_vector.h>
+#include <tbb/parallel_for.h>
+
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/Keypoint.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/utility/Eigen.h"
 #include "open3d/utility/Logging.h"
+#include "open3d/utility/Parallel.h"
 
 namespace open3d {
 
@@ -82,52 +86,59 @@ std::shared_ptr<PointCloud> ComputeISSKeypoints(
     }
 
     std::vector<double> third_eigen_values(points.size());
-#pragma omp parallel for schedule(static) shared(third_eigen_values)
-    for (int i = 0; i < (int)points.size(); i++) {
-        std::vector<int> indices;
-        std::vector<double> dist;
-        int nb_neighbors =
-                kdtree.SearchRadius(points[i], salient_radius, indices, dist);
-        if (nb_neighbors < min_neighbors) {
-            continue;
-        }
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices;
+            std::vector<double> dist;
+            int nb_neighbors =
+                    kdtree.SearchRadius(points[i],
+                        salient_radius, indices, dist);
+            if (nb_neighbors < min_neighbors) {
+                continue;
+            }
 
-        Eigen::Matrix3d cov = utility::ComputeCovariance(points, indices);
-        if (cov.isZero()) {
-            continue;
-        }
+            Eigen::Matrix3d cov = utility::ComputeCovariance(points, indices);
+            if (cov.isZero()) {
+                continue;
+            }
 
-        Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> solver(cov);
-        const double& e1c = solver.eigenvalues()[2];
-        const double& e2c = solver.eigenvalues()[1];
-        const double& e3c = solver.eigenvalues()[0];
+            Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> solver(cov);
+            const double& e1c = solver.eigenvalues()[2];
+            const double& e2c = solver.eigenvalues()[1];
+            const double& e3c = solver.eigenvalues()[0];
 
-        if ((e2c / e1c) < gamma_21 && e3c / e2c < gamma_32) {
-            third_eigen_values[i] = e3c;
+            if ((e2c / e1c) < gamma_21 && e3c / e2c < gamma_32) {
+                third_eigen_values[i] = e3c;
+            }
         }
-    }
+    });
 
-    std::vector<size_t> kp_indices;
+    tbb::concurrent_vector<std::size_t> kp_indices;
     kp_indices.reserve(points.size());
-#pragma omp parallel for schedule(static) shared(kp_indices)
-    for (int i = 0; i < (int)points.size(); i++) {
-        if (third_eigen_values[i] > 0.0) {
-            std::vector<int> nn_indices;
-            std::vector<double> dist;
-            int nb_neighbors = kdtree.SearchRadius(points[i], non_max_radius,
-                                                   nn_indices, dist);
-
-            if (nb_neighbors >= min_neighbors &&
-                IsLocalMaxima(i, nn_indices, third_eigen_values)) {
-#pragma omp critical
-                kp_indices.emplace_back(i);
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            if (third_eigen_values[i] > 0.0) {
+                std::vector<int> nn_indices;
+                std::vector<double> dist;
+                int nb_neighbors = kdtree.SearchRadius(
+                        points[i], non_max_radius, nn_indices, dist);
+
+                if (nb_neighbors >= min_neighbors &&
+                    IsLocalMaxima(i, nn_indices, third_eigen_values)) {
+                    kp_indices.emplace_back(i);
+                }
             }
         }
-    }
+    });
+
 
     utility::LogDebug("[ComputeISSKeypoints] Extracted {} keypoints",
                       kp_indices.size());
-    return input.SelectByIndex(kp_indices);
+    return input.SelectByIndex({kp_indices.begin(), kp_indices.end()});
 }
 
 }  // namespace keypoint
diff --git a/cpp/open3d/geometry/Image.cpp b/cpp/open3d/geometry/Image.cpp
index f14f709a108..f8d6b265fb5 100644
--- a/cpp/open3d/geometry/Image.cpp
+++ b/cpp/open3d/geometry/Image.cpp
@@ -7,6 +7,8 @@
 
 #include "open3d/geometry/Image.h"
 
+#include <tbb/tbb.h>
+
 #include "open3d/utility/Parallel.h"
 
 namespace {
@@ -133,27 +135,24 @@ std::shared_ptr<Image> Image::Downsample() const {
     if (num_of_channels_ != 1 || bytes_per_channel_ != 4) {
         utility::LogError("Unsupported image format.");
     }
-    int half_width = (int)floor((double)width_ / 2.0);
-    int half_height = (int)floor((double)height_ / 2.0);
-    output->Prepare(half_width, half_height, 1, 4);
-
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int y = 0; y < output->height_; y++) {
-        for (int x = 0; x < output->width_; x++) {
-            float *p1 = PointerAt<float>(x * 2, y * 2);
-            float *p2 = PointerAt<float>(x * 2 + 1, y * 2);
-            float *p3 = PointerAt<float>(x * 2, y * 2 + 1);
-            float *p4 = PointerAt<float>(x * 2 + 1, y * 2 + 1);
-            float *p = output->PointerAt<float>(x, y);
-            *p = (*p1 + *p2 + *p3 + *p4) / 4.0f;
+    // Integer division always rounds towards zero
+    output->Prepare(width_ / 2, height_ / 2, 1, 4);
+
+    tbb::parallel_for(tbb::blocked_range2d<int, int>(
+            0, output->height_, utility::DefaultGrainSizeTBB(),
+            0, output->width_, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int>& range){
+        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
+            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
+                float *p1 = PointerAt<float>(x * 2, y * 2);
+                float *p2 = PointerAt<float>(x * 2 + 1, y * 2);
+                float *p3 = PointerAt<float>(x * 2, y * 2 + 1);
+                float *p4 = PointerAt<float>(x * 2 + 1, y * 2 + 1);
+                float *p = output->PointerAt<float>(x, y);
+                *p = (*p1 + *p2 + *p3 + *p4) / 4.0f;
+            }
         }
-    }
+    });
     return output;
 }
 
@@ -169,28 +168,30 @@ std::shared_ptr<Image> Image::FilterHorizontal(
     output->Prepare(width_, height_, 1, 4);
 
     const int half_kernel_size = (int)(floor((double)kernel.size() / 2.0));
-
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int y = 0; y < height_; y++) {
-        for (int x = 0; x < width_; x++) {
-            float *po = output->PointerAt<float>(x, y, 0);
-            double temp = 0;
-            for (int i = -half_kernel_size; i <= half_kernel_size; i++) {
-                int x_shift = x + i;
-                if (x_shift < 0) x_shift = 0;
-                if (x_shift > width_ - 1) x_shift = width_ - 1;
-                float *pi = PointerAt<float>(x_shift, y, 0);
-                temp += (*pi * (float)kernel[i + half_kernel_size]);
+    // Define the grain size based on the width of the memory read
+    // Want no more than 1/4 of the read to be the halo cells
+    const std::size_t multiple = (utility::DefaultGrainSizeTBB() +
+        4 * (kernel.size() - 1)) / utility::DefaultGrainSizeTBB();
+    const std::size_t grain_size = multiple * utility::DefaultGrainSizeTBB()
+        + 1 - kernel.size();
+    tbb::parallel_for(tbb::blocked_range2d<int, int>(
+            0, height_, grain_size, 0, width_, grain_size),
+            [&](const tbb::blocked_range2d<int, int>& range){
+        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
+            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
+                float *po = output->PointerAt<float>(x, y, 0);
+                double temp = 0;
+                for (int i = -half_kernel_size; i <= half_kernel_size; ++i) {
+                    int x_shift = x + i;
+                    if (x_shift < 0) x_shift = 0;
+                    if (x_shift > width_ - 1) x_shift = width_ - 1;
+                    float *pi = PointerAt<float>(x_shift, y, 0);
+                    temp += (*pi * (float)kernel[i + half_kernel_size]);
+                }
+                *po = (float)temp;
             }
-            *po = (float)temp;
         }
-    }
+    });
     return output;
 }
 
@@ -255,23 +256,21 @@ std::shared_ptr<Image> Image::Transpose() const {
     int in_bytes_per_line = BytesPerLine();
     int bytes_per_pixel = num_of_channels_ * bytes_per_channel_;
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int y = 0; y < height_; y++) {
-        for (int x = 0; x < width_; x++) {
-            std::copy(
+    tbb::parallel_for(tbb::blocked_range2d<int, int>(
+            0, height_, utility::DefaultGrainSizeTBB(),
+            0, width_, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int>& range){
+        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
+            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
+                std::copy(
                     data_.data() + y * in_bytes_per_line + x * bytes_per_pixel,
                     data_.data() + y * in_bytes_per_line +
                             (x + 1) * bytes_per_pixel,
                     output->data_.data() + x * out_bytes_per_line +
                             y * bytes_per_pixel);
+            }
         }
-    }
+    });
 
     return output;
 }
@@ -281,13 +280,15 @@ std::shared_ptr<Image> Image::FlipVertical() const {
     output->Prepare(width_, height_, num_of_channels_, bytes_per_channel_);
 
     int bytes_per_line = BytesPerLine();
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int y = 0; y < height_; y++) {
-        std::copy(data_.data() + y * bytes_per_line,
-                  data_.data() + (y + 1) * bytes_per_line,
-                  output->data_.data() + (height_ - y - 1) * bytes_per_line);
-    }
+    tbb::parallel_for(tbb::blocked_range<int>(
+            0, height_, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<int>& range){
+        for (int y = range.begin(); y < range.end(); ++y) {
+            std::copy(data_.data() + y * bytes_per_line,
+                      data_.data() + (y + 1) * bytes_per_line,
+                      output->data_.data() + (height_ - y - 1) * bytes_per_line);
+        }
+    });
     return output;
 }
 
@@ -297,22 +298,23 @@ std::shared_ptr<Image> Image::FlipHorizontal() const {
 
     int bytes_per_line = BytesPerLine();
     int bytes_per_pixel = num_of_channels_ * bytes_per_channel_;
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int y = 0; y < height_; y++) {
-        for (int x = 0; x < width_; x++) {
-            std::copy(data_.data() + y * bytes_per_line + x * bytes_per_pixel,
-                      data_.data() + y * bytes_per_line +
-                              (x + 1) * bytes_per_pixel,
-                      output->data_.data() + y * bytes_per_line +
-                              (width_ - x - 1) * bytes_per_pixel);
+
+    tbb::parallel_for(tbb::blocked_range2d<int, int>(
+            0, height_, utility::DefaultGrainSizeTBB(),
+            0, width_, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int>& range){
+        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
+            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
+                std::copy(
+                    data_.data() + y * bytes_per_line + x * bytes_per_pixel,
+                    data_.data() + y * bytes_per_line +
+                        (x + 1) * bytes_per_pixel,
+                    output->data_.data() + y * bytes_per_line +
+                        (width_ - x - 1) * bytes_per_pixel);
+            }
         }
-    }
+    });
+
 
     return output;
 }
@@ -324,30 +326,29 @@ std::shared_ptr<Image> Image::Dilate(int half_kernel_size /* = 1 */) const {
     }
     output->Prepare(width_, height_, 1, 1);
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int y = 0; y < height_; y++) {
-        for (int x = 0; x < width_; x++) {
-            for (int yy = -half_kernel_size; yy <= half_kernel_size; yy++) {
-                for (int xx = -half_kernel_size; xx <= half_kernel_size; xx++) {
-                    unsigned char *pi;
-                    if (TestImageBoundary(x + xx, y + yy)) {
-                        pi = PointerAt<unsigned char>(x + xx, y + yy);
-                        if (*pi == 255) {
-                            *output->PointerAt<unsigned char>(x, y, 0) = 255;
-                            xx = half_kernel_size;
-                            yy = half_kernel_size;
+    tbb::parallel_for(tbb::blocked_range2d<int, int>(
+            0, height_, utility::DefaultGrainSizeTBB() - (2 * half_kernel_size),
+            0, width_, utility::DefaultGrainSizeTBB() - (2 * half_kernel_size)),
+            [&](const tbb::blocked_range2d<int, int>& range){
+        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
+            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
+                for (int yy = -half_kernel_size; yy <= half_kernel_size; ++yy) {
+                    for (int xx = -half_kernel_size; xx <= half_kernel_size; ++xx) {
+                        unsigned char *pi;
+                        if (TestImageBoundary(x + xx, y + yy)) {
+                            pi = PointerAt<unsigned char>(x + xx, y + yy);
+                            if (*pi == 255) {
+                                *output->PointerAt<unsigned char>(x, y, 0) = 255;
+                                xx = half_kernel_size;
+                                yy = half_kernel_size;
+                            }
                         }
                     }
                 }
             }
         }
-    }
+    });
+
     return output;
 }
 
@@ -364,25 +365,24 @@ std::shared_ptr<Image> Image::CreateDepthBoundaryMask(
     auto mask = std::make_shared<Image>();
     mask->Prepare(width, height, 1, 1);
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int v = 0; v < height; v++) {
-        for (int u = 0; u < width; u++) {
-            double dx = *depth_image_gradient_dx->PointerAt<float>(u, v);
-            double dy = *depth_image_gradient_dy->PointerAt<float>(u, v);
-            double mag = sqrt(dx * dx + dy * dy);
-            if (mag > depth_threshold_for_discontinuity_check) {
-                *mask->PointerAt<unsigned char>(u, v) = 255;
-            } else {
-                *mask->PointerAt<unsigned char>(u, v) = 0;
+    tbb::parallel_for(tbb::blocked_range2d<int, int>(
+            0, height, utility::DefaultGrainSizeTBB(),
+            0, width, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int>& range){
+        for (int v = range.rows().begin(); v < range.rows().end(); v++) {
+            for (int u = range.cols().begin(); u < range.cols().end(); u++) {
+                double dx = *depth_image_gradient_dx->PointerAt<float>(u, v);
+                double dy = *depth_image_gradient_dy->PointerAt<float>(u, v);
+                double mag = sqrt(dx * dx + dy * dy);
+                if (mag > depth_threshold_for_discontinuity_check) {
+                    *mask->PointerAt<unsigned char>(u, v) = 255;
+                } else {
+                    *mask->PointerAt<unsigned char>(u, v) = 0;
+                }
             }
         }
-    }
+    });
+
     if (half_dilation_kernel_size_for_discontinuity_map >= 1) {
         auto mask_dilated =
                 mask->Dilate(half_dilation_kernel_size_for_discontinuity_map);
diff --git a/cpp/open3d/geometry/PointCloud.cpp b/cpp/open3d/geometry/PointCloud.cpp
index b0b6e0130ae..0f7f4826de0 100644
--- a/cpp/open3d/geometry/PointCloud.cpp
+++ b/cpp/open3d/geometry/PointCloud.cpp
@@ -11,6 +11,9 @@
 #include <algorithm>
 #include <numeric>
 
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
 #include "open3d/geometry/BoundingVolume.h"
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/Qhull.h"
@@ -127,20 +130,22 @@ std::vector<double> PointCloud::ComputePointCloudDistance(
     std::vector<double> distances(points_.size());
     KDTreeFlann kdtree;
     kdtree.SetGeometry(target);
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)points_.size(); i++) {
-        std::vector<int> indices(1);
-        std::vector<double> dists(1);
-        if (kdtree.SearchKNN(points_[i], 1, indices, dists) == 0) {
-            utility::LogDebug(
-                    "[ComputePointCloudToPointCloudDistance] Found a point "
-                    "without neighbors.");
-            distances[i] = 0.0;
-        } else {
-            distances[i] = std::sqrt(dists[0]);
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices(1);
+            std::vector<double> dists(1);
+            if (kdtree.SearchKNN(points_[i], 1, indices, dists) == 0) {
+                utility::LogDebug(
+                        "[ComputePointCloudToPointCloudDistance] Found a point "
+                        "without neighbors.");
+                distances[i] = 0.0;
+            } else {
+                distances[i] = std::sqrt(dists[0]);
+            }
         }
-    }
+    });
     return distances;
 }
 
@@ -498,9 +503,9 @@ std::shared_ptr<PointCloud> PointCloud::RandomDownSample(
     std::vector<size_t> indices(points_.size());
     std::iota(std::begin(indices), std::end(indices), (size_t)0);
     {
-        std::lock_guard<std::mutex> lock(*utility::random::GetMutex());
+        tbb::spin_mutex::scoped_lock lock(utility::random::GetMutex());
         std::shuffle(indices.begin(), indices.end(),
-                     *utility::random::GetEngine());
+                     utility::random::GetEngine());
     }
     indices.resize((int)(sampling_ratio * points_.size()));
     return SelectByIndex(indices);
@@ -574,18 +579,22 @@ PointCloud::RemoveRadiusOutliers(size_t nb_points,
     KDTreeFlann kdtree;
     kdtree.SetGeometry(*this);
     std::vector<bool> mask = std::vector<bool>(points_.size());
-    utility::OMPProgressBar progress_bar(
+    utility::TBBProgressBar progress_bar(
             points_.size(), "Remove radius outliers: ", print_progress);
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < int(points_.size()); i++) {
-        std::vector<int> tmp_indices;
-        std::vector<double> dist;
-        size_t nb_neighbors = kdtree.SearchRadius(points_[i], search_radius,
-                                                  tmp_indices, dist);
-        mask[i] = (nb_neighbors > nb_points);
-        ++progress_bar;
-    }
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> tmp_indices;
+            std::vector<double> dist;
+            size_t nb_neighbors = kdtree.SearchRadius(points_[i],
+                search_radius, tmp_indices, dist);
+            mask[i] = (nb_neighbors > nb_points);
+            ++progress_bar;
+        }
+    });
+
     std::vector<size_t> indices;
     for (size_t i = 0; i < mask.size(); i++) {
         if (mask[i]) {
@@ -612,25 +621,33 @@ PointCloud::RemoveStatisticalOutliers(size_t nb_neighbors,
     kdtree.SetGeometry(*this);
     std::vector<double> avg_distances = std::vector<double>(points_.size());
     std::vector<size_t> indices;
-    size_t valid_distances = 0;
-    utility::OMPProgressBar progress_bar(
+
+    utility::TBBProgressBar progress_bar(
             points_.size(), "Remove statistical outliers: ", print_progress);
 
-#pragma omp parallel for reduction(+ : valid_distances) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < int(points_.size()); i++) {
-        std::vector<int> tmp_indices;
-        std::vector<double> dist;
-        kdtree.SearchKNN(points_[i], int(nb_neighbors), tmp_indices, dist);
-        double mean = -1.0;
-        if (dist.size() > 0u) {
-            valid_distances++;
-            std::for_each(dist.begin(), dist.end(),
-                          [](double &d) { d = std::sqrt(d); });
-            mean = std::accumulate(dist.begin(), dist.end(), 0.0) / dist.size();
+    size_t valid_distances = tbb::parallel_reduce(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+            utility::DefaultGrainSizeTBB()), std::size_t{0},
+            [&](const tbb::blocked_range<std::size_t>& range,
+                    std::size_t valid_so_far){
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> tmp_indices;
+            std::vector<double> dist;
+            kdtree.SearchKNN(points_[i], int(nb_neighbors), tmp_indices, dist);
+            double mean = -1.0;
+            if (dist.size() > 0u) {
+                ++valid_so_far;
+                std::for_each(dist.begin(), dist.end(),
+                              [](double &d) { d = std::sqrt(d); });
+                mean = std::accumulate(dist.begin(), dist.end(),
+                                       0.0) / dist.size();
+            }
+            avg_distances[i] = mean;
+            ++progress_bar;
         }
-        avg_distances[i] = mean;
-        ++progress_bar;
-    }
+        return valid_so_far;
+    }, std::plus<std::size_t>{});
+
     if (valid_distances == 0) {
         return std::make_tuple(std::make_shared<PointCloud>(),
                                std::vector<size_t>());
@@ -646,7 +663,8 @@ PointCloud::RemoveStatisticalOutliers(size_t nb_neighbors,
                 return x > 0 ? (x - cloud_mean) * (y - cloud_mean) : 0;
             });
     // Bessel's correction
-    double std_dev = std::sqrt(sq_sum / (valid_distances - 1));
+    double std_dev = std::sqrt(sq_sum /
+        static_cast<double>(valid_distances - 1));
     double distance_threshold = cloud_mean + std_ratio * std_dev;
     for (size_t i = 0; i < avg_distances.size(); i++) {
         if (avg_distances[i] > 0 && avg_distances[i] < distance_threshold) {
@@ -665,21 +683,23 @@ std::vector<Eigen::Matrix3d> PointCloud::EstimatePerPointCovariances(
 
     KDTreeFlann kdtree;
     kdtree.SetGeometry(input);
-#pragma omp parallel for schedule(static)
-    for (int i = 0; i < (int)points.size(); i++) {
-        std::vector<int> indices;
-        std::vector<double> distance2;
-        if (kdtree.Search(points[i], search_param, indices, distance2) >= 3) {
-            auto covariance = utility::ComputeCovariance(points, indices);
-            if (input.HasCovariances() && covariance.isIdentity(1e-4)) {
-                covariances[i] = input.covariances_[i];
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, points.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices;
+            std::vector<double> distance2;
+            if (kdtree.Search(points[i], search_param, indices, distance2) >= 3) {
+                auto covariance = utility::ComputeCovariance(points, indices);
+                if (input.HasCovariances() && covariance.isIdentity(1e-4)) {
+                    covariances[i] = input.covariances_[i];
+                } else {
+                    covariances[i] = covariance;
+                }
             } else {
-                covariances[i] = covariance;
+                covariances[i] = Eigen::Matrix3d::Identity();
             }
-        } else {
-            covariances[i] = Eigen::Matrix3d::Identity();
         }
-    }
+    });
     return covariances;
 }
 void PointCloud::EstimateCovariances(
@@ -704,12 +724,14 @@ std::vector<double> PointCloud::ComputeMahalanobisDistance() const {
     Eigen::Matrix3d covariance;
     std::tie(mean, covariance) = ComputeMeanAndCovariance();
     Eigen::Matrix3d cov_inv = covariance.inverse();
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)points_.size(); i++) {
-        Eigen::Vector3d p = points_[i] - mean;
-        mahalanobis[i] = std::sqrt(p.transpose() * cov_inv * p);
-    }
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            Eigen::Vector3d p = points_[i] - mean;
+            mahalanobis[i] = std::sqrt(p.transpose() * cov_inv * p);
+        }
+    });
     return mahalanobis;
 }
 
@@ -720,20 +742,22 @@ std::vector<double> PointCloud::ComputeNearestNeighborDistance() const {
 
     std::vector<double> nn_dis(points_.size());
     KDTreeFlann kdtree(*this);
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)points_.size(); i++) {
-        std::vector<int> indices(2);
-        std::vector<double> dists(2);
-        if (kdtree.SearchKNN(points_[i], 2, indices, dists) <= 1) {
-            utility::LogDebug(
-                    "[ComputePointCloudNearestNeighborDistance] Found a point "
-                    "without neighbors.");
-            nn_dis[i] = 0.0;
-        } else {
-            nn_dis[i] = std::sqrt(dists[1]);
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices(2);
+            std::vector<double> dists(2);
+            if (kdtree.SearchKNN(points_[i], 2, indices, dists) <= 1) {
+                utility::LogDebug(
+                        "[ComputePointCloudNearestNeighborDistance] "
+                        "Found a point without neighbors.");
+                nn_dis[i] = 0.0;
+            } else {
+                nn_dis[i] = std::sqrt(dists[1]);
+            }
         }
-    }
+    });
     return nn_dis;
 }
 
diff --git a/cpp/open3d/geometry/PointCloudCluster.cpp b/cpp/open3d/geometry/PointCloudCluster.cpp
index 6efad46bd50..5166b9ac244 100644
--- a/cpp/open3d/geometry/PointCloudCluster.cpp
+++ b/cpp/open3d/geometry/PointCloudCluster.cpp
@@ -8,6 +8,9 @@
 #include <Eigen/Dense>
 #include <unordered_set>
 
+#include <tbb/spin_mutex.h>
+#include <tbb/parallel_for.h>
+
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/utility/Logging.h"
@@ -27,15 +30,22 @@ std::vector<int> PointCloud::ClusterDBSCAN(double eps,
     utility::ProgressBar progress_bar(points_.size(), "Precompute neighbors.",
                                       print_progress);
     std::vector<std::vector<int>> nbs(points_.size());
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int idx = 0; idx < int(points_.size()); ++idx) {
-        std::vector<double> dists2;
-        kdtree.SearchRadius(points_[idx], eps, nbs[idx], dists2);
-
-#pragma omp critical(ClusterDBSCAN)
-        { ++progress_bar; }
-    }
+
+    tbb::spin_mutex mtx;
+    tbb::profiling::set_name(mtx, "ClusterDBSCAN");
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<double> dists2;
+            kdtree.SearchRadius(points_[i], eps, nbs[i], dists2);
+            {
+                tbb::spin_mutex::scoped_lock lock(mtx);
+                ++progress_bar;
+            }
+        }
+    });
+
     utility::LogDebug("Done Precompute neighbors.");
 
     // Set all labels to undefined (-2).
diff --git a/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp b/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp
index b76bc9f3fc4..ff435589937 100644
--- a/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp
+++ b/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp
@@ -14,6 +14,8 @@
 #include <unordered_map>
 #include <unordered_set>
 
+#include <tbb/parallel_for.h>
+
 #include "libqhullcpp/PointCoordinates.h"
 #include "libqhullcpp/Qhull.h"
 #include "libqhullcpp/QhullVertex.h"
@@ -22,6 +24,7 @@
 #include "open3d/geometry/KDTreeSearchParam.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/utility/Logging.h"
+#include "open3d/utility/Parallel.h"
 
 namespace open3d {
 namespace geometry {
@@ -993,12 +996,16 @@ PointCloud::DetectPlanarPatches(
     kdtree.SetGeometry(*this);
     std::vector<std::vector<int>> neighbors;
     neighbors.resize(points_.size());
-#pragma omp parallel for schedule(static)
-    for (int i = 0; i < static_cast<int>(points_.size()); i++) {
-        std::vector<int> indices;
-        std::vector<double> distance2;
-        kdtree.Search(points_[i], search_param, neighbors[i], distance2);
-    }
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices;
+            std::vector<double> distance2;
+            kdtree.Search(points_[i], search_param, neighbors[i], distance2);
+        }
+    });
 
     const double normal_similarity_rad = normal_similarity_deg * M_PI / 180.0;
     const double coplanarity_rad = coplanarity_deg * M_PI / 180.0;
diff --git a/cpp/open3d/geometry/PointCloudSegmentation.cpp b/cpp/open3d/geometry/PointCloudSegmentation.cpp
index 2968810cecc..e4da2a382b4 100644
--- a/cpp/open3d/geometry/PointCloudSegmentation.cpp
+++ b/cpp/open3d/geometry/PointCloudSegmentation.cpp
@@ -12,6 +12,9 @@
 #include <numeric>
 #include <unordered_set>
 
+#include <tbb/spin_rw_mutex.h>
+#include <tbb/parallel_for.h>
+
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/geometry/TriangleMesh.h"
 #include "open3d/utility/Logging.h"
@@ -60,6 +63,11 @@ class RANSACResult {
     RANSACResult() : fitness_(0), inlier_rmse_(0) {}
     ~RANSACResult() {}
 
+    bool IsBetterRANSACThan(const RANSACResult& other) {
+        return fitness_ > other.fitness_ ||
+         (fitness_ == other.fitness_ && inlier_rmse_ < other.inlier_rmse_);
+    }
+
 public:
     double fitness_;
     double inlier_rmse_;
@@ -163,12 +171,6 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
 
     size_t num_points = points_.size();
     RandomSampler<size_t> sampler(num_points);
-    // Pre-generate all random samples before entering the parallel region
-    std::vector<std::vector<size_t>> all_sampled_indices;
-    all_sampled_indices.reserve(num_iterations);
-    for (int i = 0; i < num_iterations; i++) {
-        all_sampled_indices.push_back(sampler(ransac_n));
-    }
 
     // Return if ransac_n is less than the required plane model parameters.
     if (ransac_n < 3) {
@@ -183,57 +185,63 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
                                std::vector<size_t>{});
     }
 
-    // Use size_t here to avoid large integer which acceed max of int.
-    size_t break_iteration = std::numeric_limits<size_t>::max();
-    int iteration_count = 0;
-
-#pragma omp parallel for schedule(static)
-    for (int itr = 0; itr < num_iterations; itr++) {
-        if ((size_t)iteration_count > break_iteration) {
-            continue;
-        }
+    // Use size_t here to avoid large integer which exceed max of int.
+    std::size_t break_iteration = std::numeric_limits<std::size_t>::max();
+    std::size_t iteration_count = 0;
+
+    tbb::spin_rw_mutex mtx;
+    tbb::parallel_for(tbb::blocked_range<int>(0, num_iterations, 1),
+            [&](const tbb::blocked_range<int>& range) {
+        for (int i = range.begin(); i < range.end(); ++i) {
+            // Eschew using a mutex here because doing an extra iteration is ok
+            if (iteration_count > break_iteration) {
+                continue;
+            }
 
-        // Access the pre-generated sampled indices
-        std::vector<size_t> inliers = all_sampled_indices[itr];
-
-        // Fit model to num_model_parameters randomly selected points among the
-        // inliers.
-        Eigen::Vector4d plane_model;
-        if (ransac_n == 3) {
-            plane_model = TriangleMesh::ComputeTrianglePlane(
-                    points_[inliers[0]], points_[inliers[1]],
-                    points_[inliers[2]]);
-        } else {
-            plane_model = GetPlaneFromPoints(points_, inliers);
-        }
+            const std::vector<size_t> sampled_indices = sampler(ransac_n);
+            std::vector<size_t> inliers = sampled_indices;
+
+            // Fit model to num_model_parameters randomly selected points
+            // among the inliers.
+            Eigen::Vector4d plane_model;
+            if (ransac_n == 3) {
+                plane_model = TriangleMesh::ComputeTrianglePlane(
+                        points_[inliers[0]], points_[inliers[1]],
+                        points_[inliers[2]]);
+            } else {
+                plane_model = GetPlaneFromPoints(points_, inliers);
+            }
 
-        if (plane_model.isZero(0)) {
-            continue;
-        }
+            if (plane_model.isZero(0)) {
+                continue;
+            }
 
-        inliers.clear();
-        auto this_result = EvaluateRANSACBasedOnDistance(
-                points_, plane_model, inliers, distance_threshold);
-#pragma omp critical
-        {
-            if (this_result.fitness_ > result.fitness_ ||
-                (this_result.fitness_ == result.fitness_ &&
-                 this_result.inlier_rmse_ < result.inlier_rmse_)) {
-                result = this_result;
-                best_plane_model = plane_model;
-                if (result.fitness_ < 1.0) {
-                    break_iteration = std::min(
-                            log(1 - probability) /
+            inliers.clear();
+            auto this_result = EvaluateRANSACBasedOnDistance(
+                    points_, plane_model, inliers, distance_threshold);
+            {
+                tbb::spin_rw_mutex::scoped_lock lock(mtx);
+                if (this_result.IsBetterRANSACThan(result)) {
+                    lock.upgrade_to_writer();
+                    // Have to recheck after upgrading lock because
+                    // upgrading requires releasing the lock
+                    if (this_result.IsBetterRANSACThan(result)) {
+                        result = this_result;
+                        best_plane_model = plane_model;
+                        if (result.fitness_ < 1.0) {
+                            break_iteration = std::min(log(1 - probability) /
                                     log(1 - pow(result.fitness_, ransac_n)),
-                            (double)num_iterations);
-                } else {
-                    // Set break_iteration to 0 to force to break the loop.
-                    break_iteration = 0;
+                                    (double)num_iterations);
+                        } else {
+                            // Set break_iteration to 0 so we break the loop.
+                            break_iteration = 0;
+                        }
+                    }
                 }
+                iteration_count++;
             }
-            iteration_count++;
         }
-    }
+    });
 
     // Find the final inliers using best_plane_model.
     std::vector<size_t> final_inliers;
diff --git a/cpp/open3d/geometry/TriangleMesh.cpp b/cpp/open3d/geometry/TriangleMesh.cpp
index 9e909858736..f77c1965001 100644
--- a/cpp/open3d/geometry/TriangleMesh.cpp
+++ b/cpp/open3d/geometry/TriangleMesh.cpp
@@ -11,6 +11,7 @@
 #include <numeric>
 #include <queue>
 #include <tuple>
+#include <tbb/tbb.h>
 
 #include "open3d/geometry/BoundingVolume.h"
 #include "open3d/geometry/IntersectionTest.h"
@@ -927,13 +928,15 @@ TriangleMesh &TriangleMesh::MergeCloseVertices(double eps) {
     // precompute all neighbours
     utility::LogDebug("Precompute Neighbours");
     std::vector<std::vector<int>> nbs(vertices_.size());
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int idx = 0; idx < int(vertices_.size()); ++idx) {
-        std::vector<double> dists2;
-        kdtree.SearchRadius(vertices_[idx], eps, nbs[idx], dists2);
-    }
     utility::LogDebug("Done Precompute Neighbours");
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, vertices_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        std::vector<double> dists2;
+        for (std::size_t idx = range.begin(); idx < range.end(); ++idx) {
+            kdtree.SearchRadius(vertices_[idx], eps, nbs[idx], dists2);
+        }
+    });
 
     bool has_vertex_normals = HasVertexNormals();
     bool has_vertex_colors = HasVertexColors();
@@ -973,12 +976,12 @@ TriangleMesh &TriangleMesh::MergeCloseVertices(double eps) {
             new_vert_mapping[nb] = new_vidx;
             n += 1;
         }
-        new_vertices.push_back(vertex / n);
+        new_vertices.emplace_back(vertex / n);
         if (has_vertex_normals) {
-            new_vertex_normals.push_back(normal / n);
+            new_vertex_normals.emplace_back(normal / n);
         }
         if (has_vertex_colors) {
-            new_vertex_colors.push_back(color / n);
+            new_vertex_colors.emplace_back(color / n);
         }
     }
     utility::LogDebug("Merged {} vertices",
@@ -1198,18 +1201,6 @@ double TriangleMesh::GetSurfaceArea(std::vector<double> &triangle_areas) const {
 }
 
 double TriangleMesh::GetVolume() const {
-    // Computes the signed volume of the tetrahedron defined by
-    // the three triangle vertices and the origin. The sign is determined by
-    // checking if the origin is at the same side as the normal with respect to
-    // the triangle.
-    auto GetSignedVolumeOfTriangle = [&](size_t tidx) {
-        const Eigen::Vector3i &triangle = triangles_[tidx];
-        const Eigen::Vector3d &vertex0 = vertices_[triangle(0)];
-        const Eigen::Vector3d &vertex1 = vertices_[triangle(1)];
-        const Eigen::Vector3d &vertex2 = vertices_[triangle(2)];
-        return vertex0.dot(vertex1.cross(vertex2)) / 6.0;
-    };
-
     if (!IsWatertight()) {
         utility::LogError(
                 "The mesh is not watertight, and the volume cannot be "
@@ -1221,13 +1212,22 @@ double TriangleMesh::GetVolume() const {
                 "computed.");
     }
 
-    double volume = 0;
-    int64_t num_triangles = triangles_.size();
-#pragma omp parallel for reduction(+ : volume) num_threads(utility::EstimateMaxThreads())
-    for (int64_t tidx = 0; tidx < num_triangles; ++tidx) {
-        volume += GetSignedVolumeOfTriangle(tidx);
-    }
-    return std::abs(volume);
+    return std::abs(tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
+            0, triangles_.size(), utility::DefaultGrainSizeTBB()), 0.0,
+            [&](const tbb::blocked_range<std::size_t>& range, double volume){
+        for (std::size_t tidx = range.begin(); tidx < range.end(); ++tidx) {
+            // Computes the signed volume of the tetrahedron defined by
+            // the three triangle vertices and the origin. The sign is determined by
+            // checking if the origin is at the same side as the normal with respect to
+            // the triangle.
+            const Eigen::Vector3i &triangle = triangles_[tidx];
+            const Eigen::Vector3d &vertex0 = vertices_[triangle(0)];
+            const Eigen::Vector3d &vertex1 = vertices_[triangle(1)];
+            const Eigen::Vector3d &vertex2 = vertices_[triangle(2)];
+            volume += vertex0.dot(vertex1.cross(vertex2)) / 6.0;
+        }
+        return volume;
+    }, std::plus<double>{}));
 }
 
 Eigen::Vector4d TriangleMesh::ComputeTrianglePlane(const Eigen::Vector3d &p0,
@@ -1359,45 +1359,50 @@ bool TriangleMesh::IsVertexManifold() const {
 std::vector<Eigen::Vector2i> TriangleMesh::GetSelfIntersectingTriangles()
         const {
     std::vector<Eigen::Vector2i> self_intersecting_triangles;
-    for (size_t tidx0 = 0; tidx0 < triangles_.size() - 1; ++tidx0) {
-        const Eigen::Vector3i &tria_p = triangles_[tidx0];
-        const Eigen::Vector3d &p0 = vertices_[tria_p(0)];
-        const Eigen::Vector3d &p1 = vertices_[tria_p(1)];
-        const Eigen::Vector3d &p2 = vertices_[tria_p(2)];
-
-        const Eigen::Vector3d bb_min1 =
-                p0.array().min(p1.array().min(p2.array()));
-        const Eigen::Vector3d bb_max1 =
-                p0.array().max(p1.array().max(p2.array()));
-
-        for (size_t tidx1 = tidx0 + 1; tidx1 < triangles_.size(); ++tidx1) {
-            const Eigen::Vector3i &tria_q = triangles_[tidx1];
-            // check if neighbour triangle
-            if (tria_p(0) == tria_q(0) || tria_p(0) == tria_q(1) ||
-                tria_p(0) == tria_q(2) || tria_p(1) == tria_q(0) ||
-                tria_p(1) == tria_q(1) || tria_p(1) == tria_q(2) ||
-                tria_p(2) == tria_q(0) || tria_p(2) == tria_q(1) ||
-                tria_p(2) == tria_q(2)) {
-                continue;
-            }
+    tbb::spin_mutex out_mtx;
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, triangles_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        for (std::size_t tidx0 = range.begin(); tidx0 < range.end(); ++tidx0) {
+            const Eigen::Vector3i &tria_p = triangles_[tidx0];
+            const Eigen::Vector3d &p0 = vertices_[tria_p(0)];
+            const Eigen::Vector3d &p1 = vertices_[tria_p(1)];
+            const Eigen::Vector3d &p2 = vertices_[tria_p(2)];
+
+            const Eigen::Vector3d bb_min1 =
+                    p0.array().min(p1.array().min(p2.array()));
+            const Eigen::Vector3d bb_max1 =
+                    p0.array().max(p1.array().max(p2.array()));
+
+            for (size_t tidx1 = tidx0 + 1; tidx1 < triangles_.size(); ++tidx1) {
+                const Eigen::Vector3i &tria_q = triangles_[tidx1];
+                // check if neighbour triangle
+                if (tria_p(0) == tria_q(0) || tria_p(0) == tria_q(1) ||
+                    tria_p(0) == tria_q(2) || tria_p(1) == tria_q(0) ||
+                    tria_p(1) == tria_q(1) || tria_p(1) == tria_q(2) ||
+                    tria_p(2) == tria_q(0) || tria_p(2) == tria_q(1) ||
+                    tria_p(2) == tria_q(2)) {
+                    continue;
+                }
 
-            // check for intersection
-            const Eigen::Vector3d &q0 = vertices_[tria_q(0)];
-            const Eigen::Vector3d &q1 = vertices_[tria_q(1)];
-            const Eigen::Vector3d &q2 = vertices_[tria_q(2)];
-
-            const Eigen::Vector3d bb_min2 =
-                    q0.array().min(q1.array().min(q2.array()));
-            const Eigen::Vector3d bb_max2 =
-                    q0.array().max(q1.array().max(q2.array()));
-            if (IntersectionTest::AABBAABB(bb_min1, bb_max1, bb_min2,
-                                           bb_max2) &&
-                IntersectionTest::TriangleTriangle3d(p0, p1, p2, q0, q1, q2)) {
-                self_intersecting_triangles.push_back(
-                        Eigen::Vector2i(tidx0, tidx1));
+                // check for intersection
+                const Eigen::Vector3d &q0 = vertices_[tria_q(0)];
+                const Eigen::Vector3d &q1 = vertices_[tria_q(1)];
+                const Eigen::Vector3d &q2 = vertices_[tria_q(2)];
+
+                const Eigen::Vector3d bb_min2 =
+                        q0.array().min(q1.array().min(q2.array()));
+                const Eigen::Vector3d bb_max2 =
+                        q0.array().max(q1.array().max(q2.array()));
+                if (IntersectionTest::AABBAABB(bb_min1, bb_max1, bb_min2, bb_max2) &&
+                    IntersectionTest::TriangleTriangle3d(p0, p1, p2, q0, q1, q2)) {
+                    tbb::spin_mutex::scoped_lock lock(out_mtx);
+                    self_intersecting_triangles.push_back(
+                            Eigen::Vector2i(tidx0, tidx1));
+                }
             }
         }
-    }
+    });
     return self_intersecting_triangles;
 }
 
@@ -1441,23 +1446,25 @@ TriangleMesh::ClusterConnectedTriangles() const {
     utility::LogDebug("[ClusterConnectedTriangles] Compute triangle adjacency");
     auto edges_to_triangles = GetEdgeToTrianglesMap();
     std::vector<std::unordered_set<int>> adjacency_list(triangles_.size());
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int tidx = 0; tidx < int(triangles_.size()); ++tidx) {
-        const auto &triangle = triangles_[tidx];
-        for (auto tnb :
-             edges_to_triangles[GetOrderedEdge(triangle(0), triangle(1))]) {
-            adjacency_list[tidx].insert(tnb);
-        }
-        for (auto tnb :
-             edges_to_triangles[GetOrderedEdge(triangle(0), triangle(2))]) {
-            adjacency_list[tidx].insert(tnb);
-        }
-        for (auto tnb :
-             edges_to_triangles[GetOrderedEdge(triangle(1), triangle(2))]) {
-            adjacency_list[tidx].insert(tnb);
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, triangles_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        for (std::size_t tidx = range.begin(); tidx < range.end(); ++tidx) {
+            const auto &triangle = triangles_[tidx];
+            for (auto tnb :
+                 edges_to_triangles[GetOrderedEdge(triangle(0), triangle(1))]) {
+                adjacency_list[tidx].insert(tnb);
+            }
+            for (auto tnb :
+                 edges_to_triangles[GetOrderedEdge(triangle(0), triangle(2))]) {
+                adjacency_list[tidx].insert(tnb);
+            }
+            for (auto tnb :
+                 edges_to_triangles[GetOrderedEdge(triangle(1), triangle(2))]) {
+                adjacency_list[tidx].insert(tnb);
+            }
         }
-    }
+    });
     utility::LogDebug(
             "[ClusterConnectedTriangles] Done computing triangle adjacency");
 
diff --git a/cpp/open3d/geometry/TriangleMeshDeformation.cpp b/cpp/open3d/geometry/TriangleMeshDeformation.cpp
index adfe76b76ed..4541c0fd7c4 100644
--- a/cpp/open3d/geometry/TriangleMeshDeformation.cpp
+++ b/cpp/open3d/geometry/TriangleMeshDeformation.cpp
@@ -9,6 +9,8 @@
 #include <Eigen/Sparse>
 #include <algorithm>
 
+#include <tbb/tbb.h>
+
 #include "open3d/geometry/TriangleMesh.h"
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Parallel.h"
@@ -93,72 +95,77 @@ std::shared_ptr<TriangleMesh> TriangleMesh::DeformAsRigidAsPossible(
             std::swap(Rs, Rs_old);
         }
 
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int i = 0; i < int(vertices_.size()); ++i) {
-            // Update rotations
-            Eigen::Matrix3d S = Eigen::Matrix3d::Zero();
-            Eigen::Matrix3d R = Eigen::Matrix3d::Zero();
-            int n_nbs = 0;
-            for (int j : prime->adjacency_list_[i]) {
-                Eigen::Vector3d e0 = vertices_[i] - vertices_[j];
-                Eigen::Vector3d e1 = prime->vertices_[i] - prime->vertices_[j];
-                double w = edge_weights[GetOrderedEdge(i, j)];
-                S += w * (e0 * e1.transpose());
-                if (energy_model == DeformAsRigidAsPossibleEnergy::Smoothed) {
-                    R += Rs_old[j];
-                }
-                n_nbs++;
-            }
-            if (energy_model == DeformAsRigidAsPossibleEnergy::Smoothed &&
-                iter > 0 && n_nbs > 0) {
-                S = 2 * S +
-                    (4 * smoothed_alpha * surface_area / n_nbs) * R.transpose();
-            }
-            Eigen::JacobiSVD<Eigen::Matrix3d> svd(
-                    S, Eigen::ComputeFullU | Eigen::ComputeFullV);
-            Eigen::Matrix3d U = svd.matrixU();
-            Eigen::Matrix3d V = svd.matrixV();
-            Eigen::Vector3d D(1, 1, (V * U.transpose()).determinant());
-            // ensure rotation:
-            // http://graphics.stanford.edu/~smr/ICP/comparison/eggert_comparison_mva97.pdf
-            Rs[i] = V * D.asDiagonal() * U.transpose();
-            if (Rs[i].determinant() <= 0) {
-                utility::LogError(
-                        "something went wrong with "
-                        "updating R");
-            }
-        }
-
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int i = 0; i < int(vertices_.size()); ++i) {
-            // Update Positions
-            Eigen::Vector3d bi(0, 0, 0);
-            if (constraints.count(i) > 0) {
-                bi = constraints[i];
-            } else {
+        tbb::parallel_for(tbb::blocked_range<int>(
+                0, vertices_.size(), utility::DefaultGrainSizeTBB()),
+                [&](const tbb::blocked_range<int>& range){
+            for (int i = range.begin(); i < range.end(); ++i) {
+                // Update rotations
+                Eigen::Matrix3d S = Eigen::Matrix3d::Zero();
+                Eigen::Matrix3d R = Eigen::Matrix3d::Zero();
+                int n_nbs = 0;
                 for (int j : prime->adjacency_list_[i]) {
+                    Eigen::Vector3d e0 = vertices_[i] - vertices_[j];
+                    Eigen::Vector3d e1 = prime->vertices_[i] - prime->vertices_[j];
                     double w = edge_weights[GetOrderedEdge(i, j)];
-                    bi += w / 2 *
-                          ((Rs[i] + Rs[j]) * (vertices_[i] - vertices_[j]));
+                    S += w * (e0 * e1.transpose());
+                    if (energy_model == DeformAsRigidAsPossibleEnergy::Smoothed) {
+                        R += Rs_old[j];
+                    }
+                    n_nbs++;
+                }
+                if (energy_model == DeformAsRigidAsPossibleEnergy::Smoothed &&
+                    iter > 0 && n_nbs > 0) {
+                    S = 2 * S + (4 * smoothed_alpha * surface_area / n_nbs)
+                        * R.transpose();
+                }
+                Eigen::JacobiSVD<Eigen::Matrix3d> svd(
+                        S, Eigen::ComputeFullU | Eigen::ComputeFullV);
+                Eigen::Matrix3d U = svd.matrixU();
+                Eigen::Matrix3d V = svd.matrixV();
+                Eigen::Vector3d D(1, 1, (V * U.transpose()).determinant());
+                // ensure rotation:
+                // http://graphics.stanford.edu/~smr/ICP/comparison/eggert_comparison_mva97.pdf
+                Rs[i] = V * D.asDiagonal() * U.transpose();
+                if (Rs[i].determinant() <= 0) {
+                    utility::LogError("something went wrong with updating R");
                 }
             }
-            b[0](i) = bi(0);
-            b[1](i) = bi(1);
-            b[2](i) = bi(2);
-        }
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int comp = 0; comp < 3; ++comp) {
-            Eigen::VectorXd p_prime = solver.solve(b[comp]);
-            if (solver.info() != Eigen::Success) {
-                utility::LogError("Cholesky solve failed");
+        });
+
+        tbb::parallel_for(tbb::blocked_range<int>(
+                0, vertices_.size(), utility::DefaultGrainSizeTBB()),
+                [&](const tbb::blocked_range<int>& range){
+            for (int i = range.begin(); i < range.end(); ++i) {
+                // Update Positions
+                Eigen::Vector3d bi(0, 0, 0);
+                if (constraints.count(i) > 0) {
+                    bi = constraints[i];
+                } else {
+                    for (int j : prime->adjacency_list_[i]) {
+                        double w = edge_weights[GetOrderedEdge(i, j)];
+                        bi += w / 2 *
+                              ((Rs[i] + Rs[j]) * (vertices_[i] - vertices_[j]));
+                    }
+                }
+                b[0](i) = bi(0);
+                b[1](i) = bi(1);
+                b[2](i) = bi(2);
             }
-            for (int i = 0; i < int(vertices_.size()); ++i) {
-                prime->vertices_[i](comp) = p_prime(i);
+        });
+
+        tbb::parallel_for(tbb::blocked_range<int>(0, 3, 1),
+                [&](const tbb::blocked_range<int>& range){
+            for (int comp = range.begin(); comp < range.end(); ++comp) {
+                Eigen::VectorXd p_prime = solver.solve(b[comp]);
+                if (solver.info() != Eigen::Success) {
+                    utility::LogError("Cholesky solve failed");
+                }
+                for (int i = 0; i < int(vertices_.size()); ++i) {
+                    prime->vertices_[i](comp) = p_prime(i);
+                }
             }
-        }
+        });
+
 
         // Compute energy and log
         double energy = 0;
diff --git a/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp b/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp
index 962113b2df3..e137fa57933 100644
--- a/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp
+++ b/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp
@@ -11,6 +11,9 @@
 #include <k4arecord/record.h>
 #include <turbojpeg.h>
 
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
+
 #include <memory>
 
 #include "open3d/geometry/RGBDImage.h"
@@ -160,20 +163,19 @@ void ConvertBGRAToRGB(geometry::Image &bgra, geometry::Image &rgb) {
                 "dimensions.");
     }
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(3) schedule(static)
-#endif
-    for (int v = 0; v < bgra.height_; ++v) {
-        for (int u = 0; u < bgra.width_; ++u) {
-            for (int c = 0; c < 3; ++c) {
-                *rgb.PointerAt<uint8_t>(u, v, c) =
-                        *bgra.PointerAt<uint8_t>(u, v, 2 - c);
+    tbb::parallel_for(tbb::blocked_range2d<int>(
+            0, bgra.height_, utility::DefaultGrainSizeTBB2D(),
+            0, bgra.width_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range2d<int>& range){
+        for (int v = range.rows().begin(); v < range.rows().end(); ++v) {
+            for (int u = range.cols().begin(); u < range.cols().end(); ++u) {
+                for (int c = 0; c < 3; ++c) {
+                    *rgb.PointerAt<uint8_t>(u, v, c) =
+                            *bgra.PointerAt<uint8_t>(u, v, 2 - c);
+                }
             }
         }
-    }
+    });
 }
 
 bool AzureKinectSensor::PrintFirmware(k4a_device_t device) {
diff --git a/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp b/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp
index 8ab677d3a3a..10539c4a12b 100644
--- a/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp
+++ b/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp
@@ -7,6 +7,10 @@
 
 #include "open3d/pipelines/color_map/ColorMapUtils.h"
 
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_for_each.h>
+#include <tbb/concurrent_vector.h>
+
 #include "open3d/camera/PinholeCameraTrajectory.h"
 #include "open3d/geometry/Image.h"
 #include "open3d/geometry/KDTreeFlann.h"
@@ -123,50 +127,56 @@ CreateVertexAndImageVisibility(
         const camera::PinholeCameraTrajectory& camera_trajectory,
         double maximum_allowable_depth,
         double depth_threshold_for_visibility_check) {
-    size_t n_camera = camera_trajectory.parameters_.size();
-    size_t n_vertex = mesh.vertices_.size();
+    int n_camera = static_cast<int>(camera_trajectory.parameters_.size());
+    int n_vertex = static_cast<int>(mesh.vertices_.size());
     // visibility_image_to_vertex[c]: vertices visible by camera c.
     std::vector<std::vector<int>> visibility_image_to_vertex;
     visibility_image_to_vertex.resize(n_camera);
+
+    tbb::parallel_for(tbb::blocked_range<int>(0, n_camera, 1),
+            [&](const tbb::blocked_range<int>& range) {
+        for (int camera_id = range.begin();
+                               camera_id < range.end(); ++camera_id) {
+            for (int vertex_id = 0; vertex_id < n_vertex; vertex_id++) {
+                Eigen::Vector3d X = mesh.vertices_[vertex_id];
+                float u, v, d;
+                std::tie(u, v, d) = Project3DPointAndGetUVDepth(
+                        X, camera_trajectory.parameters_[camera_id]);
+                int u_d = int(round(u));
+                int v_d = int(round(v));
+                // Skip if vertex in image boundary.
+                if (d < 0.0 ||
+                    !images_depth[camera_id].TestImageBoundary(u_d, v_d)) {
+                    continue;
+                }
+                // Skip if vertex's depth is too large (e.g. background).
+                float d_sensor =
+                        *images_depth[camera_id].PointerAt<float>(u_d, v_d);
+                if (d_sensor > maximum_allowable_depth) {
+                    continue;
+                }
+                // Check depth boundary mask. If a vertex is located at the boundary
+                // of an object, its color will be highly diverse from different
+                // viewing angles.
+                if (*images_mask[camera_id].PointerAt<uint8_t>(u_d, v_d) == 255) {
+                    continue;
+                }
+                // Check depth errors.
+                if (std::fabs(d - d_sensor) >=
+                    depth_threshold_for_visibility_check) {
+                    continue;
+                }
+                visibility_image_to_vertex[camera_id].push_back(vertex_id);
+            }
+        }
+    });
+
     // visibility_vertex_to_image[v]: cameras that can see vertex v.
     std::vector<std::vector<int>> visibility_vertex_to_image;
     visibility_vertex_to_image.resize(n_vertex);
-
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int camera_id = 0; camera_id < int(n_camera); camera_id++) {
-        for (int vertex_id = 0; vertex_id < int(n_vertex); vertex_id++) {
-            Eigen::Vector3d X = mesh.vertices_[vertex_id];
-            float u, v, d;
-            std::tie(u, v, d) = Project3DPointAndGetUVDepth(
-                    X, camera_trajectory.parameters_[camera_id]);
-            int u_d = int(round(u));
-            int v_d = int(round(v));
-            // Skip if vertex in image boundary.
-            if (d < 0.0 ||
-                !images_depth[camera_id].TestImageBoundary(u_d, v_d)) {
-                continue;
-            }
-            // Skip if vertex's depth is too large (e.g. background).
-            float d_sensor =
-                    *images_depth[camera_id].PointerAt<float>(u_d, v_d);
-            if (d_sensor > maximum_allowable_depth) {
-                continue;
-            }
-            // Check depth boundary mask. If a vertex is located at the boundary
-            // of an object, its color will be highly diverse from different
-            // viewing angles.
-            if (*images_mask[camera_id].PointerAt<uint8_t>(u_d, v_d) == 255) {
-                continue;
-            }
-            // Check depth errors.
-            if (std::fabs(d - d_sensor) >=
-                depth_threshold_for_visibility_check) {
-                continue;
-            }
-            visibility_image_to_vertex[camera_id].push_back(vertex_id);
-#pragma omp critical(CreateVertexAndImageVisibility)
-            { visibility_vertex_to_image[vertex_id].push_back(camera_id); }
+    for (int i = 0; i < n_camera; ++i) {
+        for (int vidx: visibility_image_to_vertex[i]) {
+            visibility_vertex_to_image[vidx].emplace_back(i);
         }
     }
 
@@ -193,37 +203,39 @@ void SetProxyIntensityForVertex(
     auto n_vertex = mesh.vertices_.size();
     proxy_intensity.resize(n_vertex);
 
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < int(n_vertex); i++) {
-        proxy_intensity[i] = 0.0;
-        float sum = 0.0;
-        for (size_t iter = 0; iter < visibility_vertex_to_image[i].size();
-             iter++) {
-            int j = visibility_vertex_to_image[i][iter];
-            float gray;
-            bool valid = false;
-            if (warping_fields.has_value()) {
-                std::tie(valid, gray) = QueryImageIntensity<float>(
-                        images_gray[j], warping_fields.value()[j],
-                        mesh.vertices_[i], camera_trajectory.parameters_[j],
-                        utility::nullopt, image_boundary_margin);
-            } else {
-                std::tie(valid, gray) = QueryImageIntensity<float>(
-                        images_gray[j], utility::nullopt, mesh.vertices_[i],
-                        camera_trajectory.parameters_[j], utility::nullopt,
-                        image_boundary_margin);
-            }
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, n_vertex, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            proxy_intensity[i] = 0.0;
+            float sum = 0.0;
+            for (size_t iter = 0; iter < visibility_vertex_to_image[i].size();
+                 iter++) {
+                int j = visibility_vertex_to_image[i][iter];
+                float gray;
+                bool valid = false;
+                if (warping_fields.has_value()) {
+                    std::tie(valid, gray) = QueryImageIntensity<float>(
+                            images_gray[j], warping_fields.value()[j],
+                            mesh.vertices_[i], camera_trajectory.parameters_[j],
+                            utility::nullopt, image_boundary_margin);
+                } else {
+                    std::tie(valid, gray) = QueryImageIntensity<float>(
+                            images_gray[j], utility::nullopt, mesh.vertices_[i],
+                            camera_trajectory.parameters_[j], utility::nullopt,
+                            image_boundary_margin);
+                }
 
-            if (valid) {
-                sum += 1.0;
-                proxy_intensity[i] += gray;
+                if (valid) {
+                    sum += 1.0;
+                    proxy_intensity[i] += gray;
+                }
+            }
+            if (sum > 0) {
+                proxy_intensity[i] /= sum;
             }
         }
-        if (sum > 0) {
-            proxy_intensity[i] /= sum;
-        }
-    }
+    });
 }
 
 void SetGeometryColorAverage(
@@ -237,43 +249,46 @@ void SetGeometryColorAverage(
     size_t n_vertex = mesh.vertices_.size();
     mesh.vertex_colors_.clear();
     mesh.vertex_colors_.resize(n_vertex);
-    std::vector<size_t> valid_vertices;
-    std::vector<size_t> invalid_vertices;
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)n_vertex; i++) {
-        mesh.vertex_colors_[i] = Eigen::Vector3d::Zero();
-        double sum = 0.0;
-        for (size_t iter = 0; iter < visibility_vertex_to_image[i].size();
-             iter++) {
-            int j = visibility_vertex_to_image[i][iter];
-            uint8_t r_temp, g_temp, b_temp;
-            bool valid = false;
-            utility::optional<ImageWarpingField> optional_warping_field;
-            if (warping_fields.has_value()) {
-                optional_warping_field = warping_fields.value()[j];
-            } else {
-                optional_warping_field = utility::nullopt;
-            }
-            std::tie(valid, r_temp) = QueryImageIntensity<uint8_t>(
-                    images_color[j], optional_warping_field, mesh.vertices_[i],
-                    camera_trajectory.parameters_[j], 0, image_boundary_margin);
-            std::tie(valid, g_temp) = QueryImageIntensity<uint8_t>(
-                    images_color[j], optional_warping_field, mesh.vertices_[i],
-                    camera_trajectory.parameters_[j], 1, image_boundary_margin);
-            std::tie(valid, b_temp) = QueryImageIntensity<uint8_t>(
-                    images_color[j], optional_warping_field, mesh.vertices_[i],
-                    camera_trajectory.parameters_[j], 2, image_boundary_margin);
-            float r = (float)r_temp / 255.0f;
-            float g = (float)g_temp / 255.0f;
-            float b = (float)b_temp / 255.0f;
-            if (valid) {
-                mesh.vertex_colors_[i] += Eigen::Vector3d(r, g, b);
-                sum += 1.0;
+    tbb::concurrent_vector<std::size_t> valid_vertices;
+    tbb::concurrent_vector<std::size_t> invalid_vertices;
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, n_vertex, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            mesh.vertex_colors_[i] = Eigen::Vector3d::Zero();
+            double sum = 0.0;
+            for (size_t iter = 0; iter < visibility_vertex_to_image[i].size();
+                 iter++) {
+                int j = visibility_vertex_to_image[i][iter];
+                uint8_t r_temp, g_temp, b_temp;
+                bool valid = false;
+                utility::optional<ImageWarpingField> optional_warping_field;
+                if (warping_fields.has_value()) {
+                    optional_warping_field = warping_fields.value()[j];
+                } else {
+                    optional_warping_field = utility::nullopt;
+                }
+                std::tie(valid, r_temp) = QueryImageIntensity<uint8_t>(
+                        images_color[j], optional_warping_field,
+                        mesh.vertices_[i],camera_trajectory.parameters_[j],
+                        0, image_boundary_margin);
+                std::tie(valid, g_temp) = QueryImageIntensity<uint8_t>(
+                        images_color[j], optional_warping_field,
+                        mesh.vertices_[i], camera_trajectory.parameters_[j],
+                        1, image_boundary_margin);
+                std::tie(valid, b_temp) = QueryImageIntensity<uint8_t>(
+                        images_color[j], optional_warping_field,
+                        mesh.vertices_[i], camera_trajectory.parameters_[j],
+                        2, image_boundary_margin);
+                float r = (float)r_temp / 255.0f;
+                float g = (float)g_temp / 255.0f;
+                float b = (float)b_temp / 255.0f;
+                if (valid) {
+                    mesh.vertex_colors_[i] += Eigen::Vector3d(r, g, b);
+                    sum += 1.0;
+                }
             }
-        }
-#pragma omp critical(SetGeometryColorAverage)
-        {
             if (sum > 0.0) {
                 mesh.vertex_colors_[i] /= sum;
                 valid_vertices.push_back(i);
@@ -281,15 +296,13 @@ void SetGeometryColorAverage(
                 invalid_vertices.push_back(i);
             }
         }
-    }
+    });
     if (invisible_vertex_color_knn > 0) {
-        std::shared_ptr<geometry::TriangleMesh> valid_mesh =
-                mesh.SelectByIndex(valid_vertices);
+        std::shared_ptr<geometry::TriangleMesh> valid_mesh = mesh.SelectByIndex(
+                {valid_vertices.begin(), valid_vertices.end()});
         geometry::KDTreeFlann kd_tree(*valid_mesh);
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int i = 0; i < (int)invalid_vertices.size(); ++i) {
-            size_t invalid_vertex = invalid_vertices[i];
+        tbb::parallel_for_each(invalid_vertices,
+                [&](std::size_t invalid_vertex){
             std::vector<int> indices;  // indices to valid_mesh
             std::vector<double> dists;
             kd_tree.SearchKNN(mesh.vertices_[invalid_vertex],
@@ -298,11 +311,11 @@ void SetGeometryColorAverage(
             for (const int& index : indices) {
                 new_color += valid_mesh->vertex_colors_[index];
             }
-            if (indices.size() > 0) {
+            if (indices.empty()) {
                 new_color /= static_cast<double>(indices.size());
             }
             mesh.vertex_colors_[invalid_vertex] = new_color;
-        }
+        });
     }
 }
 
diff --git a/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp b/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
index 9dfc98fdfa8..d93cc16f4b4 100644
--- a/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
+++ b/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
@@ -10,6 +10,10 @@
 #include <memory>
 #include <vector>
 
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
 #include "open3d/io/ImageIO.h"
 #include "open3d/io/ImageWarpingFieldIO.h"
 #include "open3d/io/PinholeCameraTrajectoryIO.h"
@@ -43,6 +47,55 @@ static std::vector<ImageWarpingField> CreateWarpingFields(
     return fields;
 }
 
+template <typename MatOutType, typename VecOutType,
+          typename VecInTypeDouble, typename VecInTypeInt>
+struct JTJandJTrNonRigidReducer {
+    // Global Data
+    using FuncType = std::function<void(int,
+        VecInTypeDouble&, double&, VecInTypeInt&)>;
+    FuncType& f;
+    // Local Data
+    MatOutType JTJ;
+    VecOutType JTr;
+    double r2_sum = 0.0;
+
+    JTJandJTrNonRigidReducer(FuncType& f_, int nonrigidval)
+            : f(f_), JTJ(MatOutType::Zero(6 + nonrigidval, 6 + nonrigidval)),
+            JTr(VecOutType::Zero(6 + nonrigidval)) {};
+
+    JTJandJTrNonRigidReducer(JTJandJTrNonRigidReducer& o, tbb::split)
+            : f(o.f), JTJ(MatOutType::Zero(o.JTJ.rows(), o.JTJ.cols())),
+          JTr(VecOutType::Zero(o.JTr.rows(), o.JTr.cols())) {}
+
+    void operator()(const tbb::blocked_range<int>& range) {
+        VecInTypeDouble J_r;
+        VecInTypeInt pattern;
+        double r;
+        for (int i = range.begin(); i < range.end(); i++) {
+            f(i, J_r, r, pattern);
+            for (auto x = 0; x < J_r.size(); x++) {
+                for (auto y = 0; y < J_r.size(); y++) {
+                    JTJ(pattern(x), pattern(y)) += J_r(x) * J_r(y);
+                }
+            }
+            for (auto x = 0; x < J_r.size(); x++) {
+                JTr(pattern(x)) += r * J_r(x);
+            }
+            r2_sum += r * r;
+        }
+    }
+
+    void join(const JTJandJTrNonRigidReducer& rhs) {
+        JTJ += rhs.JTJ;
+        JTr += rhs.JTr;
+        r2_sum += rhs.r2_sum;
+    }
+
+    std::tuple<MatOutType, VecOutType, double> as_tuple() && {
+        return {std::move(JTJ), std::move(JTr), r2_sum};
+    }
+};
+
 /// Function to compute JTJ and Jtr
 /// Input: function pointer f and total number of rows of Jacobian matrix
 /// Output: JTJ, JTr, sum of r^2
@@ -58,46 +111,16 @@ static std::tuple<MatOutType, VecOutType, double> ComputeJTJandJTrNonRigid(
         int iteration_num,
         int nonrigidval,
         bool verbose /*=true*/) {
-    MatOutType JTJ(6 + nonrigidval, 6 + nonrigidval);
-    VecOutType JTr(6 + nonrigidval);
-    double r2_sum = 0.0;
-    JTJ.setZero();
-    JTr.setZero();
-#pragma omp parallel
-    {
-        MatOutType JTJ_private(6 + nonrigidval, 6 + nonrigidval);
-        VecOutType JTr_private(6 + nonrigidval);
-        double r2_sum_private = 0.0;
-        JTJ_private.setZero();
-        JTr_private.setZero();
-        VecInTypeDouble J_r;
-        VecInTypeInt pattern;
-        double r;
-#pragma omp for nowait
-        for (int i = 0; i < iteration_num; i++) {
-            f(i, J_r, r, pattern);
-            for (auto x = 0; x < J_r.size(); x++) {
-                for (auto y = 0; y < J_r.size(); y++) {
-                    JTJ_private(pattern(x), pattern(y)) += J_r(x) * J_r(y);
-                }
-            }
-            for (auto x = 0; x < J_r.size(); x++) {
-                JTr_private(pattern(x)) += r * J_r(x);
-            }
-            r2_sum_private += r * r;
-        }
-#pragma omp critical(ComputeJTJandJTrNonRigid)
-        {
-            JTJ += JTJ_private;
-            JTr += JTr_private;
-            r2_sum += r2_sum_private;
-        }
-    }
+    JTJandJTrNonRigidReducer<MatOutType, VecOutType, VecInTypeDouble,
+            VecInTypeInt> reducer(f, nonrigidval);
+    tbb::parallel_reduce(tbb::blocked_range<int>(0, iteration_num,
+        utility::DefaultGrainSizeTBB()), reducer);
+    auto out = std::move(reducer).as_tuple();
     if (verbose) {
         utility::LogDebug("Residual : {:.2e} (# of elements : {:d})",
-                          r2_sum / (double)iteration_num, iteration_num);
+            std::get<2>(out) / (double)iteration_num, iteration_num);
     }
-    return std::make_tuple(std::move(JTJ), std::move(JTr), r2_sum);
+    return out;
 }
 
 static void ComputeJacobianAndResidualNonRigid(
@@ -200,6 +223,15 @@ static void ComputeJacobianAndResidualNonRigid(
     r = (gray - proxy_intensity[vid]);
 }
 
+inline void atomic_sum(std::atomic<double>& total, const double& val) {
+#if defined(__cpp_lib_atomic_float) && __cpp_lib_atomic >= 201711L
+    total.fetch_add(val);
+#else
+    double expected = total;
+    while(!total.compare_exchange_weak(expected, expected + val));
+#endif
+}
+
 std::pair<geometry::TriangleMesh, camera::PinholeCameraTrajectory>
 RunNonRigidOptimizer(const geometry::TriangleMesh& mesh,
                      const std::vector<geometry::RGBDImage>& images_rgbd,
@@ -282,75 +314,74 @@ RunNonRigidOptimizer(const geometry::TriangleMesh& mesh,
                                option.image_boundary_margin_);
     for (int itr = 0; itr < option.maximum_iteration_; itr++) {
         utility::LogDebug("[Iteration {:04d}] ", itr + 1);
-        double residual = 0.0;
-        double residual_reg = 0.0;
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int c = 0; c < n_camera; c++) {
-            int nonrigidval = warping_fields[c].anchor_w_ *
-                              warping_fields[c].anchor_h_ * 2;
-            double rr_reg = 0.0;
-
-            Eigen::Matrix4d pose;
-            pose = opt_camera_trajectory.parameters_[c].extrinsic_;
-
-            auto intrinsic = opt_camera_trajectory.parameters_[c]
-                                     .intrinsic_.intrinsic_matrix_;
-            auto extrinsic = opt_camera_trajectory.parameters_[c].extrinsic_;
-            Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
-            intr.block<3, 3>(0, 0) = intrinsic;
-            intr(3, 3) = 1.0;
-
-            auto f_lambda = [&](int i, Eigen::Vector14d& J_r, double& r,
-                                Eigen::Vector14i& pattern) {
-                ComputeJacobianAndResidualNonRigid(
-                        i, J_r, r, pattern, opt_mesh, proxy_intensity,
-                        images_gray[c], images_dx[c], images_dy[c],
-                        warping_fields[c], intr, extrinsic,
-                        visibility_image_to_vertex[c],
-                        option.image_boundary_margin_);
-            };
-            Eigen::MatrixXd JTJ;
-            Eigen::VectorXd JTr;
-            double r2;
-            std::tie(JTJ, JTr, r2) =
-                    ComputeJTJandJTrNonRigid<Eigen::Vector14d, Eigen::Vector14i,
-                                             Eigen::MatrixXd, Eigen::VectorXd>(
-                            f_lambda, int(visibility_image_to_vertex[c].size()),
-                            nonrigidval, false);
-
-            double weight = option.non_rigid_anchor_point_weight_ *
-                            visibility_image_to_vertex[c].size() / n_vertex;
-            for (int j = 0; j < nonrigidval; j++) {
-                double r = weight * (warping_fields[c].flow_(j) -
-                                     warping_fields_init[c].flow_(j));
-                JTJ(6 + j, 6 + j) += weight * weight;
-                JTr(6 + j) += weight * r;
-                rr_reg += r * r;
-            }
+        std::atomic<double> residual = 0.0;
+        std::atomic<double> residual_reg = 0.0;
+        tbb::parallel_for(tbb::blocked_range<int>(0, n_camera, 1),
+                [&](const tbb::blocked_range<int>& range) {
+            for (int c = range.begin(); c < range.end(); ++c) {
+                int nonrigidval = warping_fields[c].anchor_w_ *
+                                  warping_fields[c].anchor_h_ * 2;
+                double rr_reg = 0.0;
 
-            bool success;
-            Eigen::VectorXd result;
-            std::tie(success, result) = utility::SolveLinearSystemPSD(
-                    JTJ, -JTr, /*prefer_sparse=*/false,
-                    /*check_symmetric=*/false,
-                    /*check_det=*/false, /*check_psd=*/false);
-            Eigen::Vector6d result_pose;
-            result_pose << result.block(0, 0, 6, 1);
-            auto delta = utility::TransformVector6dToMatrix4d(result_pose);
-            pose = delta * pose;
-
-            for (int j = 0; j < nonrigidval; j++) {
-                warping_fields[c].flow_(j) += result(6 + j);
-            }
-            opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
+                Eigen::Matrix4d pose;
+                pose = opt_camera_trajectory.parameters_[c].extrinsic_;
+
+                auto intrinsic = opt_camera_trajectory.parameters_[c]
+                                         .intrinsic_.intrinsic_matrix_;
+                auto extrinsic = opt_camera_trajectory.
+                                 parameters_[c].extrinsic_;
+                Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
+                intr.block<3, 3>(0, 0) = intrinsic;
+                intr(3, 3) = 1.0;
+
+                auto f_lambda = [&](int i, Eigen::Vector14d& J_r, double& r,
+                                    Eigen::Vector14i& pattern) {
+                    ComputeJacobianAndResidualNonRigid(
+                            i, J_r, r, pattern, opt_mesh, proxy_intensity,
+                            images_gray[c], images_dx[c], images_dy[c],
+                            warping_fields[c], intr, extrinsic,
+                            visibility_image_to_vertex[c],
+                            option.image_boundary_margin_);
+                };
+                Eigen::MatrixXd JTJ;
+                Eigen::VectorXd JTr;
+                double r2;
+                std::tie(JTJ, JTr, r2) =
+                        ComputeJTJandJTrNonRigid<Eigen::Vector14d, Eigen::Vector14i,
+                                                 Eigen::MatrixXd, Eigen::VectorXd>(
+                                f_lambda, int(visibility_image_to_vertex[c].size()),
+                                nonrigidval, false);
 
-#pragma omp critical(RunNonRigidOptimizer)
-            {
-                residual += r2;
-                residual_reg += rr_reg;
+                double weight = option.non_rigid_anchor_point_weight_ *
+                                visibility_image_to_vertex[c].size() / n_vertex;
+                for (int j = 0; j < nonrigidval; j++) {
+                    double r = weight * (warping_fields[c].flow_(j) -
+                                         warping_fields_init[c].flow_(j));
+                    JTJ(6 + j, 6 + j) += weight * weight;
+                    JTr(6 + j) += weight * r;
+                    rr_reg += r * r;
+                }
+
+                bool success;
+                Eigen::VectorXd result;
+                std::tie(success, result) = utility::SolveLinearSystemPSD(
+                        JTJ, -JTr, /*prefer_sparse=*/false,
+                        /*check_symmetric=*/false,
+                        /*check_det=*/false, /*check_psd=*/false);
+                Eigen::Vector6d result_pose;
+                result_pose << result.block(0, 0, 6, 1);
+                auto delta = utility::TransformVector6dToMatrix4d(result_pose);
+                pose = delta * pose;
+
+                for (int j = 0; j < nonrigidval; j++) {
+                    warping_fields[c].flow_(j) += result(6 + j);
+                }
+                opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
+
+                atomic_sum(residual, r2);
+                atomic_sum(residual_reg, rr_reg);
             }
-        }
+        });
         utility::LogDebug("Residual error : {:.6f}, reg : {:.6f}", residual,
                           residual_reg);
         SetProxyIntensityForVertex(opt_mesh, images_gray, warping_fields,
diff --git a/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp b/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
index b506d401a6f..f507e996098 100644
--- a/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
+++ b/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
@@ -10,6 +10,8 @@
 #include <memory>
 #include <vector>
 
+#include <tbb/parallel_for.h>
+
 #include "open3d/io/ImageIO.h"
 #include "open3d/io/PinholeCameraTrajectoryIO.h"
 #include "open3d/io/TriangleMeshIO.h"
@@ -70,6 +72,15 @@ static void ComputeJacobianAndResidualRigid(
     w = 1.0;  // Dummy.
 }
 
+inline void atomic_sum(std::atomic<double>& total, const double& val) {
+#if defined(__cpp_lib_atomic_float) && __cpp_lib_atomic >= 201711L
+    total.fetch_add(val);
+#else
+    double expected = total;
+    while(!total.compare_exchange_weak(expected, expected + val));
+#endif
+}
+
 std::pair<geometry::TriangleMesh, camera::PinholeCameraTrajectory>
 RunRigidOptimizer(const geometry::TriangleMesh& mesh,
                   const std::vector<geometry::RGBDImage>& images_rgbd,
@@ -136,7 +147,7 @@ RunRigidOptimizer(const geometry::TriangleMesh& mesh,
 
     utility::LogDebug("[ColorMapOptimization] Rigid Optimization");
     std::vector<double> proxy_intensity;
-    int total_num_ = 0;
+    std::atomic<int> total_num_ = 0;
     int n_camera = int(opt_camera_trajectory.parameters_.size());
     SetProxyIntensityForVertex(opt_mesh, images_gray, utility::nullopt,
                                opt_camera_trajectory,
@@ -144,51 +155,48 @@ RunRigidOptimizer(const geometry::TriangleMesh& mesh,
                                option.image_boundary_margin_);
     for (int itr = 0; itr < option.maximum_iteration_; itr++) {
         utility::LogDebug("[Iteration {:04d}] ", itr + 1);
-        double residual = 0.0;
+        std::atomic<double> residual = 0.0;
         total_num_ = 0;
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-        for (int c = 0; c < n_camera; c++) {
-            Eigen::Matrix4d pose;
-            pose = opt_camera_trajectory.parameters_[c].extrinsic_;
-
-            auto intrinsic = opt_camera_trajectory.parameters_[c]
-                                     .intrinsic_.intrinsic_matrix_;
-            auto extrinsic = opt_camera_trajectory.parameters_[c].extrinsic_;
-            Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
-            intr.block<3, 3>(0, 0) = intrinsic;
-            intr(3, 3) = 1.0;
-
-            auto f_lambda = [&](int i, Eigen::Vector6d& J_r, double& r,
-                                double& w) {
-                w = 1.0;  // Dummy.
-                ComputeJacobianAndResidualRigid(
-                        i, J_r, r, w, opt_mesh, proxy_intensity, images_gray[c],
-                        images_dx[c], images_dy[c], intr, extrinsic,
-                        visibility_image_to_vertex[c],
-                        option.image_boundary_margin_);
-            };
-            Eigen::Matrix6d JTJ;
-            Eigen::Vector6d JTr;
-            double r2;
-            std::tie(JTJ, JTr, r2) =
-                    utility::ComputeJTJandJTr<Eigen::Matrix6d, Eigen::Vector6d>(
-                            f_lambda, int(visibility_image_to_vertex[c].size()),
-                            false);
-
-            bool is_success;
-            Eigen::Matrix4d delta;
-            std::tie(is_success, delta) =
-                    utility::SolveJacobianSystemAndObtainExtrinsicMatrix(JTJ,
-                                                                         JTr);
-            pose = delta * pose;
-            opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
-#pragma omp critical(RunRigidOptimizer)
-            {
-                residual += r2;
+        tbb::parallel_for(tbb::blocked_range<int>(0, n_camera, 1),
+                [&](const tbb::blocked_range<int>& range) {
+            for (int c = range.begin(); c < range.end(); ++c) {
+                Eigen::Matrix4d pose;
+                pose = opt_camera_trajectory.parameters_[c].extrinsic_;
+
+                auto intrinsic = opt_camera_trajectory.
+                                 parameters_[c].intrinsic_.intrinsic_matrix_;
+                auto extrinsic = opt_camera_trajectory.
+                                 parameters_[c].extrinsic_;
+                Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
+                intr.block<3, 3>(0, 0) = intrinsic;
+                intr(3, 3) = 1.0;
+
+                auto f_lambda = [&](int i, Eigen::Vector6d& J_r,
+                                    double& r, double& w) {
+                    w = 1.0;  // Dummy.
+                    ComputeJacobianAndResidualRigid(
+                            i, J_r, r, w, opt_mesh, proxy_intensity,
+                            images_gray[c], images_dx[c], images_dy[c], intr,
+                            extrinsic, visibility_image_to_vertex[c],
+                            option.image_boundary_margin_);
+                };
+                Eigen::Matrix6d JTJ;
+                Eigen::Vector6d JTr;
+                double r2;
+                std::tie(JTJ, JTr, r2) = utility::ComputeJTJandJTr<
+                        Eigen::Matrix6d, Eigen::Vector6d>(f_lambda,
+                            int(visibility_image_to_vertex[c].size()), false);
+
+                Eigen::Matrix4d delta;
+                std::tie(std::ignore, delta) =
+                        utility::SolveJacobianSystemAndObtainExtrinsicMatrix(
+                                JTJ, JTr);
+                pose = delta * pose;
+                opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
+                atomic_sum(residual, r2);
                 total_num_ += int(visibility_image_to_vertex[c].size());
             }
-        }
+        });
         if (total_num_ > 0) {
             utility::LogDebug("Residual error : {:.6f} (avg : {:.6f})",
                               residual, residual / total_num_);
diff --git a/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp b/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp
index 7cb4eba6f38..6698fc149d5 100644
--- a/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp
+++ b/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp
@@ -11,6 +11,10 @@
 #include <thread>
 #include <unordered_map>
 
+#include <tbb/parallel_for.h>
+#include <tbb/blocked_range2d.h>
+#include <tbb/blocked_range3d.h>
+
 #include "open3d/geometry/VoxelGrid.h"
 #include "open3d/pipelines/integration/MarchingCubesConst.h"
 #include "open3d/utility/Helper.h"
@@ -257,28 +261,28 @@ std::shared_ptr<geometry::VoxelGrid> UniformTSDFVolume::ExtractVoxelGrid()
     voxel_grid->voxel_size_ = voxel_length_;
     voxel_grid->origin_ = origin_;
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int x = 0; x < resolution_; x++) {
-        for (int y = 0; y < resolution_; y++) {
-            for (int z = 0; z < resolution_; z++) {
-                const int ind = IndexOf(x, y, z);
-                const float w = voxels_[ind].weight_;
-                const float f = voxels_[ind].tsdf_;
-                if (w != 0.0f && f < 0.98f && f >= -0.98f) {
-                    double c = (f + 1.0) * 0.5;
-                    Eigen::Vector3d color = Eigen::Vector3d(c, c, c);
-                    Eigen::Vector3i index = Eigen::Vector3i(x, y, z);
-                    voxel_grid->voxels_[index] = geometry::Voxel(index, color);
+    tbb::parallel_for(tbb::blocked_range3d<int>(
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range3d<int>& range){
+        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
+            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
+                for (int z = range.cols().begin(); z < range.cols().end(); z++){
+                    const int ind = IndexOf(x, y, z);
+                    const float w = voxels_[ind].weight_;
+                    const float f = voxels_[ind].tsdf_;
+                    if (w != 0.0f && f < 0.98f && f >= -0.98f) {
+                        double c = (f + 1.0) * 0.5;
+                        Eigen::Vector3d color = Eigen::Vector3d(c, c, c);
+                        Eigen::Vector3i index = Eigen::Vector3i(x, y, z);
+                        voxel_grid->voxels_[index] =
+                                geometry::Voxel(index, color);
+                    }
                 }
             }
         }
-    }
+    });
     return voxel_grid;
 }
 
@@ -286,23 +290,22 @@ std::vector<Eigen::Vector2d> UniformTSDFVolume::ExtractVolumeTSDF() const {
     std::vector<Eigen::Vector2d> sharedvoxels_;
     sharedvoxels_.resize(voxel_num_);
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int x = 0; x < resolution_; x++) {
-        for (int y = 0; y < resolution_; y++) {
-            for (int z = 0; z < resolution_; z++) {
-                const int ind = IndexOf(x, y, z);
-                const float f = voxels_[ind].tsdf_;
-                const float w = voxels_[ind].weight_;
-                sharedvoxels_[ind] = Eigen::Vector2d(f, w);
+    tbb::parallel_for(tbb::blocked_range3d<int>(
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range3d<int>& range){
+        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
+            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
+                for (int z = range.cols().begin(); z < range.cols().end(); z++){
+                    const int ind = IndexOf(x, y, z);
+                    const float f = voxels_[ind].tsdf_;
+                    const float w = voxels_[ind].weight_;
+                    sharedvoxels_[ind] = Eigen::Vector2d(f, w);
+                }
             }
         }
-    }
+    });
     return sharedvoxels_;
 }
 
@@ -310,61 +313,58 @@ std::vector<Eigen::Vector3d> UniformTSDFVolume::ExtractVolumeColor() const {
     std::vector<Eigen::Vector3d> sharedcolors_;
     sharedcolors_.resize(voxel_num_);
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int x = 0; x < resolution_; x++) {
-        for (int y = 0; y < resolution_; y++) {
-            for (int z = 0; z < resolution_; z++) {
-                const int ind = IndexOf(x, y, z);
-                sharedcolors_[ind] = voxels_[ind].color_;
+    tbb::parallel_for(tbb::blocked_range3d<int>(
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range3d<int>& range){
+        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
+            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
+                for (int z = range.cols().begin(); z < range.cols().end(); z++){
+                    const int ind = IndexOf(x, y, z);
+                    sharedcolors_[ind] = voxels_[ind].color_;
+                }
             }
         }
-    }
+    });
     return sharedcolors_;
 }
 
 void UniformTSDFVolume::InjectVolumeTSDF(
         const std::vector<Eigen::Vector2d> &sharedvoxels) {
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int x = 0; x < resolution_; x++) {
-        for (int y = 0; y < resolution_; y++) {
-            for (int z = 0; z < resolution_; z++) {
-                const int ind = IndexOf(x, y, z);
-                voxels_[ind].tsdf_ = sharedvoxels[ind](0);
-                voxels_[ind].weight_ = sharedvoxels[ind](1);
+    tbb::parallel_for(tbb::blocked_range3d<int>(
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range3d<int>& range){
+        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
+            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
+                for (int z = range.cols().begin(); z < range.cols().end(); z++){
+                    const int ind = IndexOf(x, y, z);
+                    voxels_[ind].tsdf_ = sharedvoxels[ind](0);
+                    voxels_[ind].weight_ = sharedvoxels[ind](1);
+                }
             }
         }
-    }
+    });
 }
 
 void UniformTSDFVolume::InjectVolumeColor(
         const std::vector<Eigen::Vector3d> &sharedcolors) {
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int x = 0; x < resolution_; x++) {
-        for (int y = 0; y < resolution_; y++) {
-            for (int z = 0; z < resolution_; z++) {
-                const int ind = IndexOf(x, y, z);
-                voxels_[ind].color_ = sharedcolors[ind];
+    tbb::parallel_for(tbb::blocked_range3d<int>(
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range3d<int>& range){
+        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
+            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
+                for (int z = range.cols().begin(); z < range.cols().end(); z++){
+                    const int ind = IndexOf(x, y, z);
+                    voxels_[ind].color_ = sharedcolors[ind];
+                }
             }
         }
-    }
+    });
 }
 
 void UniformTSDFVolume::IntegrateWithDepthToCameraDistanceMultiplier(
@@ -385,80 +385,75 @@ void UniformTSDFVolume::IntegrateWithDepthToCameraDistanceMultiplier(
     const float safe_width_f = intrinsic.width_ - 0.0001f;
     const float safe_height_f = intrinsic.height_ - 0.0001f;
 
-#ifdef _WIN32
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#else
-#pragma omp parallel for collapse(2) schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-#endif
-    for (int x = 0; x < resolution_; x++) {
-        for (int y = 0; y < resolution_; y++) {
-            Eigen::Vector4f pt_3d_homo(float(half_voxel_length_f +
-                                             voxel_length_f * x + origin_(0)),
-                                       float(half_voxel_length_f +
-                                             voxel_length_f * y + origin_(1)),
-                                       float(half_voxel_length_f + origin_(2)),
-                                       1.f);
-            Eigen::Vector4f pt_camera = extrinsic_f * pt_3d_homo;
-            for (int z = 0; z < resolution_; z++,
-                     pt_camera(0) += extrinsic_scaled_f(0, 2),
-                     pt_camera(1) += extrinsic_scaled_f(1, 2),
-                     pt_camera(2) += extrinsic_scaled_f(2, 2)) {
-                // Skip if negative depth after projection
-                if (pt_camera(2) <= 0) {
-                    continue;
-                }
-                // Skip if x-y coordinate not in range
-                float u_f = pt_camera(0) * fx / pt_camera(2) + cx + 0.5f;
-                float v_f = pt_camera(1) * fy / pt_camera(2) + cy + 0.5f;
-                if (!(u_f >= 0.0001f && u_f < safe_width_f && v_f >= 0.0001f &&
-                      v_f < safe_height_f)) {
-                    continue;
-                }
-                // Skip if negative depth in depth image
-                int u = (int)u_f;
-                int v = (int)v_f;
-                float d = *image.depth_.PointerAt<float>(u, v);
-                if (d <= 0.0f) {
-                    continue;
-                }
+    tbb::parallel_for(tbb::blocked_range2d<int>(
+            0, resolution_, utility::DefaultGrainSizeTBB2D(),
+            0, resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range2d<int>& range) {
+        for (int x = range.rows().begin(); x < range.rows().end(); ++x) {
+            for (int y = range.cols().begin(); y < range.cols().end(); ++y) {
+                Eigen::Vector4f pt_3d_homo(
+                    float(half_voxel_length_f + voxel_length_f * x + origin_(0)),
+                    float(half_voxel_length_f + voxel_length_f * y + origin_(1)),
+                    float(half_voxel_length_f + origin_(2)), 1.f);
+                Eigen::Vector4f pt_camera = extrinsic_f * pt_3d_homo;
+                for (int z = 0; z < resolution_; z++,
+                         pt_camera(0) += extrinsic_scaled_f(0, 2),
+                         pt_camera(1) += extrinsic_scaled_f(1, 2),
+                         pt_camera(2) += extrinsic_scaled_f(2, 2)) {
+                    // Skip if negative depth after projection
+                    if (pt_camera(2) <= 0) {
+                        continue;
+                    }
+                    // Skip if x-y coordinate not in range
+                    float u_f = pt_camera(0) * fx / pt_camera(2) + cx + 0.5f;
+                    float v_f = pt_camera(1) * fy / pt_camera(2) + cy + 0.5f;
+                    if (!(u_f >= 0.0001f && u_f < safe_width_f && v_f >= 0.0001f &&
+                          v_f < safe_height_f)) {
+                        continue;
+                    }
+                    // Skip if negative depth in depth image
+                    int u = (int)u_f;
+                    int v = (int)v_f;
+                    float d = *image.depth_.PointerAt<float>(u, v);
+                    if (d <= 0.0f) {
+                        continue;
+                    }
 
-                int v_ind = IndexOf(x, y, z);
-                float sdf =
-                        (d - pt_camera(2)) *
-                        (*depth_to_camera_distance_multiplier.PointerAt<float>(
-                                u, v));
-                if (sdf > -sdf_trunc_f) {
-                    // integrate
-                    float tsdf = std::min(1.0f, sdf * sdf_trunc_inv_f);
-                    voxels_[v_ind].tsdf_ =
-                            (voxels_[v_ind].tsdf_ * voxels_[v_ind].weight_ +
-                             tsdf) /
-                            (voxels_[v_ind].weight_ + 1.0f);
-                    if (color_type_ == TSDFVolumeColorType::RGB8) {
-                        const uint8_t *rgb =
-                                image.color_.PointerAt<uint8_t>(u, v, 0);
-                        Eigen::Vector3d rgb_f(rgb[0], rgb[1], rgb[2]);
-                        voxels_[v_ind].color_ =
-                                (voxels_[v_ind].color_ *
-                                         voxels_[v_ind].weight_ +
-                                 rgb_f) /
-                                (voxels_[v_ind].weight_ + 1.0f);
-                    } else if (color_type_ == TSDFVolumeColorType::Gray32) {
-                        const float *intensity =
-                                image.color_.PointerAt<float>(u, v, 0);
-                        voxels_[v_ind].color_ =
-                                (voxels_[v_ind].color_.array() *
-                                         voxels_[v_ind].weight_ +
-                                 (*intensity)) /
+                    int v_ind = IndexOf(x, y, z);
+                    float sdf = (d - pt_camera(2)) *
+                                (*depth_to_camera_distance_multiplier
+                                          .PointerAt<float>(u, v));
+                    if (sdf > -sdf_trunc_f) {
+                        // integrate
+                        float tsdf = std::min(1.0f, sdf * sdf_trunc_inv_f);
+                        voxels_[v_ind].tsdf_ =
+                                (voxels_[v_ind].tsdf_ * voxels_[v_ind].weight_ +
+                                 tsdf) /
                                 (voxels_[v_ind].weight_ + 1.0f);
+                        if (color_type_ == TSDFVolumeColorType::RGB8) {
+                            const uint8_t *rgb =
+                                    image.color_.PointerAt<uint8_t>(u, v, 0);
+                            Eigen::Vector3d rgb_f(rgb[0], rgb[1], rgb[2]);
+                            voxels_[v_ind].color_ =
+                                    (voxels_[v_ind].color_ *
+                                             voxels_[v_ind].weight_ +
+                                     rgb_f) /
+                                    (voxels_[v_ind].weight_ + 1.0f);
+                        } else if (color_type_ == TSDFVolumeColorType::Gray32) {
+                            const float *intensity =
+                                    image.color_.PointerAt<float>(u, v, 0);
+                            voxels_[v_ind].color_ =
+                                    (voxels_[v_ind].color_.array() *
+                                             voxels_[v_ind].weight_ +
+                                     (*intensity)) /
+                                    (voxels_[v_ind].weight_ + 1.0f);
+                        }
+                        voxels_[v_ind].weight_ += 1.0f;
                     }
-                    voxels_[v_ind].weight_ += 1.0f;
                 }
             }
         }
-    }
+    });
 }
 
 Eigen::Vector3d UniformTSDFVolume::GetNormalAt(const Eigen::Vector3d &p) {
diff --git a/cpp/open3d/pipelines/odometry/Odometry.cpp b/cpp/open3d/pipelines/odometry/Odometry.cpp
index dd8b1569caa..d326c74f283 100644
--- a/cpp/open3d/pipelines/odometry/Odometry.cpp
+++ b/cpp/open3d/pipelines/odometry/Odometry.cpp
@@ -7,6 +7,9 @@
 
 #include "open3d/pipelines/odometry/Odometry.h"
 
+#include <tbb/blocked_range2d.h>
+#include <tbb/parallel_reduce.h>
+
 #include <Eigen/Dense>
 #include <memory>
 
@@ -15,161 +18,164 @@
 #include "open3d/pipelines/odometry/RGBDOdometryJacobian.h"
 #include "open3d/utility/Eigen.h"
 #include "open3d/utility/Timer.h"
+#include "open3d/utility/Parallel.h"
 
 namespace open3d {
 namespace pipelines {
 namespace odometry {
 
-static std::tuple<geometry::Image, geometry::Image> InitializeCorrespondenceMap(
-        int width, int height) {
-    // initialization: filling with any (u,v) to (-1,-1)
+struct CorrespondenceReduction {
+    // Global
+    const geometry::Image& depth_s;
+    const geometry::Image& depth_t;
+    const Eigen::Matrix3d& KRK_inv;
+    const Eigen::Vector3d& Kt;
+    const double depth_diff_max;
+    // Local
     geometry::Image correspondence_map;
     geometry::Image depth_buffer;
-    correspondence_map.Prepare(width, height, 2, 4);
-    depth_buffer.Prepare(width, height, 1, 4);
-    for (int v = 0; v < correspondence_map.height_; v++) {
-        for (int u = 0; u < correspondence_map.width_; u++) {
-            *correspondence_map.PointerAt<int>(u, v, 0) = -1;
-            *correspondence_map.PointerAt<int>(u, v, 1) = -1;
-            *depth_buffer.PointerAt<float>(u, v, 0) = -1.0f;
-        }
+
+    CorrespondenceReduction(const geometry::Image& depth_s_,
+                            const geometry::Image& depth_t_,
+                            const Eigen::Matrix3d& KRK_inv_,
+                            const Eigen::Vector3d& Kt_,
+                            const double depth_diff_max_)
+            : depth_s(depth_s_), depth_t(depth_t_), KRK_inv(KRK_inv_),
+            Kt(Kt_), depth_diff_max(depth_diff_max_) {
+        InitializeCorrespondenceMap();
     }
-    return std::make_tuple(correspondence_map, depth_buffer);
-}
 
-static inline void AddElementToCorrespondenceMap(
-        geometry::Image &correspondence_map,
-        geometry::Image &depth_buffer,
-        int u_s,
-        int v_s,
-        int u_t,
-        int v_t,
-        float transformed_d_t) {
-    int exist_u_t, exist_v_t;
-    double exist_d_t;
-    exist_u_t = *correspondence_map.PointerAt<int>(u_s, v_s, 0);
-    exist_v_t = *correspondence_map.PointerAt<int>(u_s, v_s, 1);
-    if (exist_u_t != -1 && exist_v_t != -1) {
-        exist_d_t = *depth_buffer.PointerAt<float>(u_s, v_s);
-        if (transformed_d_t <
-            exist_d_t) {  // update nearer point as correspondence
-            *correspondence_map.PointerAt<int>(u_s, v_s, 0) = u_t;
-            *correspondence_map.PointerAt<int>(u_s, v_s, 1) = v_t;
-            *depth_buffer.PointerAt<float>(u_s, v_s) = transformed_d_t;
-        }
-    } else {  // register correspondence
-        *correspondence_map.PointerAt<int>(u_s, v_s, 0) = u_t;
-        *correspondence_map.PointerAt<int>(u_s, v_s, 1) = v_t;
-        *depth_buffer.PointerAt<float>(u_s, v_s) = transformed_d_t;
+    CorrespondenceReduction(CorrespondenceReduction& o, tbb::split)
+            : depth_s(o.depth_s), depth_t(o.depth_t), KRK_inv(o.KRK_inv),
+            Kt(o.Kt), depth_diff_max(o.depth_diff_max) {
+        InitializeCorrespondenceMap();
+    }
+
+    void InitializeCorrespondenceMap() {
+        const int width = depth_t.width_;
+        const int height = depth_t.height_;
+        correspondence_map.Prepare(depth_t.width_, depth_t.height_, 2, 4);
+        depth_buffer.Prepare(depth_t.width_, depth_t.height_, 1, 4);
+        std::fill(correspondence_map.PointerAt<int>(0, 0, 0),
+                  correspondence_map.PointerAt<int>(width, height, 1), -1);
+        std::fill(depth_buffer.PointerAt<float>(0, 0, 0),
+                  depth_buffer.PointerAt<float>(width, height, 0), -1.0f);
     }
-}
 
-static void MergeCorrespondenceMaps(geometry::Image &correspondence_map,
-                                    geometry::Image &depth_buffer,
-                                    geometry::Image &correspondence_map_part,
-                                    geometry::Image &depth_buffer_part) {
-    for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
-        for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
-            int u_t = *correspondence_map_part.PointerAt<int>(u_s, v_s, 0);
-            int v_t = *correspondence_map_part.PointerAt<int>(u_s, v_s, 1);
-            if (u_t != -1 && v_t != -1) {
-                float transformed_d_t =
-                        *depth_buffer_part.PointerAt<float>(u_s, v_s);
-                AddElementToCorrespondenceMap(correspondence_map, depth_buffer,
-                                              u_s, v_s, u_t, v_t,
-                                              transformed_d_t);
+    void inline AddElementToCorrespondenceMap(
+            int u_s, int v_s, int u_t, int v_t, float transformed_d_t) {
+        int exist_u_t, exist_v_t;
+        double exist_d_t;
+        exist_u_t = *correspondence_map.PointerAt<int>(u_s, v_s, 0);
+        exist_v_t = *correspondence_map.PointerAt<int>(u_s, v_s, 1);
+        if (exist_u_t != -1 && exist_v_t != -1) {
+            exist_d_t = *depth_buffer.PointerAt<float>(u_s, v_s);
+            if (transformed_d_t < exist_d_t) {
+                // update nearer point as correspondence
+                *correspondence_map.PointerAt<int>(u_s, v_s, 0) = u_t;
+                *correspondence_map.PointerAt<int>(u_s, v_s, 1) = v_t;
+                *depth_buffer.PointerAt<float>(u_s, v_s) = transformed_d_t;
             }
+        } else {  // register correspondence
+            *correspondence_map.PointerAt<int>(u_s, v_s, 0) = u_t;
+            *correspondence_map.PointerAt<int>(u_s, v_s, 1) = v_t;
+            *depth_buffer.PointerAt<float>(u_s, v_s) = transformed_d_t;
         }
     }
-}
 
-static int CountCorrespondence(const geometry::Image &correspondence_map) {
-    int correspondence_count = 0;
-    for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
-        for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
-            int u_t = *correspondence_map.PointerAt<int>(u_s, v_s, 0);
-            int v_t = *correspondence_map.PointerAt<int>(u_s, v_s, 1);
-            if (u_t != -1 && v_t != -1) {
-                correspondence_count++;
+    void MergeCorrespondenceMaps(const CorrespondenceReduction& rhs) {
+        for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
+            for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
+                int u_t = *rhs.correspondence_map.PointerAt<int>(u_s, v_s, 0);
+                int v_t = *rhs.correspondence_map.PointerAt<int>(u_s, v_s, 1);
+                if (u_t != -1 && v_t != -1) {
+                    float transformed_d_t =
+                            *rhs.depth_buffer.PointerAt<float>(u_s, v_s);
+                    AddElementToCorrespondenceMap(
+                            u_s, v_s, u_t, v_t, transformed_d_t);
+                }
             }
         }
     }
-    return correspondence_count;
-}
 
-CorrespondenceSetPixelWise ComputeCorrespondence(
-        const Eigen::Matrix3d &intrinsic_matrix,
-        const Eigen::Matrix4d &extrinsic,
-        const geometry::Image &depth_s,
-        const geometry::Image &depth_t,
-        const OdometryOption &option) {
-    const Eigen::Matrix3d K = intrinsic_matrix;
-    const Eigen::Matrix3d K_inv = K.inverse();
-    const Eigen::Matrix3d R = extrinsic.block<3, 3>(0, 0);
-    const Eigen::Matrix3d KRK_inv = K * R * K_inv;
-    Eigen::Vector3d Kt = K * extrinsic.block<3, 1>(0, 3);
-
-    geometry::Image correspondence_map;
-    geometry::Image depth_buffer;
-    std::tie(correspondence_map, depth_buffer) =
-            InitializeCorrespondenceMap(depth_t.width_, depth_t.height_);
-
-#pragma omp parallel
-    {
-        geometry::Image correspondence_map_private;
-        geometry::Image depth_buffer_private;
-        std::tie(correspondence_map_private, depth_buffer_private) =
-                InitializeCorrespondenceMap(depth_t.width_, depth_t.height_);
-#pragma omp for nowait
-        for (int v_s = 0; v_s < depth_s.height_; v_s++) {
-            for (int u_s = 0; u_s < depth_s.width_; u_s++) {
+    void operator()(const tbb::blocked_range2d<int>&r) {
+        for (int v_s = r.rows().begin(); v_s < r.rows().end(); ++v_s) {
+            for (int u_s = r.cols().begin(); u_s < r.cols().end(); u_s++) {
                 double d_s = *depth_s.PointerAt<float>(u_s, v_s);
                 if (!std::isnan(d_s)) {
                     Eigen::Vector3d uv_in_s =
                             d_s * KRK_inv * Eigen::Vector3d(u_s, v_s, 1.0) + Kt;
                     double transformed_d_s = uv_in_s(2);
-                    int u_t = (int)(uv_in_s(0) / transformed_d_s + 0.5);
-                    int v_t = (int)(uv_in_s(1) / transformed_d_s + 0.5);
+                    int u_t = (int)std::round(uv_in_s(0) / transformed_d_s);
+                    int v_t = (int)std::round(uv_in_s(1) / transformed_d_s);
                     if (u_t >= 0 && u_t < depth_t.width_ && v_t >= 0 &&
                         v_t < depth_t.height_) {
                         double d_t = *depth_t.PointerAt<float>(u_t, v_t);
                         if (!std::isnan(d_t) &&
                             std::abs(transformed_d_s - d_t) <=
-                                    option.depth_diff_max_) {
+                                    depth_diff_max) {
                             AddElementToCorrespondenceMap(
-                                    correspondence_map_private,
-                                    depth_buffer_private, u_s, v_s, u_t, v_t,
-                                    (float)d_s);
+                                    u_s, v_s, u_t, v_t, (float)d_s);
                         }
                     }
                 }
             }
         }
-#pragma omp critical(ComputeCorrespondence)
-        {
-            MergeCorrespondenceMaps(correspondence_map, depth_buffer,
-                                    correspondence_map_private,
-                                    depth_buffer_private);
-
-        }  //    omp critical
-    }      //    omp parallel
-
-    CorrespondenceSetPixelWise correspondence;
-    int correspondence_count = CountCorrespondence(correspondence_map);
-    correspondence.resize(correspondence_count);
-    int cnt = 0;
-    for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
-        for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
-            int u_t = *correspondence_map.PointerAt<int>(u_s, v_s, 0);
-            int v_t = *correspondence_map.PointerAt<int>(u_s, v_s, 1);
-            if (u_t != -1 && v_t != -1) {
-                Eigen::Vector4i pixel_correspondence(u_s, v_s, u_t, v_t);
-                correspondence[cnt] = pixel_correspondence;
-                cnt++;
+    }
+
+    inline void join(const CorrespondenceReduction& rhs) {
+        MergeCorrespondenceMaps(rhs);
+    }
+
+    std::size_t CountCorrespondence() const {
+        std::size_t correspondence_count = 0;
+        for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
+            for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
+                int u_t = *correspondence_map.PointerAt<int>(u_s, v_s, 0);
+                int v_t = *correspondence_map.PointerAt<int>(u_s, v_s, 1);
+                if (u_t != -1 && v_t != -1) {
+                    correspondence_count++;
+                }
             }
         }
+        return correspondence_count;
     }
-    return correspondence;
+
+    CorrespondenceSetPixelWise correspondences() const {
+        CorrespondenceSetPixelWise out;
+        out.reserve(CountCorrespondence());
+        for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
+            for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
+                int u_t = *correspondence_map.PointerAt<int>(u_s, v_s, 0);
+                int v_t = *correspondence_map.PointerAt<int>(u_s, v_s, 1);
+                if (u_t != -1 && v_t != -1) {
+                    out.emplace_back(u_s, v_s, u_t, v_t);
+                }
+            }
+        }
+        return out;
+    }
+};
+
+CorrespondenceSetPixelWise ComputeCorrespondence(
+        const Eigen::Matrix3d &intrinsic_matrix,
+        const Eigen::Matrix4d &extrinsic,
+        const geometry::Image &depth_s,
+        const geometry::Image &depth_t,
+        const OdometryOption &option) {
+    const Eigen::Matrix3d K = intrinsic_matrix;
+    const Eigen::Matrix3d K_inv = K.inverse();
+    const Eigen::Matrix3d R = extrinsic.block<3, 3>(0, 0);
+    const Eigen::Matrix3d KRK_inv = K * R * K_inv;
+    const Eigen::Vector3d Kt = K * extrinsic.block<3, 1>(0, 3);
+
+    CorrespondenceReduction reducer(depth_s, depth_t,
+        KRK_inv, Kt, option.depth_diff_max_);
+    tbb::parallel_reduce(tbb::blocked_range2d<int>(
+            0, depth_s.height_, utility::DefaultGrainSizeTBB2D(),
+            0, depth_s.width_, utility::DefaultGrainSizeTBB2D()), reducer);
+
+    return reducer.correspondences();
 }
 
 static std::shared_ptr<geometry::Image> ConvertDepthImageToXYZImage(
@@ -215,6 +221,54 @@ static std::vector<Eigen::Matrix3d> CreateCameraMatrixPyramid(
     return pyramid_camera_matrix;
 }
 
+struct InformationMatrixReducer {
+    // Globals
+    const CorrespondenceSetPixelWise& corres;
+    const geometry::Image& xyz_t;
+    // Locals
+    Eigen::Matrix6d GTG;
+
+    InformationMatrixReducer(const CorrespondenceSetPixelWise& corres_,
+                             const geometry::Image& xyz_t_)
+        : corres(corres_), xyz_t(xyz_t_), GTG(Eigen::Matrix6d::Zero()) {}
+
+    InformationMatrixReducer(InformationMatrixReducer& o, tbb::split)
+        : corres(o.corres), xyz_t(o.xyz_t), GTG(Eigen::Matrix6d::Zero()) {}
+
+    void operator()(const tbb::blocked_range<std::size_t>& range) {
+        // write q^*
+        // see http://redwood-data.org/indoor/registration.html
+        // note: I comes first in this implementation
+        Eigen::Vector6d G_r;
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            int u_t = corres[i](2);
+            int v_t = corres[i](3);
+            double x = *xyz_t.PointerAt<float>(u_t, v_t, 0);
+            double y = *xyz_t.PointerAt<float>(u_t, v_t, 1);
+            double z = *xyz_t.PointerAt<float>(u_t, v_t, 2);
+            G_r.setZero();
+            G_r(1) = z;
+            G_r(2) = -y;
+            G_r(3) = 1.0;
+            GTG.noalias() += G_r * G_r.transpose();
+            G_r.setZero();
+            G_r(0) = -z;
+            G_r(2) = x;
+            G_r(4) = 1.0;
+            GTG.noalias() += G_r * G_r.transpose();
+            G_r.setZero();
+            G_r(0) = y;
+            G_r(1) = -x;
+            G_r(5) = 1.0;
+            GTG.noalias() += G_r * G_r.transpose();
+        }
+    }
+
+    void join(InformationMatrixReducer& other) {
+        GTG += other.GTG;
+    }
+};
+
 static Eigen::Matrix6d CreateInformationMatrix(
         const Eigen::Matrix4d &extrinsic,
         const camera::PinholeCameraIntrinsic &pinhole_camera_intrinsic,
@@ -228,41 +282,10 @@ static Eigen::Matrix6d CreateInformationMatrix(
     auto xyz_t = ConvertDepthImageToXYZImage(
             depth_t, pinhole_camera_intrinsic.intrinsic_matrix_);
 
-    // write q^*
-    // see http://redwood-data.org/indoor/registration.html
-    // note: I comes first and q_skew is scaled by factor 2.
-    Eigen::Matrix6d GTG = Eigen::Matrix6d::Identity();
-#pragma omp parallel
-    {
-        Eigen::Matrix6d GTG_private = Eigen::Matrix6d::Identity();
-        Eigen::Vector6d G_r_private = Eigen::Vector6d::Zero();
-#pragma omp for nowait
-        for (int row = 0; row < int(correspondence.size()); row++) {
-            int u_t = correspondence[row](2);
-            int v_t = correspondence[row](3);
-            double x = *xyz_t->PointerAt<float>(u_t, v_t, 0);
-            double y = *xyz_t->PointerAt<float>(u_t, v_t, 1);
-            double z = *xyz_t->PointerAt<float>(u_t, v_t, 2);
-            G_r_private.setZero();
-            G_r_private(1) = z;
-            G_r_private(2) = -y;
-            G_r_private(3) = 1.0;
-            GTG_private.noalias() += G_r_private * G_r_private.transpose();
-            G_r_private.setZero();
-            G_r_private(0) = -z;
-            G_r_private(2) = x;
-            G_r_private(4) = 1.0;
-            GTG_private.noalias() += G_r_private * G_r_private.transpose();
-            G_r_private.setZero();
-            G_r_private(0) = y;
-            G_r_private(1) = -x;
-            G_r_private(5) = 1.0;
-            GTG_private.noalias() += G_r_private * G_r_private.transpose();
-        }
-#pragma omp critical(CreateInformationMatrix)
-        { GTG += GTG_private; }
-    }
-    return GTG;
+    InformationMatrixReducer reducer(correspondence, *xyz_t.get());
+    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(0,
+        correspondence.size(), utility::DefaultGrainSizeTBB()), reducer);
+    return std::move(reducer.GTG);
 }
 
 static void NormalizeIntensity(
diff --git a/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp b/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp
index f0e48b43afe..da9dab27a82 100644
--- a/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp
+++ b/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp
@@ -8,6 +8,8 @@
 #include "open3d/pipelines/registration/FastGlobalRegistration.h"
 
 #include <map>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_invoke.h>
 
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
@@ -15,6 +17,7 @@
 #include "open3d/pipelines/registration/Registration.h"
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Random.h"
+#include "open3d/utility/Parallel.h"
 
 namespace open3d {
 namespace pipelines {
@@ -24,34 +27,44 @@ static std::vector<std::pair<int, int>> InitialMatching(
         const Feature& src_features, const Feature& dst_features) {
     geometry::KDTreeFlann src_feature_tree(src_features);
     geometry::KDTreeFlann dst_feature_tree(dst_features);
-    std::map<int, int> corres_ij;
+    std::vector<int> corres_ij(src_features.data_.cols(), -1);
     std::vector<int> corres_ji(dst_features.data_.cols(), -1);
 
-#pragma omp for nowait
-    for (int j = 0; j < dst_features.data_.cols(); j++) {
-        std::vector<int> corres_tmp(1);
-        std::vector<double> dist_tmp(1);
-        src_feature_tree.SearchKNN(Eigen::VectorXd(dst_features.data_.col(j)),
-                                   1, corres_tmp, dist_tmp);
-        int i = corres_tmp[0];
-        corres_ji[j] = i;
-        if (corres_ij.find(i) == corres_ij.end()) {
-            // set a temp value to prevent other threads recomputing
-            // corres_ij[i] until the following dst_feature_tree.SearchKNN()
-            // call completes. There is still a race condition but the result
-            // would be fine since both threads will compute the same result
-            corres_ij[i] = -1;
-            dst_feature_tree.SearchKNN(
-                    Eigen::VectorXd(src_features.data_.col(i)), 1, corres_tmp,
-                    dist_tmp);
-            corres_ij[i] = corres_tmp[0];
-        }
-    }
+    tbb::parallel_invoke([&](){
+        tbb::parallel_for(tbb::blocked_range<int>(0,
+                src_features.data_.cols(), utility::DefaultGrainSizeTBB()),
+                [&](const tbb::blocked_range<int>& range) {
+            for (int i = range.begin(); i < range.end(); ++i) {
+                std::vector<int> corres_tmp(1);
+                std::vector<double> dist_tmp(1);
+                dst_feature_tree.SearchKNN(
+                        Eigen::VectorXd(src_features.data_.col(i)),
+                        1, corres_tmp, dist_tmp);
+                corres_ij[i] = corres_tmp[0];
+            }
+        });
+    }, [&](){
+        tbb::parallel_for(tbb::blocked_range<int>(0,
+                dst_features.data_.cols(), utility::DefaultGrainSizeTBB()),
+                [&](const tbb::blocked_range<int>& range) {
+            for (int j = range.begin(); j < range.end(); ++j) {
+                std::vector<int> corres_tmp(1);
+                std::vector<double> dist_tmp(1);
+                src_feature_tree.SearchKNN(
+                        Eigen::VectorXd(dst_features.data_.col(j)),
+                        1, corres_tmp, dist_tmp);
+                corres_ji[j] = corres_tmp[0];
+            }
+        });
+    });
 
     utility::LogDebug("\t[cross check] ");
     std::vector<std::pair<int, int>> corres_cross;
-    for (const std::pair<const int, int>& ij : corres_ij) {
-        if (corres_ji[ij.second] == ij.first) corres_cross.push_back(ij);
+    for (int i = 0; i < static_cast<int>(corres_ij.size()); ++i) {
+        int j = corres_ij[i];
+        if (corres_ji[j] == i) {
+            corres_cross.emplace_back(i, j);
+        }
     }
     utility::LogDebug("Initial matchings : {}", corres_cross.size());
     return corres_cross;
diff --git a/cpp/open3d/pipelines/registration/Feature.cpp b/cpp/open3d/pipelines/registration/Feature.cpp
index 2815159dcc4..73580575b02 100644
--- a/cpp/open3d/pipelines/registration/Feature.cpp
+++ b/cpp/open3d/pipelines/registration/Feature.cpp
@@ -9,6 +9,8 @@
 
 #include <Eigen/Dense>
 
+#include <tbb/parallel_for.h>
+
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/utility/Logging.h"
@@ -58,36 +60,38 @@ static std::shared_ptr<Feature> ComputeSPFHFeature(
         const geometry::KDTreeSearchParam &search_param) {
     auto feature = std::make_shared<Feature>();
     feature->Resize(33, (int)input.points_.size());
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)input.points_.size(); i++) {
-        const auto &point = input.points_[i];
-        const auto &normal = input.normals_[i];
-        std::vector<int> indices;
-        std::vector<double> distance2;
-        if (kdtree.Search(point, search_param, indices, distance2) > 1) {
-            // only compute SPFH feature when a point has neighbors
-            double hist_incr = 100.0 / (double)(indices.size() - 1);
-            for (size_t k = 1; k < indices.size(); k++) {
-                // skip the point itself, compute histogram
-                auto pf = ComputePairFeatures(point, normal,
-                                              input.points_[indices[k]],
-                                              input.normals_[indices[k]]);
-                int h_index = (int)(floor(11 * (pf(0) + M_PI) / (2.0 * M_PI)));
-                if (h_index < 0) h_index = 0;
-                if (h_index >= 11) h_index = 10;
-                feature->data_(h_index, i) += hist_incr;
-                h_index = (int)(floor(11 * (pf(1) + 1.0) * 0.5));
-                if (h_index < 0) h_index = 0;
-                if (h_index >= 11) h_index = 10;
-                feature->data_(h_index + 11, i) += hist_incr;
-                h_index = (int)(floor(11 * (pf(2) + 1.0) * 0.5));
-                if (h_index < 0) h_index = 0;
-                if (h_index >= 11) h_index = 10;
-                feature->data_(h_index + 22, i) += hist_incr;
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, input.points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            const auto &point = input.points_[i];
+            const auto &normal = input.normals_[i];
+            std::vector<int> indices;
+            std::vector<double> distance2;
+            if (kdtree.Search(point, search_param, indices, distance2) > 1) {
+                // only compute SPFH feature when a point has neighbors
+                double hist_incr = 100.0 / (double)(indices.size() - 1);
+                for (size_t k = 1; k < indices.size(); k++) {
+                    // skip the point itself, compute histogram
+                    auto pf = ComputePairFeatures(point, normal,
+                                                  input.points_[indices[k]],
+                                                  input.normals_[indices[k]]);
+                    int h_index = (int)(floor(11 * (pf(0) + M_PI) / (2.0 * M_PI)));
+                    if (h_index < 0) h_index = 0;
+                    if (h_index >= 11) h_index = 10;
+                    feature->data_(h_index, i) += hist_incr;
+                    h_index = (int)(floor(11 * (pf(1) + 1.0) * 0.5));
+                    if (h_index < 0) h_index = 0;
+                    if (h_index >= 11) h_index = 10;
+                    feature->data_(h_index + 11, i) += hist_incr;
+                    h_index = (int)(floor(11 * (pf(2) + 1.0) * 0.5));
+                    if (h_index < 0) h_index = 0;
+                    if (h_index >= 11) h_index = 10;
+                    feature->data_(h_index + 22, i) += hist_incr;
+                }
             }
         }
-    }
+    });
     return feature;
 }
 
@@ -105,37 +109,40 @@ std::shared_ptr<Feature> ComputeFPFHFeature(
     if (spfh == nullptr) {
         utility::LogError("Internal error: SPFH feature is nullptr.");
     }
-#pragma omp parallel for schedule(static) \
-        num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)input.points_.size(); i++) {
-        const auto &point = input.points_[i];
-        std::vector<int> indices;
-        std::vector<double> distance2;
-        if (kdtree.Search(point, search_param, indices, distance2) > 1) {
-            double sum[3] = {0.0, 0.0, 0.0};
-            for (size_t k = 1; k < indices.size(); k++) {
-                // skip the point itself
-                double dist = distance2[k];
-                if (dist == 0.0) continue;
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, input.points_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            const auto &point = input.points_[i];
+            std::vector<int> indices;
+            std::vector<double> distance2;
+            if (kdtree.Search(point, search_param, indices, distance2) > 1) {
+                double sum[3] = {0.0, 0.0, 0.0};
+                for (size_t k = 1; k < indices.size(); k++) {
+                    // skip the point itself
+                    double dist = distance2[k];
+                    if (dist == 0.0) continue;
+                    for (int j = 0; j < 33; j++) {
+                        double val = spfh->data_(j, indices[k]) / dist;
+                        sum[j / 11] += val;
+                        feature->data_(j, i) += val;
+                    }
+                }
+                for (int j = 0; j < 3; j++)
+                    if (sum[j] != 0.0) sum[j] = 100.0 / sum[j];
                 for (int j = 0; j < 33; j++) {
-                    double val = spfh->data_(j, indices[k]) / dist;
-                    sum[j / 11] += val;
-                    feature->data_(j, i) += val;
+                    feature->data_(j, i) *= sum[j / 11];
+                    // The commented line is the fpfh function in the paper.
+                    // But according to PCL implementation, it is skipped.
+                    // Our initial test shows that the full fpfh function in the
+                    // paper seems to be better than PCL implementation. Further
+                    // test required.
+                    feature->data_(j, i) += spfh->data_(j, i);
                 }
             }
-            for (int j = 0; j < 3; j++)
-                if (sum[j] != 0.0) sum[j] = 100.0 / sum[j];
-            for (int j = 0; j < 33; j++) {
-                feature->data_(j, i) *= sum[j / 11];
-                // The commented line is the fpfh function in the paper.
-                // But according to PCL implementation, it is skipped.
-                // Our initial test shows that the full fpfh function in the
-                // paper seems to be better than PCL implementation. Further
-                // test required.
-                feature->data_(j, i) += spfh->data_(j, i);
-            }
         }
-    }
+    });
     return feature;
 }
 
@@ -153,33 +160,33 @@ CorrespondenceSet CorrespondencesFromFeatures(const Feature &source_features,
                                int(target_features.data_.cols())};
     std::vector<CorrespondenceSet> corres(num_searches);
 
-    const int kMaxThreads = utility::EstimateMaxThreads();
-    const int kOuterThreads = std::min(kMaxThreads, num_searches);
-    const int kInnerThreads = std::max(kMaxThreads / num_searches, 1);
-    (void)kOuterThreads;  // Avoids compiler warning if OpenMP is disabled
-    (void)kInnerThreads;
-#pragma omp parallel for num_threads(kOuterThreads)
-    for (int k = 0; k < num_searches; ++k) {
-        geometry::KDTreeFlann kdtree(features[1 - k]);
-
-        int num_pts_k = num_pts[k];
-        corres[k] = CorrespondenceSet(num_pts_k);
-#pragma omp parallel for num_threads(kInnerThreads)
-        for (int i = 0; i < num_pts_k; i++) {
-            std::vector<int> corres_tmp(1);
-            std::vector<double> dist_tmp(1);
-
-            kdtree.SearchKNN(Eigen::VectorXd(features[k].get().data_.col(i)), 1,
-                             corres_tmp, dist_tmp);
-            int j = corres_tmp[0];
-            corres[k][i] = Eigen::Vector2i(i, j);
+    tbb::parallel_for(tbb::blocked_range<int>(0, num_searches, 1),
+            [&](const tbb::blocked_range<int>& range) {
+        for (int k = range.begin(); k < range.end(); ++k) {
+            geometry::KDTreeFlann kdtree(features[1 - k]);
+
+            int num_pts_k = num_pts[k];
+            corres[k] = CorrespondenceSet(num_pts_k);
+            tbb::parallel_for(tbb::blocked_range<int>(
+                    0, num_pts_k, utility::DefaultGrainSizeTBB()),
+                    [&](const tbb::blocked_range<int>& range) {
+                for (int i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> corres_tmp(1);
+                    std::vector<double> dist_tmp(1);
+
+                    kdtree.SearchKNN(
+                            Eigen::VectorXd(features[k].get().data_.col(i)),
+                            1, corres_tmp, dist_tmp);
+                    int j = corres_tmp[0];
+                    corres[k][i] = Eigen::Vector2i(i, j);
+                }
+            });
         }
-    }
+    });
 
     // corres[0]: corres_ij, corres[1]: corres_ji
     if (!mutual_filter) return corres[0];
 
-    // should not use parallel for due to emplace back
     CorrespondenceSet corres_mutual;
     int num_src_pts = num_pts[0];
     for (int i = 0; i < num_src_pts; ++i) {
diff --git a/cpp/open3d/pipelines/registration/GeneralizedICP.cpp b/cpp/open3d/pipelines/registration/GeneralizedICP.cpp
index c529fdcc639..99911fb0327 100644
--- a/cpp/open3d/pipelines/registration/GeneralizedICP.cpp
+++ b/cpp/open3d/pipelines/registration/GeneralizedICP.cpp
@@ -14,6 +14,7 @@
 
 #include <Eigen/Dense>
 #include <unsupported/Eigen/MatrixFunctions>
+#include <tbb/parallel_for.h>
 
 #include "open3d/geometry/KDTreeSearchParam.h"
 #include "open3d/geometry/PointCloud.h"
@@ -64,11 +65,16 @@ std::shared_ptr<geometry::PointCloud> InitializePointCloudForGeneralizedICP(
 
     output->covariances_.resize(output->points_.size());
     const Eigen::Matrix3d C = Eigen::Vector3d(epsilon, 1, 1).asDiagonal();
-#pragma omp parallel for schedule(static)
-    for (int i = 0; i < (int)output->normals_.size(); i++) {
-        const auto Rx = GetRotationFromE1ToX(output->normals_[i]);
-        output->covariances_[i] = Rx * C * Rx.transpose();
-    }
+
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, output->normals_.size(), 1),
+            [&](const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            const auto Rx = GetRotationFromE1ToX(output->normals_[i]);
+            output->covariances_[i] = Rx * C * Rx.transpose();
+        }
+    });
+
     return output;
 }
 }  // namespace
diff --git a/cpp/open3d/pipelines/registration/Registration.cpp b/cpp/open3d/pipelines/registration/Registration.cpp
index 6820d2f7c33..a418ee9038a 100644
--- a/cpp/open3d/pipelines/registration/Registration.cpp
+++ b/cpp/open3d/pipelines/registration/Registration.cpp
@@ -7,6 +7,9 @@
 
 #include "open3d/pipelines/registration/Registration.h"
 
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/pipelines/registration/Feature.h"
@@ -18,6 +21,50 @@ namespace open3d {
 namespace pipelines {
 namespace registration {
 
+struct RegistrationReduction {
+    // Globals:
+    const geometry::KDTreeFlann& target_kdtree;
+    const geometry::PointCloud& source;
+    double max_distance;
+    // Locals:
+    CorrespondenceSet correspondences;
+    double error2;
+
+    RegistrationReduction(const geometry::KDTreeFlann& target_kdtree_,
+                          const geometry::PointCloud& source_,
+                          double max_correspondence_distance_)
+        : target_kdtree(target_kdtree_), source(source_),
+          max_distance(max_correspondence_distance_),
+          correspondences(), error2(0.0) {}
+
+    RegistrationReduction(RegistrationReduction& other, tbb::split)
+        : target_kdtree(other.target_kdtree), source(other.source),
+          max_distance(other.max_distance), correspondences(), error2(0.0) {}
+
+    void operator()(const tbb::blocked_range<std::size_t>& range) {
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices(1);
+            std::vector<double> dists(1);
+            const auto &point = source.points_[i];
+            if (target_kdtree.SearchHybrid(point,
+                    max_distance, 1, indices, dists) > 0) {
+                error2 += dists[0];
+                correspondences.emplace_back(i, indices[0]);
+            }
+        }
+    }
+
+    void join(RegistrationReduction& other) {
+        correspondences.insert(correspondences.end(),
+            other.correspondences.begin(), other.correspondences.end());
+        error2 += other.error2;
+    }
+
+    std::tuple<CorrespondenceSet, double> as_tuple() && {
+        return {std::move(correspondences), error2};
+    }
+};
+
 static RegistrationResult GetRegistrationResultAndCorrespondences(
         const geometry::PointCloud &source,
         const geometry::PointCloud &target,
@@ -29,33 +76,14 @@ static RegistrationResult GetRegistrationResultAndCorrespondences(
         return result;
     }
 
-    double error2 = 0.0;
+    RegistrationReduction reducer(target_kdtree,
+        source, max_correspondence_distance);
+    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
+        0, source.points_.size(), utility::DefaultGrainSizeTBB()), reducer);
 
-#pragma omp parallel
-    {
-        double error2_private = 0.0;
-        CorrespondenceSet correspondence_set_private;
-#pragma omp for nowait
-        for (int i = 0; i < (int)source.points_.size(); i++) {
-            std::vector<int> indices(1);
-            std::vector<double> dists(1);
-            const auto &point = source.points_[i];
-            if (target_kdtree.SearchHybrid(point, max_correspondence_distance,
-                                           1, indices, dists) > 0) {
-                error2_private += dists[0];
-                correspondence_set_private.push_back(
-                        Eigen::Vector2i(i, indices[0]));
-            }
-        }
-#pragma omp critical(GetRegistrationResultAndCorrespondences)
-        {
-            for (int i = 0; i < (int)correspondence_set_private.size(); i++) {
-                result.correspondence_set_.push_back(
-                        correspondence_set_private[i]);
-            }
-            error2 += error2_private;
-        }
-    }
+    double error2;
+    std::tie(result.correspondence_set_, error2)
+            = std::move(reducer).as_tuple();
 
     if (result.correspondence_set_.empty()) {
         result.fitness_ = 0.0;
@@ -167,39 +195,64 @@ RegistrationResult RegistrationICP(
     return result;
 }
 
-RegistrationResult RegistrationRANSACBasedOnCorrespondence(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
-        const CorrespondenceSet &corres,
-        double max_correspondence_distance,
-        const TransformationEstimation &estimation
-        /* = TransformationEstimationPointToPoint(false)*/,
-        int ransac_n /* = 3*/,
-        const std::vector<std::reference_wrapper<const CorrespondenceChecker>>
-                &checkers /* = {}*/,
-        const RANSACConvergenceCriteria &criteria
-        /* = RANSACConvergenceCriteria()*/) {
-    if (ransac_n < 3 || (int)corres.size() < ransac_n ||
-        max_correspondence_distance <= 0.0) {
-        return RegistrationResult();
-    }
+template <typename T>
+void atomic_min(std::atomic<T>& min_val, const T& val) noexcept {
+    T prev_val = min_val;
+    while (prev_val > val && min_val.compare_exchange_weak(prev_val, val));
+}
 
+struct RANSACCorrespondenceReduction {
+    // Globals
+    const geometry::PointCloud& source;
+    const geometry::PointCloud& target;
+    const CorrespondenceSet& corres;
+    const TransformationEstimation& estimation;
+    using CheckerType = std::vector<std::reference_wrapper<
+            const CorrespondenceChecker>>;
+    const CheckerType& checkers;
+    const geometry::KDTreeFlann& kdtree;
+    std::atomic<int>& est_k_global;
+    std::atomic<int>& total_validation;
+    using RandomIntGen = utility::random::UniformIntGenerator<int>;
+    RandomIntGen& rand_gen;
+    // Constants
+    const double max_distance;
+    const int ransac_n;
+    const double log_confidence;
+    // Locals
+    CorrespondenceSet ransac_corres;
     RegistrationResult best_result;
-    geometry::KDTreeFlann kdtree(target);
-    int est_k_global = criteria.max_iteration_;
-    int total_validation = 0;
-
-#pragma omp parallel
-    {
-        CorrespondenceSet ransac_corres(ransac_n);
-        RegistrationResult best_result_local;
-        int est_k_local = criteria.max_iteration_;
-        utility::random::UniformIntGenerator<int> rand_gen(0,
-                                                           corres.size() - 1);
-
-#pragma omp for nowait
-        for (int itr = 0; itr < criteria.max_iteration_; itr++) {
-            if (itr < est_k_global) {
+
+
+    RANSACCorrespondenceReduction(const geometry::PointCloud& source_,
+                                  const geometry::PointCloud& target_,
+                                  const CorrespondenceSet& corres_,
+                                  const TransformationEstimation& estimation_,
+                                  const CheckerType& checkers_,
+                                  const geometry::KDTreeFlann& kdtree_,
+                                  std::atomic<int>& est_k_global_,
+                                  std::atomic<int>& total_validation_,
+                                  RandomIntGen& rand_gen_,
+                                  double max_dist_,
+                                  int ransac_n_,
+                                  double confidence)
+        : source(source_), target(target_), corres(corres_),
+          estimation(estimation_), checkers(checkers_), kdtree(kdtree_),
+          est_k_global(est_k_global_), total_validation(total_validation_),
+          rand_gen(rand_gen_), max_distance(max_dist_), ransac_n(ransac_n_),
+          log_confidence(std::log(1.0 - confidence)), ransac_corres(ransac_n) {}
+
+    RANSACCorrespondenceReduction(RANSACCorrespondenceReduction& o,
+        tbb::split): source(o.source), target(o.target), corres(o.corres),
+            estimation(o.estimation), checkers(o.checkers), kdtree(o.kdtree),
+            est_k_global(o.est_k_global), total_validation(o.total_validation),
+            rand_gen(o.rand_gen), max_distance(o.max_distance),
+            ransac_n(o.ransac_n), log_confidence(o.log_confidence) {}
+
+    void operator()(const tbb::blocked_range<int>& range) {
+        int est_k_local = est_k_global;
+        for (int i = range.begin(); i < range.end(); ++i) {
+            if (i < est_k_global) {
                 for (int j = 0; j < ransac_n; j++) {
                     ransac_corres[j] = corres[rand_gen()];
                 }
@@ -209,69 +262,83 @@ RegistrationResult RegistrationRANSACBasedOnCorrespondence(
                                                          ransac_corres);
 
                 // Check transformation: inexpensive
-                bool check = true;
-                for (const auto &checker : checkers) {
-                    if (!checker.get().Check(source, target, ransac_corres,
-                                             transformation)) {
-                        check = false;
-                        break;
-                    }
+                if (!std::all_of(checkers.begin(), checkers.end(),
+                        [&](const auto& checker){ return checker.get().Check(
+                    source, target, ransac_corres, transformation); })) {
+                    continue;
                 }
-                if (!check) continue;
 
                 // Expensive validation
                 geometry::PointCloud pcd = source;
                 pcd.Transform(transformation);
                 auto result = GetRegistrationResultAndCorrespondences(
-                        pcd, target, kdtree, max_correspondence_distance,
-                        transformation);
+                        pcd, target, kdtree, max_distance, transformation);
 
-                if (result.IsBetterRANSACThan(best_result_local)) {
-                    best_result_local = result;
+                if (result.IsBetterRANSACThan(best_result)) {
+                    best_result = std::move(result);
 
                     double corres_inlier_ratio =
                             EvaluateInlierCorrespondenceRatio(
-                                    pcd, target, corres,
-                                    max_correspondence_distance,
+                                    pcd, target, corres, max_distance,
                                     transformation);
 
-                    // Update exit condition if necessary
-                    double est_k_local_d =
-                            std::log(1.0 - criteria.confidence_) /
-                            std::log(1.0 -
-                                     std::pow(corres_inlier_ratio, ransac_n));
-                    // This prevents having a negative number of iterations:
-                    // est_k_local_d = -inf if corres_inlier_ratio = 0.0
-                    est_k_local_d =
-                            est_k_local_d < 0 ? est_k_local : est_k_local_d;
-                    est_k_local =
-                            est_k_local_d < est_k_global
-                                    ? static_cast<int>(std::ceil(est_k_local_d))
-                                    : est_k_local;
+                    // Update exit condition if necessary.
+                    // If confidence is 1.0, then it is safely inf, we always
+                    // consume all the iterations.
+                    double est_k_local_d = log_confidence /
+                        std::log(1.0 - std::pow(corres_inlier_ratio, ransac_n));
+                    if (est_k_local_d < 0) {
+                        est_k_local_d = est_k_local;
+                    }
+                    if (est_k_local_d < est_k_global) {
+                        est_k_local = std::ceil(est_k_local_d);
+                    }
                     utility::LogDebug(
                             "Thread {:06d}: registration fitness={:.3f}, "
-                            "corres inlier ratio={:.3f}, "
-                            "Est. max k = {}",
-                            itr, result.fitness_, corres_inlier_ratio,
+                            "corres inlier ratio={:.3f}, Est. max k = {}",
+                            i, best_result.fitness_, corres_inlier_ratio,
                             est_k_local_d);
                 }
-#pragma omp critical
-                {
-                    total_validation += 1;
-                    if (est_k_local < est_k_global) {
-                        est_k_global = est_k_local;
-                    }
-                }
-            }  // if
-        }      // for loop
-
-#pragma omp critical(RegistrationRANSACBasedOnCorrespondence)
-        {
-            if (best_result_local.IsBetterRANSACThan(best_result)) {
-                best_result = best_result_local;
+                total_validation += 1;
+                atomic_min(est_k_global, est_k_local);
             }
         }
     }
+
+    void join(RANSACCorrespondenceReduction& other) {
+        if (!best_result.IsBetterRANSACThan(other.best_result)) {
+            best_result = std::move(other.best_result);
+        }
+    }
+};
+
+RegistrationResult RegistrationRANSACBasedOnCorrespondence(
+        const geometry::PointCloud &source,
+        const geometry::PointCloud &target,
+        const CorrespondenceSet &corres,
+        double max_correspondence_distance,
+        const TransformationEstimation &estimation
+        /* = TransformationEstimationPointToPoint(false)*/,
+        int ransac_n /* = 3*/,
+        const std::vector<std::reference_wrapper<const CorrespondenceChecker>>
+                &checkers /* = {}*/,
+        const RANSACConvergenceCriteria &criteria
+        /* = RANSACConvergenceCriteria()*/) {
+    if (ransac_n < 3 || (int)corres.size() < ransac_n ||
+        max_correspondence_distance <= 0.0) {
+        return {};
+    }
+
+    geometry::KDTreeFlann kdtree(target);
+    std::atomic<int> est_k_global = criteria.max_iteration_;
+    std::atomic<int> total_validation = 0;
+    utility::random::UniformIntGenerator<int> rand_gen(0, corres.size() - 1);
+    RANSACCorrespondenceReduction reducer(source, target, corres, estimation,
+        checkers, kdtree, est_k_global, total_validation, rand_gen,
+        max_correspondence_distance, ransac_n, criteria.confidence_);
+    tbb::parallel_reduce(tbb::blocked_range<int>(
+        0, criteria.max_iteration_, utility::DefaultGrainSizeTBB()), reducer);
+    auto best_result = std::move(reducer.best_result);
     utility::LogDebug(
             "RANSAC exits after {:d} validations. Best inlier ratio {:e}, "
             "RMSE {:e}",
@@ -305,6 +372,53 @@ RegistrationResult RegistrationRANSACBasedOnFeatureMatching(
             ransac_n, checkers, criteria);
 }
 
+struct InformationMatrixReducer {
+    // Globals
+    const CorrespondenceSet& corres;
+    const geometry::PointCloud& target;
+    // Locals
+    Eigen::Matrix6d GTG;
+
+    InformationMatrixReducer(const CorrespondenceSet& corres_,
+                             const geometry::PointCloud& target_)
+        : corres(corres_), target(target_), GTG(Eigen::Matrix6d::Zero()) {}
+
+    InformationMatrixReducer(InformationMatrixReducer& o, tbb::split)
+        : corres(o.corres), target(o.target), GTG(Eigen::Matrix6d::Zero()) {}
+
+    void operator()(const tbb::blocked_range<std::size_t>& range) {
+        // write q^*
+        // see http://redwood-data.org/indoor/registration.html
+        // note: I comes first in this implementation
+        Eigen::Vector6d G_r;
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            int t = corres[i](1);
+            double x = target.points_[t](0);
+            double y = target.points_[t](1);
+            double z = target.points_[t](2);
+            G_r.setZero();
+            G_r(1) = z;
+            G_r(2) = -y;
+            G_r(3) = 1.0;
+            GTG.noalias() += G_r * G_r.transpose();
+            G_r.setZero();
+            G_r(0) = -z;
+            G_r(2) = x;
+            G_r(4) = 1.0;
+            GTG.noalias() += G_r * G_r.transpose();
+            G_r.setZero();
+            G_r(0) = y;
+            G_r(1) = -x;
+            G_r(5) = 1.0;
+            GTG.noalias() += G_r * G_r.transpose();
+        }
+    }
+
+    void join(InformationMatrixReducer& other) {
+        GTG += other.GTG;
+    }
+};
+
 Eigen::Matrix6d GetInformationMatrixFromPointClouds(
         const geometry::PointCloud &source,
         const geometry::PointCloud &target,
@@ -320,40 +434,13 @@ Eigen::Matrix6d GetInformationMatrixFromPointClouds(
             pcd, target, target_kdtree, max_correspondence_distance,
             transformation);
 
-    // write q^*
-    // see http://redwood-data.org/indoor/registration.html
-    // note: I comes first in this implementation
-    Eigen::Matrix6d GTG = Eigen::Matrix6d::Zero();
-#pragma omp parallel
-    {
-        Eigen::Matrix6d GTG_private = Eigen::Matrix6d::Zero();
-        Eigen::Vector6d G_r_private = Eigen::Vector6d::Zero();
-#pragma omp for nowait
-        for (int c = 0; c < int(result.correspondence_set_.size()); c++) {
-            int t = result.correspondence_set_[c](1);
-            double x = target.points_[t](0);
-            double y = target.points_[t](1);
-            double z = target.points_[t](2);
-            G_r_private.setZero();
-            G_r_private(1) = z;
-            G_r_private(2) = -y;
-            G_r_private(3) = 1.0;
-            GTG_private.noalias() += G_r_private * G_r_private.transpose();
-            G_r_private.setZero();
-            G_r_private(0) = -z;
-            G_r_private(2) = x;
-            G_r_private(4) = 1.0;
-            GTG_private.noalias() += G_r_private * G_r_private.transpose();
-            G_r_private.setZero();
-            G_r_private(0) = y;
-            G_r_private(1) = -x;
-            G_r_private(5) = 1.0;
-            GTG_private.noalias() += G_r_private * G_r_private.transpose();
-        }
-#pragma omp critical(GetInformationMatrixFromPointClouds)
-        { GTG += GTG_private; }
-    }
-    return GTG;
+    InformationMatrixReducer reducer(result.correspondence_set_, target);
+    tbb::parallel_reduce(
+            tbb::blocked_range<std::size_t>(0,
+                result.correspondence_set_.size(),
+                utility::DefaultGrainSizeTBB()),
+            reducer);
+    return std::move(reducer.GTG);
 }
 
 }  // namespace registration
diff --git a/cpp/open3d/pipelines/registration/Registration.h b/cpp/open3d/pipelines/registration/Registration.h
index 276ef6b64e2..6eeabfdf1c7 100644
--- a/cpp/open3d/pipelines/registration/Registration.h
+++ b/cpp/open3d/pipelines/registration/Registration.h
@@ -109,6 +109,11 @@ class RegistrationResult {
                                              inlier_rmse_ < other.inlier_rmse_);
     }
 
+    RegistrationResult(const RegistrationResult&) = default;
+    RegistrationResult(RegistrationResult&&) = default;
+    RegistrationResult& operator=(const RegistrationResult&) = default;
+    RegistrationResult& operator=(RegistrationResult&&) = default;
+
 public:
     /// The estimated transformation matrix.
     Eigen::Matrix4d_u transformation_;
diff --git a/cpp/open3d/t/geometry/PointCloud.cpp b/cpp/open3d/t/geometry/PointCloud.cpp
index b345e0ea985..df3ad04fcaa 100644
--- a/cpp/open3d/t/geometry/PointCloud.cpp
+++ b/cpp/open3d/t/geometry/PointCloud.cpp
@@ -369,9 +369,9 @@ PointCloud PointCloud::RandomDownSample(double sampling_ratio) const {
     std::vector<int64_t> indices(length);
     std::iota(std::begin(indices), std::end(indices), 0);
     {
-        std::lock_guard<std::mutex> lock(*utility::random::GetMutex());
+        tbb::spin_mutex::scoped_lock lock(utility::random::GetMutex());
         std::shuffle(indices.begin(), indices.end(),
-                     *utility::random::GetEngine());
+                     utility::random::GetEngine());
     }
 
     const int sample_size = sampling_ratio * length;
diff --git a/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp b/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp
index 2fdc949fda3..9fea392807d 100644
--- a/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp
+++ b/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp
@@ -7,6 +7,9 @@
 
 #include "open3d/t/geometry/kernel/PointCloudImpl.h"
 
+#include <tbb/spin_mutex.h>
+#include <tbb/parallel_for.h>
+
 namespace open3d {
 namespace t {
 namespace geometry {
@@ -26,8 +29,6 @@ void ProjectCPU(
         float depth_max) {
     const bool has_colors = image_colors.has_value();
 
-    int64_t n = points.GetLength();
-
     const float* points_ptr = points.GetDataPtr<float>();
     const float* point_colors_ptr =
             has_colors ? colors.value().get().GetDataPtr<float>() : nullptr;
@@ -40,7 +41,9 @@ void ProjectCPU(
         color_indexer = NDArrayIndexer(image_colors.value().get(), 2);
     }
 
-    core::ParallelFor(core::Device("CPU:0"), n, [&](int64_t workload_idx) {
+    tbb::spin_mutex mtx;
+    core::ParallelFor(core::Device("CPU:0"), points.GetLength(),
+            [&](int64_t workload_idx) {
         float x = points_ptr[3 * workload_idx + 0];
         float y = points_ptr[3 * workload_idx + 1];
         float z = points_ptr[3 * workload_idx + 2];
@@ -60,10 +63,8 @@ void ProjectCPU(
         float* depth_ptr = depth_indexer.GetDataPtr<float>(
                 static_cast<int64_t>(u), static_cast<int64_t>(v));
         float d = zc * depth_scale;
-        // TODO: this can be wrong if ParallelFor is not implemented with
-        // OpenMP.
-#pragma omp critical(ProjectCPU)
         {
+            tbb::spin_mutex::scoped_lock lock(mtx);
             if (*depth_ptr == 0 || *depth_ptr >= d) {
                 *depth_ptr = d;
 
diff --git a/cpp/open3d/t/geometry/kernel/UVUnwrapping.cpp b/cpp/open3d/t/geometry/kernel/UVUnwrapping.cpp
index 866e620d6bf..e3e23ab4813 100644
--- a/cpp/open3d/t/geometry/kernel/UVUnwrapping.cpp
+++ b/cpp/open3d/t/geometry/kernel/UVUnwrapping.cpp
@@ -173,6 +173,7 @@ std::tuple<float, int, int> ComputeUVAtlas(TriangleMesh& mesh,
         mesh_partitions.emplace_back(mesh_tmp);
     }
 
+    utility::LogDebug("Partitioning");
     std::vector<UVAtlasPartitionOutput> uvatlas_partitions(parallel_partitions);
 
     // By default UVAtlas uses a fast mode if there are more than 25k faces.
@@ -202,6 +203,7 @@ std::tuple<float, int, int> ComputeUVAtlas(TriangleMesh& mesh,
         LoopFn(tbb::blocked_range<size_t>(0, parallel_partitions));
     }
 
+    utility::LogDebug("Combining partitions");
     // merge outputs for the packing algorithm
     UVAtlasPartitionOutput& combined_output = uvatlas_partitions.front();
     for (int i = 1; i < parallel_partitions; ++i) {
@@ -246,6 +248,7 @@ std::tuple<float, int, int> ComputeUVAtlas(TriangleMesh& mesh,
         output = UVAtlasPartitionOutput();
     }
 
+    utility::LogDebug("Packing UVAtlas");
     HRESULT hr = UVAtlasPack(combined_output.vb, combined_output.ib,
                              DXGI_FORMAT_R32_UINT, width, height, gutter,
                              combined_output.partition_result_adjacency,
diff --git a/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h b/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
index dd0fb9dc148..35f78c1539d 100644
--- a/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
+++ b/cpp/open3d/t/geometry/kernel/VoxelBlockGridImpl.h
@@ -8,6 +8,10 @@
 #include <atomic>
 #include <cmath>
 
+#if !defined(__CUDACC__)
+#include <tbb/spin_mutex.h>
+#endif
+
 #include "open3d/core/Dispatch.h"
 #include "open3d/core/Dtype.h"
 #include "open3d/core/MemoryManager.h"
@@ -346,7 +350,7 @@ void EstimateRangeCPU
     std::atomic<int>* count_ptr = &count_atomic;
 #endif
 
-#ifndef __CUDACC__
+#if !defined(__CUDACC__)
     using std::max;
     using std::min;
 #endif
@@ -463,10 +467,17 @@ void EstimateRangeCPU
                           range_ptr[1] = depth_min;
                       });
 
+#if !defined(__CUDACC__)
+    tbb::spin_mutex estimate_range_mutex;
+    tbb::profiling::set_name(estimate_range_mutex, "EstimateRangeCPU");
+#define LOCAL_LAMBDA_CAPTURE =, &estimate_range_mutex
+#else
+#defined LOCAL_LAMBDA_CAPTURE =
+#endif
     // Pass 1: iterate over rendering fragment array, fill-in range
     core::ParallelFor(
             block_keys.GetDevice(), frag_count * fragment_size * fragment_size,
-            [=] OPEN3D_DEVICE(int64_t workload_idx) {
+            [LOCAL_LAMBDA_CAPTURE] OPEN3D_DEVICE(int64_t workload_idx) {
                 int frag_idx = workload_idx / (fragment_size * fragment_size);
                 int local_idx = workload_idx % (fragment_size * fragment_size);
                 int dv = local_idx / fragment_size;
@@ -486,12 +497,12 @@ void EstimateRangeCPU
                 float z_min = frag_ptr[0];
                 float z_max = frag_ptr[1];
                 float* range_ptr = range_map_indexer.GetDataPtr<float>(u, v);
-#ifdef __CUDACC__
+#if defined(__CUDACC__)
                 atomicMinf(&(range_ptr[0]), z_min);
                 atomicMaxf(&(range_ptr[1]), z_max);
 #else
-#pragma omp critical(EstimateRangeCPU)
                 {
+                    tbb::spin_mutex::scoped_lock lock(estimate_range_mutex);
                     range_ptr[0] = min(z_min, range_ptr[0]);
                     range_ptr[1] = max(z_max, range_ptr[1]);
                 }
@@ -501,6 +512,7 @@ void EstimateRangeCPU
 #if defined(__CUDACC__)
     core::cuda::Synchronize();
 #endif
+#undef LOCAL_LAMBDA_CAPTURE
 
     if (needed_frag_count != frag_count) {
         utility::LogInfo("Reallocating {} fragments for EstimateRange (was {})",
diff --git a/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp b/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp
index 4115a42e12e..e135caea14d 100644
--- a/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp
+++ b/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp
@@ -9,6 +9,8 @@
 
 #include <string>
 
+#include <tbb/parallel_invoke.h>
+
 #include "open3d/io/IJsonConvertibleIO.h"
 #include "open3d/io/ImageIO.h"
 #include "open3d/t/io/sensor/realsense/RSBagReader.h"
@@ -53,25 +55,20 @@ void RGBDVideoReader::SaveFrames(const std::string &frame_path,
     open3d::geometry::Image im_color, im_depth;
     for (auto tim_rgbd = NextFrame(); !IsEOF() && GetTimestamp() < end_time;
          ++idx, tim_rgbd = NextFrame())
-#pragma omp parallel sections
-    {
-#pragma omp section
-        {
+    tbb::parallel_invoke([&](){
             im_color = tim_rgbd.color_.ToLegacy();
             auto color_file =
                     fmt::format("{0}/color/{1:05d}.jpg", frame_path, idx);
             open3d::io::WriteImage(color_file, im_color);
             utility::LogDebug("Written color image to {}", color_file);
-        }
-#pragma omp section
-        {
+        }, [&](){
             im_depth = tim_rgbd.depth_.ToLegacy();
             auto depth_file =
                     fmt::format("{0}/depth/{1:05d}.png", frame_path, idx);
             open3d::io::WriteImage(depth_file, im_depth);
             utility::LogDebug("Written depth image to {}", depth_file);
-        }
-    }
+    });
+
     utility::LogInfo("Written {} depth and color images to {}/{{depth,color}}/",
                      idx, frame_path);
 }
diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
index b541082e2d2..23f3fced488 100644
--- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
+++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
@@ -5,6 +5,10 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#if !defined(__CUDACC__)
+#include <tbb/spin_mutex.h>
+#endif
+
 #include "open3d/core/linalg/kernel/SVD3x3.h"
 #include "open3d/t/geometry/kernel/GeometryIndexer.h"
 #include "open3d/t/pipelines/kernel/FillInLinearSystem.h"
@@ -49,8 +53,18 @@ void FillInRigidAlignmentTermCPU
     const float *Ri_normal_ps_ptr =
             static_cast<const float *>(Ri_normal_ps.GetDataPtr());
 
+#if !defined(__CUDACC__)
+    tbb::spin_mutex fill_alignment_mutex;
+    tbb::profiling::set_name(fill_alignment_mutex,
+        "FillInRigidAlignmentTermCPU");
+#define LOCAL_LAMBDA_CAPTURE =, &fill_alignment_mutex
+#else
+#defined LOCAL_LAMBDA_CAPTURE =
+#endif
+
     core::ParallelFor(
-            AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
+            AtA.GetDevice(), n, [LOCAL_LAMBDA_CAPTURE]
+                    OPEN3D_DEVICE(int64_t workload_idx) {
                 const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
                 const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
                 const float *normal_p_prime =
@@ -86,8 +100,8 @@ void FillInRigidAlignmentTermCPU
                 }
                 atomicAdd(residual_ptr, r * r);
 #else
-#pragma omp critical(FillInRigidAlignmentTermCPU)
         {
+            tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
             for (int i_local = 0; i_local < 12; ++i_local) {
                 for (int j_local = 0; j_local < 12; ++j_local) {
                     AtA_local_ptr[i_local * 12 + j_local]
@@ -98,7 +112,8 @@ void FillInRigidAlignmentTermCPU
             *residual_ptr += r * r;
         }
 #endif
-            });
+    });
+#undef LOCAL_LAMBDA_CAPTURE
 
     // Then fill-in the large linear system
     std::vector<int64_t> indices_vec(12);
@@ -182,8 +197,18 @@ void FillInSLACAlignmentTermCPU
     const float *cgrid_ratio_qs_ptr =
             static_cast<const float *>(cgrid_ratio_qs.GetDataPtr());
 
+#if !defined(__CUDACC__)
+    tbb::spin_mutex fill_alignment_mutex;
+    tbb::profiling::set_name(fill_alignment_mutex,
+        "FillInSLACAlignmentTermCPU");
+#define LOCAL_LAMBDA_CAPTURE =, &fill_alignment_mutex
+#else
+#defined LOCAL_LAMBDA_CAPTURE =
+#endif
+
     core::ParallelFor(
-            AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
+            AtA.GetDevice(), n, [LOCAL_LAMBDA_CAPTURE]
+                    OPEN3D_DEVICE(int64_t workload_idx) {
                 const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
                 const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
                 const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
@@ -259,8 +284,8 @@ void FillInSLACAlignmentTermCPU
                 }
                 atomicAdd(residual_ptr, r * r);
 #else
-#pragma omp critical(FillInSLACAlignmentTermCPU)
         {
+            tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
             for (int ki = 0; ki < 60; ++ki) {
                 for (int kj = 0; kj < 60; ++kj) {
                     AtA_ptr[idx[ki] * n_vars + idx[kj]]
@@ -271,7 +296,8 @@ void FillInSLACAlignmentTermCPU
             *residual_ptr += r * r;
         }
 #endif
-            });
+    });
+#undef LOCAL_LAMBDA_CAPTURE
 }
 
 #if defined(__CUDACC__)
@@ -309,8 +335,17 @@ void FillInSLACRegularizerTermCPU
     const float *positions_curr_ptr =
             static_cast<const float *>(positions_curr.GetDataPtr());
 
-    core::ParallelFor(
-            AtA.GetDevice(), n, [=] OPEN3D_DEVICE(int64_t workload_idx) {
+#if !defined(__CUDACC__)
+    tbb::spin_mutex fill_alignment_mutex;
+    tbb::profiling::set_name(fill_alignment_mutex,
+        "FillInSLACRegularizerTermCPU");
+#define LOCAL_LAMBDA_CAPTURE =, &fill_alignment_mutex
+#else
+#defined LOCAL_LAMBDA_CAPTURE =
+#endif
+
+    core::ParallelFor(AtA.GetDevice(), n, [LOCAL_LAMBDA_CAPTURE]
+                    OPEN3D_DEVICE(int64_t workload_idx) {
                 // Enumerate 6 neighbors
                 int idx_i = grid_idx_ptr[workload_idx];
 
@@ -442,8 +477,8 @@ void FillInSLACRegularizerTermCPU
                                       -weight * local_r[axis]);
                         }
 #else
-#pragma omp critical(FillInSLACRegularizerTermCPU)
                 {
+                    tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
                     // Update residual
                     *residual_ptr += weight * (local_r[0] * local_r[0] +
                                                local_r[1] * local_r[1] +
@@ -469,7 +504,8 @@ void FillInSLACRegularizerTermCPU
 #endif
                     }
                 }
-            });
+    });
+#undef LOCAL_LAMBDA_CAPTURE
 }
 }  // namespace kernel
 }  // namespace pipelines
diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
index e140c455e96..8ce0634653d 100644
--- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
+++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
@@ -45,52 +45,44 @@ void ComputeOdometryInformationMatrixCPU(const core::Tensor& source_vertex_map,
 
     std::vector<float> A_1x21(21, 0.0);
 
-#ifdef _MSC_VER
     std::vector<float> zeros_21(21, 0.0);
     A_1x21 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_21,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     workload_idx++) {
-#else
-    float* A_reduction = A_1x21.data();
-#pragma omp parallel for reduction(+ : A_reduction[:21]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
-#endif
-                    int y = workload_idx / cols;
-                    int x = workload_idx % cols;
-
-                    float J_x[6], J_y[6], J_z[6];
-                    float rx, ry, rz;
-
-                    bool valid = GetJacobianPointToPoint(
-                            x, y, square_dist_thr, source_vertex_indexer,
-                            target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry,
-                            rz);
-
-                    if (valid) {
-                        for (int i = 0, j = 0; j < 6; j++) {
-                            for (int k = 0; k <= j; k++) {
-                                A_reduction[i] += J_x[j] * J_x[k];
-                                A_reduction[i] += J_y[j] * J_y[k];
-                                A_reduction[i] += J_z[j] * J_z[k];
-                                i++;
-                            }
-                        }
+        for (int workload_idx = r.begin(); workload_idx < r.end();
+             workload_idx++) {
+            int y = workload_idx / cols;
+            int x = workload_idx % cols;
+
+            float J_x[6], J_y[6], J_z[6];
+            float rx, ry, rz;
+
+            bool valid = GetJacobianPointToPoint(
+                    x, y, square_dist_thr, source_vertex_indexer,
+                    target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry,
+                    rz);
+
+            if (valid) {
+                for (int i = 0, j = 0; j < 6; j++) {
+                    for (int k = 0; k <= j; k++) {
+                        A_reduction[i] += J_x[j] * J_x[k];
+                        A_reduction[i] += J_y[j] * J_y[k];
+                        A_reduction[i] += J_z[j] * J_z[k];
+                        i++;
                     }
                 }
-#ifdef _MSC_VER
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<float> a, std::vector<float> b) {
-                std::vector<float> result(21);
-                for (int j = 0; j < 21; j++) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<float> a, std::vector<float> b) {
+        std::vector<float> result(21);
+        for (int j = 0; j < 21; j++) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
     core::Tensor A_reduction_tensor(A_1x21, {21}, core::Float32, device);
     float* reduction_ptr = A_reduction_tensor.GetDataPtr<float>();
 
@@ -145,59 +137,52 @@ void ComputeOdometryResultIntensityCPU(
 
     std::vector<float> A_1x29(29, 0.0);
 
-#ifdef _MSC_VER
     std::vector<float> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     workload_idx++) {
-#else
-    float* A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
-#endif
-                    int y = workload_idx / cols;
-                    int x = workload_idx % cols;
-
-                    float J_I[6];
-                    float r_I;
-
-                    bool valid = GetJacobianIntensity(
-                            x, y, depth_outlier_trunc, source_depth_indexer,
-                            target_depth_indexer, source_intensity_indexer,
-                            target_intensity_indexer,
-                            target_intensity_dx_indexer,
-                            target_intensity_dy_indexer, source_vertex_indexer,
-                            ti, J_I, r_I);
-
-                    if (valid) {
-                        float d_huber = HuberDeriv(r_I, intensity_huber_delta);
-                        float r_huber = HuberLoss(r_I, intensity_huber_delta);
-
-                        for (int i = 0, j = 0; j < 6; j++) {
-                            for (int k = 0; k <= j; k++) {
-                                A_reduction[i] += J_I[j] * J_I[k];
-                                i++;
-                            }
-                            A_reduction[21 + j] += J_I[j] * d_huber;
-                        }
-                        A_reduction[27] += r_huber;
-                        A_reduction[28] += 1;
+        for (int workload_idx = r.begin(); workload_idx < r.end();
+             workload_idx++) {
+            int y = workload_idx / cols;
+            int x = workload_idx % cols;
+
+            float J_I[6];
+            float r_I;
+
+            bool valid = GetJacobianIntensity(
+                    x, y, depth_outlier_trunc, source_depth_indexer,
+                    target_depth_indexer, source_intensity_indexer,
+                    target_intensity_indexer,
+                    target_intensity_dx_indexer,
+                    target_intensity_dy_indexer, source_vertex_indexer,
+                    ti, J_I, r_I);
+
+            if (valid) {
+                float d_huber = HuberDeriv(r_I, intensity_huber_delta);
+                float r_huber = HuberLoss(r_I, intensity_huber_delta);
+
+                for (int i = 0, j = 0; j < 6; j++) {
+                    for (int k = 0; k <= j; k++) {
+                        A_reduction[i] += J_I[j] * J_I[k];
+                        i++;
                     }
+                    A_reduction[21 + j] += J_I[j] * d_huber;
                 }
-#ifdef _MSC_VER
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<float> a, std::vector<float> b) {
-                std::vector<float> result(29);
-                for (int j = 0; j < 29; j++) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+                A_reduction[27] += r_huber;
+                A_reduction[28] += 1;
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<float> a, std::vector<float> b) {
+        std::vector<float> result(29);
+        for (int j = 0; j < 29; j++) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
+
     core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
 }
@@ -245,66 +230,58 @@ void ComputeOdometryResultHybridCPU(const core::Tensor& source_depth,
 
     std::vector<float> A_1x29(29, 0.0);
 
-#ifdef _MSC_VER
     std::vector<float> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     workload_idx++) {
-#else
-    float* A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
-#endif
-                    int y = workload_idx / cols;
-                    int x = workload_idx % cols;
-
-                    float J_I[6], J_D[6];
-                    float r_I, r_D;
-
-                    bool valid = GetJacobianHybrid(
-                            x, y, depth_outlier_trunc, source_depth_indexer,
-                            target_depth_indexer, source_intensity_indexer,
-                            target_intensity_indexer, target_depth_dx_indexer,
-                            target_depth_dy_indexer,
-                            target_intensity_dx_indexer,
-                            target_intensity_dy_indexer, source_vertex_indexer,
-                            ti, J_I, J_D, r_I, r_D);
-
-                    if (valid) {
-                        float d_huber_I =
-                                HuberDeriv(r_I, intensity_huber_delta);
-                        float d_huber_D = HuberDeriv(r_D, depth_huber_delta);
-
-                        float r_huber_I = HuberLoss(r_I, intensity_huber_delta);
-                        float r_huber_D = HuberLoss(r_D, depth_huber_delta);
-
-                        for (int i = 0, j = 0; j < 6; j++) {
-                            for (int k = 0; k <= j; k++) {
-                                A_reduction[i] +=
-                                        J_I[j] * J_I[k] + J_D[j] * J_D[k];
-                                i++;
-                            }
-                            A_reduction[21 + j] +=
-                                    J_I[j] * d_huber_I + J_D[j] * d_huber_D;
-                        }
-                        A_reduction[27] += r_huber_I + r_huber_D;
-                        A_reduction[28] += 1;
+        for (int workload_idx = r.begin(); workload_idx < r.end();
+             workload_idx++) {
+            int y = workload_idx / cols;
+            int x = workload_idx % cols;
+
+            float J_I[6], J_D[6];
+            float r_I, r_D;
+
+            bool valid = GetJacobianHybrid(
+                    x, y, depth_outlier_trunc, source_depth_indexer,
+                    target_depth_indexer, source_intensity_indexer,
+                    target_intensity_indexer, target_depth_dx_indexer,
+                    target_depth_dy_indexer,
+                    target_intensity_dx_indexer,
+                    target_intensity_dy_indexer, source_vertex_indexer,
+                    ti, J_I, J_D, r_I, r_D);
+
+            if (valid) {
+                float d_huber_I =
+                        HuberDeriv(r_I, intensity_huber_delta);
+                float d_huber_D = HuberDeriv(r_D, depth_huber_delta);
+
+                float r_huber_I = HuberLoss(r_I, intensity_huber_delta);
+                float r_huber_D = HuberLoss(r_D, depth_huber_delta);
+
+                for (int i = 0, j = 0; j < 6; j++) {
+                    for (int k = 0; k <= j; k++) {
+                        A_reduction[i] +=
+                                J_I[j] * J_I[k] + J_D[j] * J_D[k];
+                        i++;
                     }
+                    A_reduction[21 + j] +=
+                            J_I[j] * d_huber_I + J_D[j] * d_huber_D;
                 }
-#ifdef _MSC_VER
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<float> a, std::vector<float> b) {
-                std::vector<float> result(29);
-                for (int j = 0; j < 29; j++) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+                A_reduction[27] += r_huber_I + r_huber_D;
+                A_reduction[28] += 1;
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<float> a, std::vector<float> b) {
+        std::vector<float> result(29);
+        for (int j = 0; j < 29; j++) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
     core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
 }
@@ -337,55 +314,48 @@ void ComputeOdometryResultPointToPlaneCPU(
 
     std::vector<float> A_1x29(29, 0.0);
 
-#ifdef _MSC_VER
     std::vector<float> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     workload_idx++) {
-#else
-    float* A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
-#endif
-                    int y = workload_idx / cols;
-                    int x = workload_idx % cols;
-
-                    float J_ij[6];
-                    float r;
-
-                    bool valid = GetJacobianPointToPlane(
-                            x, y, depth_outlier_trunc, source_vertex_indexer,
-                            target_vertex_indexer, target_normal_indexer, ti,
-                            J_ij, r);
-
-                    if (valid) {
-                        float d_huber = HuberDeriv(r, depth_huber_delta);
-                        float r_huber = HuberLoss(r, depth_huber_delta);
-                        for (int i = 0, j = 0; j < 6; j++) {
-                            for (int k = 0; k <= j; k++) {
-                                A_reduction[i] += J_ij[j] * J_ij[k];
-                                i++;
-                            }
-                            A_reduction[21 + j] += J_ij[j] * d_huber;
-                        }
-                        A_reduction[27] += r_huber;
-                        A_reduction[28] += 1;
+        for (int workload_idx = r.begin(); workload_idx < r.end();
+             workload_idx++) {
+            int y = workload_idx / cols;
+            int x = workload_idx % cols;
+
+            float J_ij[6];
+            float r;
+
+            bool valid = GetJacobianPointToPlane(
+                    x, y, depth_outlier_trunc, source_vertex_indexer,
+                    target_vertex_indexer, target_normal_indexer, ti,
+                    J_ij, r);
+
+            if (valid) {
+                float d_huber = HuberDeriv(r, depth_huber_delta);
+                float r_huber = HuberLoss(r, depth_huber_delta);
+                for (int i = 0, j = 0; j < 6; j++) {
+                    for (int k = 0; k <= j; k++) {
+                        A_reduction[i] += J_ij[j] * J_ij[k];
+                        i++;
                     }
+                    A_reduction[21 + j] += J_ij[j] * d_huber;
                 }
-#ifdef _MSC_VER
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<float> a, std::vector<float> b) {
-                std::vector<float> result(29);
-                for (int j = 0; j < 29; j++) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+                A_reduction[27] += r_huber;
+                A_reduction[28] += 1;
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<float> a, std::vector<float> b) {
+        std::vector<float> result(29);
+        for (int j = 0; j < 29; j++) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
+
     core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
 }
diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
index d2ea8b584ab..ba3b440c932 100644
--- a/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
+++ b/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
@@ -41,54 +41,47 @@ static void ComputePosePointToPlaneKernelCPU(
     // and 28th as inlier_count.
     std::vector<scalar_t> A_1x29(29, 0.0);
 
-#ifdef _WIN32
     std::vector<scalar_t> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
-            [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     ++workload_idx) {
-#else
-    scalar_t *A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; ++workload_idx) {
-#endif
-                    scalar_t J_ij[6];
-                    scalar_t r = 0;
-
-                    bool valid = kernel::GetJacobianPointToPlane<scalar_t>(
-                            workload_idx, source_points_ptr, target_points_ptr,
-                            target_normals_ptr, correspondence_indices, J_ij,
-                            r);
-
-                    scalar_t w = GetWeightFromRobustKernel(r);
-
-                    if (valid) {
-                        // Dump J, r into JtJ and Jtr
-                        int i = 0;
-                        for (int j = 0; j < 6; ++j) {
-                            for (int k = 0; k <= j; ++k) {
-                                A_reduction[i] += J_ij[j] * w * J_ij[k];
-                                ++i;
-                            }
-                            A_reduction[21 + j] += J_ij[j] * w * r;
-                        }
-                        A_reduction[27] += r;
-                        A_reduction[28] += 1;
+            [&](const tbb::blocked_range<int>& range,
+                    std::vector<scalar_t> A_reduction) {
+        for (int workload_idx = range.begin(); workload_idx < range.end();
+             ++workload_idx) {
+            scalar_t J_ij[6];
+            scalar_t r = 0;
+
+            bool valid = kernel::GetJacobianPointToPlane<scalar_t>(
+                    workload_idx, source_points_ptr, target_points_ptr,
+                    target_normals_ptr, correspondence_indices, J_ij,
+                    r);
+
+            scalar_t w = GetWeightFromRobustKernel(r);
+
+            if (valid) {
+                // Dump J, r into JtJ and Jtr
+                int i = 0;
+                for (int j = 0; j < 6; ++j) {
+                    for (int k = 0; k <= j; ++k) {
+                        A_reduction[i] += J_ij[j] * w * J_ij[k];
+                        ++i;
                     }
+                    A_reduction[21 + j] += J_ij[j] * w * r;
                 }
-#ifdef _WIN32
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
-                std::vector<scalar_t> result(29);
-                for (int j = 0; j < 29; ++j) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+                A_reduction[27] += r;
+                A_reduction[28] += 1;
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
+        std::vector<scalar_t> result(29);
+        for (int j = 0; j < 29; ++j) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
 
     for (int i = 0; i < 29; ++i) {
         global_sum[i] = A_1x29[i];
@@ -147,59 +140,52 @@ static void ComputePoseColoredICPKernelCPU(
     // and 28th as inlier_count.
     std::vector<scalar_t> A_1x29(29, 0.0);
 
-#ifdef _WIN32
     std::vector<scalar_t> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
-            [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     ++workload_idx) {
-#else
-    scalar_t *A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; ++workload_idx) {
-#endif
-                    scalar_t J_G[6] = {0}, J_I[6] = {0};
-                    scalar_t r_G = 0, r_I = 0;
-
-                    bool valid = GetJacobianColoredICP<scalar_t>(
-                            workload_idx, source_points_ptr, source_colors_ptr,
-                            target_points_ptr, target_normals_ptr,
-                            target_colors_ptr, target_color_gradients_ptr,
-                            correspondence_indices, sqrt_lambda_geometric,
-                            sqrt_lambda_photometric, J_G, J_I, r_G, r_I);
-
-                    scalar_t w_G = GetWeightFromRobustKernel(r_G);
-                    scalar_t w_I = GetWeightFromRobustKernel(r_I);
-
-                    if (valid) {
-                        // Dump J, r into JtJ and Jtr
-                        int i = 0;
-                        for (int j = 0; j < 6; ++j) {
-                            for (int k = 0; k <= j; ++k) {
-                                A_reduction[i] += J_G[j] * w_G * J_G[k] +
-                                                  J_I[j] * w_I * J_I[k];
-                                ++i;
-                            }
-                            A_reduction[21 + j] +=
-                                    J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
-                        }
-                        A_reduction[27] += r_G * r_G + r_I * r_I;
-                        A_reduction[28] += 1;
+            [&](const tbb::blocked_range<int>& r,
+                std::vector<scalar_t> A_reduction) {
+        for (int workload_idx = r.begin(); workload_idx < r.end();
+             ++workload_idx) {
+            scalar_t J_G[6] = {0}, J_I[6] = {0};
+            scalar_t r_G = 0, r_I = 0;
+
+            bool valid = GetJacobianColoredICP<scalar_t>(
+                    workload_idx, source_points_ptr, source_colors_ptr,
+                    target_points_ptr, target_normals_ptr,
+                    target_colors_ptr, target_color_gradients_ptr,
+                    correspondence_indices, sqrt_lambda_geometric,
+                    sqrt_lambda_photometric, J_G, J_I, r_G, r_I);
+
+            scalar_t w_G = GetWeightFromRobustKernel(r_G);
+            scalar_t w_I = GetWeightFromRobustKernel(r_I);
+
+            if (valid) {
+                // Dump J, r into JtJ and Jtr
+                int i = 0;
+                for (int j = 0; j < 6; ++j) {
+                    for (int k = 0; k <= j; ++k) {
+                        A_reduction[i] += J_G[j] * w_G * J_G[k] +
+                                          J_I[j] * w_I * J_I[k];
+                        ++i;
                     }
+                    A_reduction[21 + j] +=
+                            J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
                 }
-#ifdef _WIN32
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
-                std::vector<scalar_t> result(29);
-                for (int j = 0; j < 29; ++j) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+                A_reduction[27] += r_G * r_G + r_I * r_I;
+                A_reduction[28] += 1;
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
+        std::vector<scalar_t> result(29);
+        for (int j = 0; j < 29; ++j) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
 
     for (int i = 0; i < 29; ++i) {
         global_sum[i] = A_1x29[i];
@@ -585,48 +571,40 @@ void ComputeInformationMatrixKernelCPU(const scalar_t *target_points_ptr,
     // As, AtA is a symmetric matrix, we only need 21 elements instead of 36.
     std::vector<scalar_t> AtA(21, 0.0);
 
-#ifdef _WIN32
     std::vector<scalar_t> zeros_21(21, 0.0);
     AtA = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_21,
             [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
-                for (int workload_idx = r.begin(); workload_idx < r.end();
-                     ++workload_idx) {
-#else
-    scalar_t *A_reduction = AtA.data();
-#pragma omp parallel for reduction(+ : A_reduction[:21]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; workload_idx++) {
-#endif
-                    scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};
-
-                    bool valid = GetInformationJacobians<scalar_t>(
-                            workload_idx, target_points_ptr,
-                            correspondence_indices, J_x, J_y, J_z);
-
-                    if (valid) {
-                        int i = 0;
-                        for (int j = 0; j < 6; ++j) {
-                            for (int k = 0; k <= j; ++k) {
-                                A_reduction[i] += J_x[j] * J_x[k] +
-                                                  J_y[j] * J_y[k] +
-                                                  J_z[j] * J_z[k];
-                                ++i;
-                            }
-                        }
+        for (int workload_idx = r.begin(); workload_idx < r.end();
+             ++workload_idx) {
+            scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};
+
+            bool valid = GetInformationJacobians<scalar_t>(
+                    workload_idx, target_points_ptr,
+                    correspondence_indices, J_x, J_y, J_z);
+
+            if (valid) {
+                int i = 0;
+                for (int j = 0; j < 6; ++j) {
+                    for (int k = 0; k <= j; ++k) {
+                        A_reduction[i] += J_x[j] * J_x[k] +
+                                          J_y[j] * J_y[k] +
+                                          J_z[j] * J_z[k];
+                        ++i;
                     }
                 }
-#ifdef _WIN32
-                return A_reduction;
-            },
-            // TBB: Defining reduction operation.
-            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
-                std::vector<scalar_t> result(21);
-                for (int j = 0; j < 21; ++j) {
-                    result[j] = a[j] + b[j];
-                }
-                return result;
-            });
-#endif
+            }
+        }
+        return A_reduction;
+    },
+    // TBB: Defining reduction operation.
+    [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
+        std::vector<scalar_t> result(21);
+        for (int j = 0; j < 21; ++j) {
+            result[j] = a[j] + b[j];
+        }
+        return result;
+    });
 
     for (int i = 0; i < 21; ++i) {
         global_sum[i] = AtA[i];
diff --git a/cpp/open3d/t/pipelines/registration/Feature.cpp b/cpp/open3d/t/pipelines/registration/Feature.cpp
index d20e1a1b58c..56cd9a76e81 100644
--- a/cpp/open3d/t/pipelines/registration/Feature.cpp
+++ b/cpp/open3d/t/pipelines/registration/Feature.cpp
@@ -7,6 +7,8 @@
 
 #include "open3d/t/pipelines/registration/Feature.h"
 
+#include <tbb/parallel_for.h>
+
 #include "open3d/core/nns/NearestNeighborSearch.h"
 #include "open3d/t/geometry/PointCloud.h"
 #include "open3d/t/pipelines/kernel/Feature.h"
@@ -100,20 +102,18 @@ core::Tensor CorrespondencesFromFeatures(const core::Tensor &source_features,
     std::array<core::Tensor, 2> features{source_features, target_features};
     std::vector<core::Tensor> corres(num_searches);
 
-    const int kMaxThreads = utility::EstimateMaxThreads();
-    const int kOuterThreads = std::min(kMaxThreads, num_searches);
-    (void)kOuterThreads;  // Avoids compiler warning if OpenMP is disabled
-
     // corres[0]: corres_ij, corres[1]: corres_ji
-#pragma omp parallel for num_threads(kOuterThreads)
-    for (int i = 0; i < num_searches; ++i) {
-        core::nns::NearestNeighborSearch nns(features[1 - i],
-                                             core::Dtype::Int64);
-        nns.KnnIndex();
-        auto result = nns.KnnSearch(features[i], 1);
-
-        corres[i] = result.first.View({-1});
-    }
+    tbb::parallel_for(tbb::blocked_range<int>(0, num_searches, 1),
+            [&](const tbb::blocked_range<int>& range){
+        for (int i = range.begin(); i < range.end(); ++i) {
+            core::nns::NearestNeighborSearch nns(
+                    features[1 - i], core::Dtype::Int64);
+            nns.KnnIndex();
+            auto result = nns.KnnSearch(features[i], 1);
+
+            corres[i] = result.first.View({-1});
+        }
+    });
 
     auto corres_ij = corres[0];
     core::Tensor arange_source =
diff --git a/cpp/open3d/utility/Eigen.cpp b/cpp/open3d/utility/Eigen.cpp
index b3f6fb34476..292834e0050 100644
--- a/cpp/open3d/utility/Eigen.cpp
+++ b/cpp/open3d/utility/Eigen.cpp
@@ -7,10 +7,15 @@
 
 #include "open3d/utility/Eigen.h"
 
+#include <tbb/blocked_range.h>
+#include <tbb/parallel_reduce.h>
+#include <tbb/spin_mutex.h>
+
 #include <Eigen/Geometry>
 #include <Eigen/Sparse>
 
 #include "open3d/utility/Logging.h"
+#include "open3d/utility/Parallel.h"
 
 namespace open3d {
 namespace utility {
@@ -142,48 +147,112 @@ SolveJacobianSystemAndObtainExtrinsicMatrixArray(const Eigen::MatrixXd &JTJ,
     }
 }
 
+// Shared state for both the single valued and vector valued versions
+// This is used instead of lambdas in tbb::parallel reduce
+// to prevent extra allocations of the shared state
 template <typename MatType, typename VecType>
-std::tuple<MatType, VecType, double> ComputeJTJandJTr(
-        std::function<void(int, VecType &, double &, double &)> f,
-        int iteration_num,
-        bool verbose /*=true*/) {
-    MatType JTJ;
-    VecType JTr;
+struct JTJandJTrReduceBodyHelper {
+    // Local data
+    MatType JTJ = MatType::Zero();
+    VecType JTr = VecType::Zero();
     double r2_sum = 0.0;
-    JTJ.setZero();
-    JTr.setZero();
-#pragma omp parallel
-    {
-        MatType JTJ_private;
-        VecType JTr_private;
-        double r2_sum_private = 0.0;
-        JTJ_private.setZero();
-        JTr_private.setZero();
-        VecType J_r;
-        J_r.setZero();
-        double r = 0.0;
-        double w = 0.0;
-#pragma omp for nowait
-        for (int i = 0; i < iteration_num; i++) {
+
+    void join(const JTJandJTrReduceBodyHelper& rhs) {
+        JTJ += rhs.JTJ;
+        JTr += rhs.JTr;
+        r2_sum += rhs.r2_sum;
+    }
+
+    std::tuple<MatType, VecType, double> as_tuple() && {
+        return {std::move(JTJ), std::move(JTr), r2_sum};
+    }
+};
+
+template <typename MatType, typename VecType, bool VectorFunc>
+struct JTJandJTrReduceBody;
+
+// Code specific to the single valued reduction
+template <typename MatType, typename VecType>
+struct JTJandJTrReduceBody<MatType, VecType, false>: public
+        JTJandJTrReduceBodyHelper<MatType, VecType> {
+    using FuncType = std::function<void(int, VecType&, double&, double&)>;
+    // Global data
+    FuncType& f;
+
+    // Local data
+    VecType J_r = VecType::Zero();
+    double r = 0.0;
+    double w = 0.0;
+
+    JTJandJTrReduceBody(FuncType& f_)
+        : JTJandJTrReduceBodyHelper<MatType, VecType>(), f(f_) {}
+
+    JTJandJTrReduceBody(JTJandJTrReduceBody& other, tbb::split sp)
+        : JTJandJTrReduceBodyHelper<MatType, VecType>(), f(other.f) {}
+
+    void operator()(const tbb::blocked_range<int>& range) {
+        for (int i = range.begin(); i < range.end(); ++i) {
             f(i, J_r, r, w);
-            JTJ_private.noalias() += J_r * w * J_r.transpose();
-            JTr_private.noalias() += J_r * w * r;
-            r2_sum_private += r * r;
-        }
-#pragma omp critical(ComputeJTJandJTr)
-        {
-            JTJ += JTJ_private;
-            JTr += JTr_private;
-            r2_sum += r2_sum_private;
+            this->JTJ.noalias() += J_r * w * J_r.transpose();
+            this->JTr.noalias() += J_r * w * r;
+            this->r2_sum += r * r;
         }
     }
+
+};
+
+template <typename MatType, typename VecType>
+std::tuple<MatType, VecType, double> ComputeJTJandJTr(
+        std::function<void(int, VecType &, double &, double &)> f,
+        int iteration_num,
+        bool verbose /*=true*/) {
+    JTJandJTrReduceBody<MatType, VecType, false> reducer(f);
+    tbb::parallel_reduce(tbb::blocked_range<int>(0, iteration_num,
+            utility::DefaultGrainSizeTBB()), reducer);
+    std::tuple<MatType, VecType, double> result = std::move(reducer).as_tuple();
     if (verbose) {
         LogDebug("Residual : {:.2e} (# of elements : {:d})",
-                 r2_sum / (double)iteration_num, iteration_num);
+                 std::get<2>(result) / (double)iteration_num, iteration_num);
     }
-    return std::make_tuple(std::move(JTJ), std::move(JTr), r2_sum);
+    return result;
 }
 
+// Code specific to the vector valued reduction
+template <typename MatType, typename VecType>
+struct JTJandJTrReduceBody<MatType, VecType, true>: public
+        JTJandJTrReduceBodyHelper<MatType, VecType> {
+    using FuncType = std::function<void(int,
+        std::vector<VecType, Eigen::aligned_allocator<VecType>>&,
+        std::vector<double>&, std::vector<double>&)>;
+    // Global data
+    FuncType& f;
+
+    // Local data
+    std::vector<VecType, Eigen::aligned_allocator<VecType>> J_r;
+    std::vector<double> r;
+    std::vector<double> w;
+
+    JTJandJTrReduceBody(FuncType& f_)
+        : JTJandJTrReduceBodyHelper<MatType, VecType>(),
+                f(f_), J_r(), r(), w() {}
+
+    JTJandJTrReduceBody(JTJandJTrReduceBody& other, tbb::split sp)
+        : JTJandJTrReduceBodyHelper<MatType, VecType>(),
+                f(other.f), J_r(), r(), w() {}
+
+    void operator()(const tbb::blocked_range<int>& range) {
+        for (int i = range.begin(); i < range.end(); ++i) {
+            f(i, J_r, r, w);
+            for (int j = 0; j < (int)r.size(); j++) {
+                this->JTJ.noalias() += J_r[j] * w[j] * J_r[j].transpose();
+                this->JTr.noalias() += J_r[j] * w[j] * r[j];
+                this->r2_sum += r[j] * r[j];
+            }
+        }
+    }
+
+};
+
 template <typename MatType, typename VecType>
 std::tuple<MatType, VecType, double> ComputeJTJandJTr(
         std::function<
@@ -193,42 +262,15 @@ std::tuple<MatType, VecType, double> ComputeJTJandJTr(
                      std::vector<double> &)> f,
         int iteration_num,
         bool verbose /*=true*/) {
-    MatType JTJ;
-    VecType JTr;
-    double r2_sum = 0.0;
-    JTJ.setZero();
-    JTr.setZero();
-#pragma omp parallel
-    {
-        MatType JTJ_private;
-        VecType JTr_private;
-        double r2_sum_private = 0.0;
-        JTJ_private.setZero();
-        JTr_private.setZero();
-        std::vector<double> r;
-        std::vector<double> w;
-        std::vector<VecType, Eigen::aligned_allocator<VecType>> J_r;
-#pragma omp for nowait
-        for (int i = 0; i < iteration_num; i++) {
-            f(i, J_r, r, w);
-            for (int j = 0; j < (int)r.size(); j++) {
-                JTJ_private.noalias() += J_r[j] * w[j] * J_r[j].transpose();
-                JTr_private.noalias() += J_r[j] * w[j] * r[j];
-                r2_sum_private += r[j] * r[j];
-            }
-        }
-#pragma omp critical(ComputeJTJandJTr)
-        {
-            JTJ += JTJ_private;
-            JTr += JTr_private;
-            r2_sum += r2_sum_private;
-        }
-    }
+    JTJandJTrReduceBody<MatType, VecType, true> reducer(f);
+    tbb::parallel_reduce(tbb::blocked_range<int>(0, iteration_num,
+            utility::DefaultGrainSizeTBB()), reducer);
+    std::tuple<MatType, VecType, double> result = std::move(reducer).as_tuple();
     if (verbose) {
         LogDebug("Residual : {:.2e} (# of elements : {:d})",
-                 r2_sum / (double)iteration_num, iteration_num);
+                 std::get<2>(result) / (double)iteration_num, iteration_num);
     }
-    return std::make_tuple(std::move(JTJ), std::move(JTr), r2_sum);
+    return result;
 }
 
 // clang-format off
diff --git a/cpp/open3d/utility/Parallel.cpp b/cpp/open3d/utility/Parallel.cpp
index b05f2375a95..5bb4ef733c3 100644
--- a/cpp/open3d/utility/Parallel.cpp
+++ b/cpp/open3d/utility/Parallel.cpp
@@ -45,13 +45,14 @@ int EstimateMaxThreads() {
 #endif
 }
 
-bool InParallel() {
-    // TODO: when we add TBB/Parallel STL support to ParallelFor, update this.
-#ifdef _OPENMP
-    return omp_in_parallel();
-#else
-    return false;
-#endif
+std::size_t& DefaultGrainSizeTBB() noexcept {
+    static std::size_t GrainSize = 256;
+    return GrainSize;
+}
+
+std::size_t& DefaultGrainSizeTBB2D() noexcept {
+    static std::size_t GrainSize = 32;
+    return GrainSize;
 }
 
 }  // namespace utility
diff --git a/cpp/open3d/utility/Parallel.h b/cpp/open3d/utility/Parallel.h
index 9763f533221..61452ac85c3 100644
--- a/cpp/open3d/utility/Parallel.h
+++ b/cpp/open3d/utility/Parallel.h
@@ -7,14 +7,22 @@
 
 #pragma once
 
+#include <cstdint>
+
 namespace open3d {
 namespace utility {
 
 /// Estimate the maximum number of threads to be used in a parallel region.
 int EstimateMaxThreads();
 
-/// Returns true if in an parallel section.
-bool InParallel();
+/// Returns a reference to the default grain size used by TBB.
+/// Can be altered if needed.
+std::size_t& DefaultGrainSizeTBB() noexcept;
+
+/// Returns a reference to the default grain size used by TBB
+/// for 2d blocked parallel ranges
+/// Can be altered if needed
+std::size_t& DefaultGrainSizeTBB2D() noexcept;
 
 }  // namespace utility
 }  // namespace open3d
diff --git a/cpp/open3d/utility/ProgressBar.cpp b/cpp/open3d/utility/ProgressBar.cpp
index 5a5c6fce1ab..780affbaecb 100644
--- a/cpp/open3d/utility/ProgressBar.cpp
+++ b/cpp/open3d/utility/ProgressBar.cpp
@@ -13,6 +13,8 @@
 #include <omp.h>
 #endif
 
+#include <tbb/tbb.h>
+
 namespace open3d {
 namespace utility {
 
@@ -86,5 +88,28 @@ ProgressBar &OMPProgressBar::operator++() {
     return *this;
 }
 
+TBBProgressBar::TBBProgressBar(std::size_t expected_count,
+                               const std::string &progress_info,
+                               bool active)
+    : ProgressBar(expected_count, progress_info, active),
+      tbb::task_scheduler_observer(), num_threads(0) {
+    tbb::task_scheduler_observer::observe(true);
+}
+
+void TBBProgressBar::on_scheduler_entry(bool is_worker) {
+    if (is_worker) { ++num_threads; }
+}
+void TBBProgressBar::on_scheduler_exit(bool is_worker) {
+    if (is_worker) { --num_threads; }
+}
+
+ProgressBar &TBBProgressBar::operator++() {
+    // Ref: https://stackoverflow.com/a/44555438
+    if (tbb::this_task_arena::current_thread_index() == 0) {
+        SetCurrentCount(current_count_ + num_threads);
+    }
+    return *this;
+}
+
 }  // namespace utility
 }  // namespace open3d
diff --git a/cpp/open3d/utility/ProgressBar.h b/cpp/open3d/utility/ProgressBar.h
index 710fa370f0e..813730e1156 100644
--- a/cpp/open3d/utility/ProgressBar.h
+++ b/cpp/open3d/utility/ProgressBar.h
@@ -9,6 +9,8 @@
 
 #include <string>
 
+#include <tbb/task_scheduler_observer.h>
+
 namespace open3d {
 namespace utility {
 
@@ -42,5 +44,17 @@ class OMPProgressBar : public ProgressBar {
     ProgressBar &operator++() override;
 };
 
+class TBBProgressBar : public ProgressBar, tbb::task_scheduler_observer {
+public:
+    std::atomic<int> num_threads;
+    TBBProgressBar(std::size_t expected_count,
+                   const std::string &progress_info,
+                   bool active = false);
+    ProgressBar &operator++() override;
+
+    void on_scheduler_entry(bool is_worker) override;
+    void on_scheduler_exit(bool is_worker) override;
+};
+
 }  // namespace utility
 }  // namespace open3d
diff --git a/cpp/open3d/utility/Random.cpp b/cpp/open3d/utility/Random.cpp
index 8d0c54a9697..f04a9c4e626 100644
--- a/cpp/open3d/utility/Random.cpp
+++ b/cpp/open3d/utility/Random.cpp
@@ -34,10 +34,10 @@ class RandomContext {
 
     /// This is used by other downstream random generators.
     /// You must also lock the GetMutex() before calling the engine.
-    std::mt19937* GetEngine() { return &engine_; }
+    std::mt19937& GetEngine() { return engine_; }
 
     /// Get global singleton mutex to protect the engine call.
-    std::mutex* GetMutex() { return &mutex_; }
+    tbb::spin_mutex& GetMutex() { return mutex_; }
 
 private:
     RandomContext() {
@@ -47,18 +47,18 @@ class RandomContext {
     }
     int seed_;
     std::mt19937 engine_;
-    std::mutex mutex_;
+    tbb::spin_mutex mutex_;
 };
 
 void Seed(const int seed) { RandomContext::GetInstance().Seed(seed); }
 
-std::mt19937* GetEngine() { return RandomContext::GetInstance().GetEngine(); }
+std::mt19937& GetEngine() { return RandomContext::GetInstance().GetEngine(); }
 
-std::mutex* GetMutex() { return RandomContext::GetInstance().GetMutex(); }
+tbb::spin_mutex& GetMutex() { return RandomContext::GetInstance().GetMutex(); }
 
 uint32_t RandUint32() {
-    std::lock_guard<std::mutex> lock(*GetMutex());
-    return (*GetEngine())();
+    tbb::spin_mutex::scoped_lock lock(GetMutex());
+    return GetEngine()();
 }
 
 }  // namespace random
diff --git a/cpp/open3d/utility/Random.h b/cpp/open3d/utility/Random.h
index 1a3411d29a2..6b38a2fd02d 100644
--- a/cpp/open3d/utility/Random.h
+++ b/cpp/open3d/utility/Random.h
@@ -7,7 +7,7 @@
 
 #pragma once
 
-#include <mutex>
+#include <tbb/spin_mutex.h>
 #include <random>
 
 #include "open3d/utility/Logging.h"
@@ -32,11 +32,11 @@ void Seed(const int seed);
 ///     std::shuffle(vals.begin(), vals.end(), *utility::random::GetEngine());
 /// }
 /// ```
-std::mt19937* GetEngine();
+std::mt19937& GetEngine();
 
 /// Get global singleton mutex to protect the engine call. Also see
 /// random::GetEngine().
-std::mutex* GetMutex();
+tbb::spin_mutex& GetMutex();
 
 /// Generate a random uint32.
 /// This function is globally seeded by utility::random::Seed().
@@ -80,8 +80,8 @@ class UniformIntGenerator {
 
     /// Call this to generate a uniformly distributed integer.
     T operator()() {
-        std::lock_guard<std::mutex> lock(*GetMutex());
-        return distribution_(*GetEngine());
+        tbb::spin_mutex::scoped_lock lock(GetMutex());
+        return distribution_(GetEngine());
     }
 
 protected:
@@ -122,8 +122,8 @@ class UniformRealGenerator {
 
     /// Call this to generate a uniformly distributed floating point value.
     T operator()() {
-        std::lock_guard<std::mutex> lock(*GetMutex());
-        return distribution_(*GetEngine());
+        tbb::spin_mutex::scoped_lock lock(GetMutex());
+        return distribution_(GetEngine());
     }
 
 protected:
@@ -163,8 +163,8 @@ class NormalGenerator {
 
     /// Call this to generate a normally distributed floating point value.
     T operator()() {
-        std::lock_guard<std::mutex> lock(*GetMutex());
-        return distribution_(*GetEngine());
+        tbb::spin_mutex::scoped_lock lock(GetMutex());
+        return distribution_(GetEngine());
     }
 
 protected:
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 3d58a53ad84..b2be6212066 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -40,6 +40,7 @@ target_link_libraries(tests PRIVATE
     Open3D::3rdparty_googletest
     Open3D::3rdparty_threads
     Open3D::3rdparty_vtk
+    Open3D::3rdparty_tbb
 )
 
 if (TARGET Open3D::3rdparty_openmp)
diff --git a/cpp/tests/utility/ProgressBar.cpp b/cpp/tests/utility/ProgressBar.cpp
index 4641305813c..f94f9bd55d8 100644
--- a/cpp/tests/utility/ProgressBar.cpp
+++ b/cpp/tests/utility/ProgressBar.cpp
@@ -10,6 +10,8 @@
 #include <chrono>
 #include <thread>
 
+#include <tbb/parallel_for.h>
+
 #include "open3d/utility/Parallel.h"
 #include "tests/Tests.h"
 
@@ -41,5 +43,20 @@ TEST(ProgressBar, OMPProgressBar) {
     EXPECT_TRUE(static_cast<int>(progress_bar.GetCurrentCount()) >= iterations);
 }
 
+TEST(ProgressBar, TBBProgressBar) {
+    int iterations = 1000;
+    utility::TBBProgressBar progress_bar(
+            iterations, "TBBProgressBar test: ", true);
+    tbb::parallel_for(tbb::blocked_range<int>(
+            0, iterations, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<int>& range) {
+        for (int i = range.begin(); i < range.end(); ++i) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            ++progress_bar;
+        }
+    });
+    EXPECT_TRUE(static_cast<int>(progress_bar.GetCurrentCount()) >= iterations);
+}
+
 }  // namespace tests
 }  // namespace open3d
diff --git a/examples/cpp/CMakeLists.txt b/examples/cpp/CMakeLists.txt
index f98562acdd5..b5d6cf188c3 100644
--- a/examples/cpp/CMakeLists.txt
+++ b/examples/cpp/CMakeLists.txt
@@ -62,11 +62,7 @@ open3d_add_example(SLACIntegrate)
 open3d_add_example(TIntegrateRGBD)
 open3d_add_example(TOdometryRGBD)
 open3d_add_example(TriangleMesh)
-if (TARGET Open3D::3rdparty_openmp)
-    open3d_add_example(TrimMeshBasedOnPointCloud Open3D::3rdparty_openmp)
-else ()
-    open3d_add_example(TrimMeshBasedOnPointCloud)
-endif()
+open3d_add_example(TrimMeshBasedOnPointCloud Open3D::3rdparty_tbb)
 open3d_add_example(ViewDistances)
 open3d_add_example(ViewPCDMatch)
 open3d_add_example(Visualizer)
@@ -93,11 +89,7 @@ if(BUILD_WEBRTC)
 endif()
 
 if (BUILD_LIBREALSENSE)
-    if (TARGET Open3D::3rdparty_openmp)
-        open3d_add_example(RealSenseBagReader    Open3D::3rdparty_librealsense Open3D::3rdparty_jsoncpp Open3D::3rdparty_openmp)
-    else()
-        open3d_add_example(RealSenseBagReader    Open3D::3rdparty_librealsense Open3D::3rdparty_jsoncpp)
-    endif()
+    open3d_add_example(RealSenseBagReader    Open3D::3rdparty_librealsense Open3D::3rdparty_jsoncpp Open3D::3rdparty_tbb)
     open3d_add_example(RealSenseRecorder     Open3D::3rdparty_librealsense)
 endif()
 
diff --git a/examples/cpp/OpenMP.cpp b/examples/cpp/OpenMP.cpp
index 0995a0bf49e..bcf707d3cb3 100644
--- a/examples/cpp/OpenMP.cpp
+++ b/examples/cpp/OpenMP.cpp
@@ -62,7 +62,7 @@ void TestMatrixMultiplication(int argc, char **argv) {
     }
 
 #ifdef _OPENMP
-    utility::LogInfo("OpenMP is supported.");
+    utility::LogWarning("OpenMP is support is deprecated.");
 #else
     utility::LogInfo("OpenMP is not supported.");
 #endif
diff --git a/examples/cpp/RealSenseBagReader.cpp b/examples/cpp/RealSenseBagReader.cpp
index 8fd81397022..8c58801b0da 100644
--- a/examples/cpp/RealSenseBagReader.cpp
+++ b/examples/cpp/RealSenseBagReader.cpp
@@ -14,6 +14,8 @@
 #include <string>
 #include <thread>
 
+#include <tbb/parallel_invoke.h>
+
 #include "open3d/Open3D.h"
 
 using namespace open3d;
@@ -203,23 +205,18 @@ int main(int argc, char *argv[]) {
             }
 
             ++idx;
-            if (write_image)
-#pragma omp parallel sections
-            {
-#pragma omp section
-                {
+            if (write_image) {
+                tbb::parallel_invoke([&](){
                     auto color_file = fmt::format("{0}/color/{1:05d}.jpg",
                                                   output_path, idx);
                     utility::LogInfo("Writing to {}", color_file);
                     io::WriteImage(color_file, im_rgbd.color_);
-                }
-#pragma omp section
-                {
+                }, [&](){
                     auto depth_file = fmt::format("{0}/depth/{1:05d}.png",
                                                   output_path, idx);
                     utility::LogInfo("Writing to {}", depth_file);
                     io::WriteImage(depth_file, im_rgbd.depth_);
-                }
+                });
             }
             vis.UpdateGeometry();
             vis.UpdateRender();
diff --git a/examples/cpp/TrimMeshBasedOnPointCloud.cpp b/examples/cpp/TrimMeshBasedOnPointCloud.cpp
index a5f82506f0a..ef45c51b801 100644
--- a/examples/cpp/TrimMeshBasedOnPointCloud.cpp
+++ b/examples/cpp/TrimMeshBasedOnPointCloud.cpp
@@ -7,6 +7,8 @@
 
 #include "open3d/Open3D.h"
 
+#include <tbb/parallel_for.h>
+
 void PrintHelp() {
     using namespace open3d;
 
@@ -63,19 +65,21 @@ int main(int argc, char* argv[]) {
     geometry::KDTreeFlann kdtree;
     kdtree.SetGeometry(*pcd);
     std::vector<bool> remove_vertex_mask(mesh->vertices_.size(), false);
-    utility::ProgressBar progress_bar(mesh->vertices_.size(),
-                                             "Prune vetices: ");
-#pragma omp parallel for schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int i = 0; i < (int)mesh->vertices_.size(); i++) {
-        std::vector<int> indices(1);
-        std::vector<double> dists(1);
-        int k = kdtree.SearchKNN(mesh->vertices_[i], 1, indices, dists);
-        if (k == 0 || dists[0] > distance * distance) {
-            remove_vertex_mask[i] = true;
+    utility::TBBProgressBar progress_bar(mesh->vertices_.size(),
+                                         "Prune vetices: ");
+    tbb::parallel_for(tbb::blocked_range<std::size_t>(
+            0, mesh->vertices_.size(), utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t>& range){
+        for (std::size_t i = range.begin(); i < range.end(); ++i) {
+            std::vector<int> indices(1);
+            std::vector<double> dists(1);
+            int k = kdtree.SearchKNN(mesh->vertices_[i], 1, indices, dists);
+            if (k == 0 || dists[0] > distance * distance) {
+                remove_vertex_mask[i] = true;
+            }
+            ++progress_bar;
         }
-#pragma omp critical
-        { ++progress_bar; }
-    }
+    });
 
     std::vector<int> index_old_to_new(mesh->vertices_.size());
     bool has_vert_normal = mesh->HasVertexNormals();

From 0c367840ff96558940773b414ee528cff4af30c9 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 13:49:06 -0800
Subject: [PATCH 02/22] Swichted usage of std::mutex to tbb::mutex for
 consistency when used in conjunction with tbb parallel constructs.

---
 .../impl/continuous_conv/ContinuousConvBackpropFilter.h  | 7 +++----
 .../ContinuousConvTransposeBackpropFilter.h              | 7 +++----
 .../ml/impl/sparse_conv/SparseConvBackpropFilter.h       | 9 +++++----
 .../impl/sparse_conv/SparseConvTransposeBackpropFilter.h | 9 +++++----
 4 files changed, 16 insertions(+), 16 deletions(-)

diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.h b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.h
index b9b37e9410c..6fe3f60255f 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.h
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvBackpropFilter.h
@@ -7,10 +7,9 @@
 
 #pragma once
 
+#include <tbb/mutex.h>
 #include <tbb/parallel_for.h>
 
-#include <mutex>
-
 #include "open3d/ml/impl/continuous_conv/CoordinateTransformation.h"
 
 namespace open3d {
@@ -61,7 +60,7 @@ void _CConvBackropFilterCPU(TOut* filter_backprop,
                                             filter_dims[0]);
 
     memset(filter_backprop, 0, sizeof(TOut) * total_filter_size);
-    std::mutex filter_backprop_mutex;
+    tbb::mutex filter_backprop_mutex;
 
     tbb::parallel_for(
             tbb::blocked_range<size_t>(0, num_out, 32),
@@ -201,7 +200,7 @@ void _CConvBackropFilterCPU(TOut* filter_backprop,
                 A = (C * B.transpose()).template cast<TOut>();
 
                 {
-                    std::lock_guard<std::mutex> lock(filter_backprop_mutex);
+                    tbb::mutex::scoped_lock lock(filter_backprop_mutex);
                     int linear_i = 0;
                     for (int j = 0; j < spatial_filter_size * in_channels; ++j)
                         for (int i = 0; i < out_channels; ++i, ++linear_i) {
diff --git a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.h b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.h
index 251f59cdf18..56e569a3a36 100644
--- a/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.h
+++ b/cpp/open3d/ml/impl/continuous_conv/ContinuousConvTransposeBackpropFilter.h
@@ -7,10 +7,9 @@
 
 #pragma once
 
+#include <tbb/mutex.h>
 #include <tbb/parallel_for.h>
 
-#include <mutex>
-
 #include "open3d/ml/impl/continuous_conv/CoordinateTransformation.h"
 
 namespace open3d {
@@ -62,7 +61,7 @@ void _CConvTransposeBackpropFilterCPU(TOut* filter_backprop,
 
     memset(filter_backprop, 0,
            sizeof(TOut) * spatial_filter_size * in_channels * out_channels);
-    std::mutex filter_backprop_mutex;
+    tbb::mutex filter_backprop_mutex;
 
     tbb::parallel_for(
             tbb::blocked_range<size_t>(0, num_out, 32),
@@ -207,7 +206,7 @@ void _CConvTransposeBackpropFilterCPU(TOut* filter_backprop,
                 A = (C * B.transpose()).template cast<TOut>();
 
                 {
-                    std::lock_guard<std::mutex> lock(filter_backprop_mutex);
+                    tbb::mutex::scoped_lock lock(filter_backprop_mutex);
                     int linear_i = 0;
                     for (int j = 0; j < spatial_filter_size * in_channels; ++j)
                         for (int i = 0; i < out_channels; ++i, ++linear_i) {
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h
index 67a2712f556..68e31e43e9a 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h
@@ -6,10 +6,11 @@
 // ----------------------------------------------------------------------------
 
 #pragma once
-#include <tbb/parallel_for.h>
 
 #include <Eigen/Core>
-#include <mutex>
+
+#include <tbb/mutex.h>
+#include <tbb/parallel_for.h>
 
 namespace open3d {
 namespace ml {
@@ -45,7 +46,7 @@ void _SparseConvBackropFilterCPU(TOut* filter_backprop,
             num_kernel_elements * in_channels * out_channels;
 
     memset(filter_backprop, 0, sizeof(TOut) * total_filter_size);
-    std::mutex filter_backprop_mutex;
+    tbb::mutex filter_backprop_mutex;
 
     tbb::parallel_for(
             tbb::blocked_range<size_t>(0, num_out, 10032),
@@ -112,7 +113,7 @@ void _SparseConvBackropFilterCPU(TOut* filter_backprop,
                 A = C * B.transpose();
 
                 {
-                    std::lock_guard<std::mutex> lock(filter_backprop_mutex);
+                    tbb::mutex::scoped_lock lock(filter_backprop_mutex);
                     int linear_i = 0;
                     for (int j = 0; j < num_kernel_elements * in_channels; ++j)
                         for (int i = 0; i < out_channels; ++i, ++linear_i) {
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h
index d6559844c0b..6266a178afb 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h
@@ -6,10 +6,11 @@
 // ----------------------------------------------------------------------------
 
 #pragma once
-#include <tbb/parallel_for.h>
 
 #include <Eigen/Core>
-#include <mutex>
+
+#include <tbb/mutex.h>
+#include <tbb/parallel_for.h>
 
 namespace open3d {
 namespace ml {
@@ -47,7 +48,7 @@ void _SparseConvTransposeBackpropFilterCPU(
 
     memset(filter_backprop, 0,
            sizeof(TOut) * num_kernel_elements * in_channels * out_channels);
-    std::mutex filter_backprop_mutex;
+    tbb::mutex filter_backprop_mutex;
 
     tbb::parallel_for(
             tbb::blocked_range<size_t>(0, num_out, 32),
@@ -130,7 +131,7 @@ void _SparseConvTransposeBackpropFilterCPU(
                 A = (C * B.transpose()).template cast<TOut>();
 
                 {
-                    std::lock_guard<std::mutex> lock(filter_backprop_mutex);
+                    tbb::mutex::scoped_lock lock(filter_backprop_mutex);
                     int linear_i = 0;
                     for (int j = 0; j < num_kernel_elements * in_channels; ++j)
                         for (int i = 0; i < out_channels; ++i, ++linear_i) {

From 1514fc1bc2a77268beee82ca54bc895b0b768d40 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 16:27:51 -0800
Subject: [PATCH 03/22] Fixed bug in CPU reduction

---
 cpp/open3d/core/kernel/ReductionCPU.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/open3d/core/kernel/ReductionCPU.cpp b/cpp/open3d/core/kernel/ReductionCPU.cpp
index cf4c7f12e85..05fe7c4fab9 100644
--- a/cpp/open3d/core/kernel/ReductionCPU.cpp
+++ b/cpp/open3d/core/kernel/ReductionCPU.cpp
@@ -194,7 +194,7 @@ class CPUArgReductionEngine {
                 // sub_indexer's workload_idx is indexer_'s ipo_idx.
                 Indexer sub_indexer = indexer_.GetPerOutputIndexer(output_idx);
                 using result_t = std::pair<int64_t, scalar_t>;
-                result_t val_idx{identity, identity};
+                result_t val_idx{-1, identity};
                 val_idx = tbb::parallel_deterministic_reduce(
                         tbb::blocked_range<int64_t>(0, sub_indexer.NumWorkloads(),
                         utility::DefaultGrainSizeTBB()), val_idx,
@@ -213,7 +213,7 @@ class CPUArgReductionEngine {
                             std::get<0>(b), std::get<1>(b));
                 });
                 *reinterpret_cast<int64_t *>(sub_indexer.GetOutputPtr(0)) =
-                        std::get<1>(val_idx);
+                        std::get<0>(val_idx);
             }
         });
     }

From 31f1e25a6feacb1d23f0a1379c5bc7805530c59b Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 16:29:08 -0800
Subject: [PATCH 04/22] Shift atomic to outside of RW mutex in
 PointCloudSegmentation.cpp

---
 cpp/open3d/geometry/PointCloudSegmentation.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/cpp/open3d/geometry/PointCloudSegmentation.cpp b/cpp/open3d/geometry/PointCloudSegmentation.cpp
index e4da2a382b4..48ae5b22bd3 100644
--- a/cpp/open3d/geometry/PointCloudSegmentation.cpp
+++ b/cpp/open3d/geometry/PointCloudSegmentation.cpp
@@ -187,13 +187,12 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
 
     // Use size_t here to avoid large integer which exceed max of int.
     std::size_t break_iteration = std::numeric_limits<std::size_t>::max();
-    std::size_t iteration_count = 0;
+    std::atomic<std::size_t> iteration_count = 0;
 
     tbb::spin_rw_mutex mtx;
     tbb::parallel_for(tbb::blocked_range<int>(0, num_iterations, 1),
             [&](const tbb::blocked_range<int>& range) {
         for (int i = range.begin(); i < range.end(); ++i) {
-            // Eschew using a mutex here because doing an extra iteration is ok
             if (iteration_count > break_iteration) {
                 continue;
             }
@@ -238,8 +237,8 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
                         }
                     }
                 }
-                iteration_count++;
             }
+            iteration_count++;
         }
     });
 

From d7a82f16724749673901ae0906604300a4ce04b9 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 16:30:29 -0800
Subject: [PATCH 05/22] Switch from using a mutex to a concurrent vector for
 parallelization of self intersecting triangles.

---
 cpp/open3d/geometry/TriangleMesh.cpp | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/open3d/geometry/TriangleMesh.cpp b/cpp/open3d/geometry/TriangleMesh.cpp
index f77c1965001..154c1bb6957 100644
--- a/cpp/open3d/geometry/TriangleMesh.cpp
+++ b/cpp/open3d/geometry/TriangleMesh.cpp
@@ -1358,8 +1358,7 @@ bool TriangleMesh::IsVertexManifold() const {
 
 std::vector<Eigen::Vector2i> TriangleMesh::GetSelfIntersectingTriangles()
         const {
-    std::vector<Eigen::Vector2i> self_intersecting_triangles;
-    tbb::spin_mutex out_mtx;
+    tbb::concurrent_vector<Eigen::Vector2i> self_intersecting_triangles;
     tbb::parallel_for(tbb::blocked_range<std::size_t>(
             0, triangles_.size(), utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range){
@@ -1396,14 +1395,14 @@ std::vector<Eigen::Vector2i> TriangleMesh::GetSelfIntersectingTriangles()
                         q0.array().max(q1.array().max(q2.array()));
                 if (IntersectionTest::AABBAABB(bb_min1, bb_max1, bb_min2, bb_max2) &&
                     IntersectionTest::TriangleTriangle3d(p0, p1, p2, q0, q1, q2)) {
-                    tbb::spin_mutex::scoped_lock lock(out_mtx);
                     self_intersecting_triangles.push_back(
                             Eigen::Vector2i(tidx0, tidx1));
                 }
             }
         }
     });
-    return self_intersecting_triangles;
+    return {self_intersecting_triangles.begin(),
+            self_intersecting_triangles.end()};
 }
 
 bool TriangleMesh::IsSelfIntersecting() const {

From 181a431ff3a5bbc300afca2f97ef8f23cff7aab8 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 16:35:16 -0800
Subject: [PATCH 06/22] Get maximum threads from TBB instead of OpenMP

---
 cpp/open3d/utility/Parallel.cpp | 32 ++------------------------------
 1 file changed, 2 insertions(+), 30 deletions(-)

diff --git a/cpp/open3d/utility/Parallel.cpp b/cpp/open3d/utility/Parallel.cpp
index 5bb4ef733c3..313f80a1756 100644
--- a/cpp/open3d/utility/Parallel.cpp
+++ b/cpp/open3d/utility/Parallel.cpp
@@ -7,42 +7,14 @@
 
 #include "open3d/utility/Parallel.h"
 
-#ifdef _OPENMP
-#include <omp.h>
-#endif
+#include <tbb/task_arena.h>
 
-#include <cstdlib>
-#include <string>
-
-#include "open3d/utility/CPUInfo.h"
-#include "open3d/utility/Logging.h"
 
 namespace open3d {
 namespace utility {
 
-static std::string GetEnvVar(const std::string& name) {
-    if (const char* value = std::getenv(name.c_str())) {
-        return std::string(value);
-    } else {
-        return "";
-    }
-}
-
 int EstimateMaxThreads() {
-#ifdef _OPENMP
-    if (!GetEnvVar("OMP_NUM_THREADS").empty() ||
-        !GetEnvVar("OMP_DYNAMIC").empty()) {
-        // See the full list of OpenMP environment variables at:
-        // https://www.openmp.org/spec-html/5.0/openmpch6.html
-        return omp_get_max_threads();
-    } else {
-        // Returns the number of physical cores.
-        return utility::CPUInfo::GetInstance().NumCores();
-    }
-#else
-    (void)&GetEnvVar;  // Avoids compiler warning.
-    return 1;
-#endif
+    return tbb::this_task_arena::max_concurrency();
 }
 
 std::size_t& DefaultGrainSizeTBB() noexcept {

From 2312299c461760c4e3d792548d44bb0d0c96922a Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 16:39:11 -0800
Subject: [PATCH 07/22] Updates to ProgressBar.h/.cpp Including separating code
 for writing the progress bar into its own function and a bulk inplace add
 function operator+=. Also added TBBProgressBar. It does not inherit from
 ProgressBar as it uses an atomic for counting and has slightly different
 internals to use that atomicity.

---
 cpp/open3d/utility/ProgressBar.cpp | 112 ++++++++++++++++++++---------
 cpp/open3d/utility/ProgressBar.h   |  57 ++++++++++-----
 cpp/tests/utility/ProgressBar.cpp  |  24 +++++--
 3 files changed, 136 insertions(+), 57 deletions(-)

diff --git a/cpp/open3d/utility/ProgressBar.cpp b/cpp/open3d/utility/ProgressBar.cpp
index 780affbaecb..7a1efae3a9e 100644
--- a/cpp/open3d/utility/ProgressBar.cpp
+++ b/cpp/open3d/utility/ProgressBar.cpp
@@ -5,6 +5,7 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include "open3d/utility/Logging.h"
 #include "open3d/utility/ProgressBar.h"
 
 #include <fmt/printf.h>
@@ -18,30 +19,36 @@
 namespace open3d {
 namespace utility {
 
-ProgressBar::ProgressBar(size_t expected_count,
-                         const std::string &progress_info,
+ProgressBar::ProgressBar(std::size_t expected_count,
+                         std::string progress_info,
                          bool active) {
-    Reset(expected_count, progress_info, active);
+    Reset(expected_count, std::move(progress_info), active);
 }
 
-void ProgressBar::Reset(size_t expected_count,
-                        const std::string &progress_info,
+void ProgressBar::Reset(std::size_t expected_count,
+                        std::string progress_info,
                         bool active) {
     expected_count_ = expected_count;
-    current_count_ = static_cast<size_t>(-1);  // Guaranteed to wraparound
-    progress_info_ = progress_info;
+    progress_info_ = std::move(progress_info);
     progress_pixel_ = 0;
     active_ = active;
+    // Essentially set current count to zero but
+    // goes through an overridden increment operator
+    current_count_ = std::numeric_limits<std::size_t>::max();
     operator++();
 }
 
-ProgressBar &ProgressBar::operator++() {
+ProgressBar& ProgressBar::operator+=(std::size_t n) {
     SetCurrentCount(current_count_ + 1);
     return *this;
 }
 
-void ProgressBar::SetCurrentCount(size_t n) {
+void ProgressBar::SetCurrentCount(std::size_t n) {
     current_count_ = n;
+    UpdateBar();
+}
+
+void ProgressBar::UpdateBar() {
     if (!active_) {
         return;
     }
@@ -49,7 +56,7 @@ void ProgressBar::SetCurrentCount(size_t n) {
         fmt::print("{}[{}] 100%\n", progress_info_,
                    std::string(resolution_, '='));
     } else {
-        size_t new_progress_pixel =
+        std::size_t new_progress_pixel =
                 int(current_count_ * resolution_ / expected_count_);
         if (new_progress_pixel > progress_pixel_) {
             progress_pixel_ = new_progress_pixel;
@@ -63,53 +70,92 @@ void ProgressBar::SetCurrentCount(size_t n) {
     }
 }
 
-size_t ProgressBar::GetCurrentCount() const { return current_count_; }
-
-OMPProgressBar::OMPProgressBar(size_t expected_count,
-                               const std::string &progress_info,
+OMPProgressBar::OMPProgressBar(std::size_t expected_count,
+                               std::string progress_info,
                                bool active)
-    : ProgressBar(expected_count, progress_info, active) {}
+    : ProgressBar(expected_count, std::move(progress_info), active) {}
 
-ProgressBar &OMPProgressBar::operator++() {
+ProgressBar& OMPProgressBar::operator+=(std::size_t n) {
     // Ref: https://stackoverflow.com/a/44555438
 #ifdef _OPENMP
     int number_of_threads = omp_get_num_threads();
     int thread_id = omp_get_thread_num();
     if (number_of_threads > 1) {
         if (thread_id == 0) {
-            SetCurrentCount(current_count_ + number_of_threads);
+            SetCurrentCount(current_count_ + number_of_threads * n);
         }
     } else {
-        SetCurrentCount(current_count_ + 1);
+        SetCurrentCount(current_count_ + n);
     }
 #else
-    SetCurrentCount(current_count_ + 1);
+    SetCurrentCount(current_count_ + n);
 #endif
     return *this;
 }
 
 TBBProgressBar::TBBProgressBar(std::size_t expected_count,
-                               const std::string &progress_info,
-                               bool active)
-    : ProgressBar(expected_count, progress_info, active),
-      tbb::task_scheduler_observer(), num_threads(0) {
-    tbb::task_scheduler_observer::observe(true);
+                               std::string progress_info,
+                               bool active) {
+    Reset(expected_count, std::move(progress_info), active);
 }
 
-void TBBProgressBar::on_scheduler_entry(bool is_worker) {
-    if (is_worker) { ++num_threads; }
+void TBBProgressBar::Reset(std::size_t expected_count,
+                        std::string progress_info,
+                        bool active) noexcept(false) {
+    if (expected_count & flag_bit_mask) {
+        utility::LogError("Expected count out of range [0, 2^31)");
+    }
+    expected_count_ = expected_count;
+    progress_info_ = std::move(progress_info);
+    progress_pixel_ = 0;
+    active_ = active;
+    current_count_ = 0;
+    UpdateBar();
 }
-void TBBProgressBar::on_scheduler_exit(bool is_worker) {
-    if (is_worker) { --num_threads; }
+
+TBBProgressBar& TBBProgressBar::operator++() {
+    ++current_count_;
+    UpdateBar();
+    return *this;
 }
 
-ProgressBar &TBBProgressBar::operator++() {
-    // Ref: https://stackoverflow.com/a/44555438
-    if (tbb::this_task_arena::current_thread_index() == 0) {
-        SetCurrentCount(current_count_ + num_threads);
-    }
+TBBProgressBar& TBBProgressBar::operator+=(std::size_t n) {
+    current_count_ += n;
+    UpdateBar();
     return *this;
 }
 
+void TBBProgressBar::SetCurrentCount(std::size_t n) {
+    current_count_ = n;
+    UpdateBar();
+}
+
+void TBBProgressBar::UpdateBar() {
+    if (!active_ || current_count_ & flag_bit_mask) {
+        return;
+    }
+    // Check if the current count equals the expected count
+    // If so set the flag bit and print 100%
+    // tmp is created so that compare_exchange doesn't modify expected_count
+    if (std::size_t tmp = expected_count_;
+            current_count_.compare_exchange_strong(
+            tmp, expected_count_ | flag_bit_mask)) {
+        fmt::print("{}[{}] 100%\n", progress_info_,
+                   std::string(resolution_, '='));
+    } else if (tbb::this_task_arena::current_thread_index() != 0) {
+        std::size_t new_progress_pixel = current_count_ *
+                resolution_ / expected_count_;
+        if (new_progress_pixel > progress_pixel_) {
+            progress_pixel_ = new_progress_pixel;
+            int percent = int(current_count_ * 100 / expected_count_);
+            fmt::print("{}[{}>{}] {:d}%\r", progress_info_,
+                       std::string(progress_pixel_, '='),
+                       std::string(resolution_ - 1 - progress_pixel_, ' '),
+                       percent);
+            fflush(stdout);
+        }
+    }
+}
+
 }  // namespace utility
 }  // namespace open3d
diff --git a/cpp/open3d/utility/ProgressBar.h b/cpp/open3d/utility/ProgressBar.h
index 813730e1156..cc68a4f2c6b 100644
--- a/cpp/open3d/utility/ProgressBar.h
+++ b/cpp/open3d/utility/ProgressBar.h
@@ -7,9 +7,10 @@
 
 #pragma once
 
-#include <string>
+#include <tbb/collaborative_call_once.h>
 
-#include <tbb/task_scheduler_observer.h>
+#include <atomic>
+#include <string>
 
 namespace open3d {
 namespace utility {
@@ -17,43 +18,61 @@ namespace utility {
 class ProgressBar {
 public:
     ProgressBar(size_t expected_count,
-                const std::string &progress_info,
+                std::string progress_info,
                 bool active = false);
     void Reset(size_t expected_count,
-               const std::string &progress_info,
+               std::string progress_info,
                bool active);
-    virtual ProgressBar &operator++();
+    inline ProgressBar& operator++() { return *this += 1; };
+    virtual ProgressBar& operator+=(std::size_t n);
     void SetCurrentCount(size_t n);
-    size_t GetCurrentCount() const;
+    void UpdateBar();
+    inline std::size_t GetCurrentCount() const { return current_count_; }
     virtual ~ProgressBar() = default;
 
 protected:
-    const size_t resolution_ = 40;
-    size_t expected_count_;
-    size_t current_count_;
+    static constexpr size_t resolution_ = 40;
+    std::size_t expected_count_;
+    std::size_t current_count_;
     std::string progress_info_;
-    size_t progress_pixel_;
+    std::size_t progress_pixel_;
     bool active_;
 };
 
 class OMPProgressBar : public ProgressBar {
 public:
-    OMPProgressBar(size_t expected_count,
-                   const std::string &progress_info,
+    OMPProgressBar(std::size_t expected_count,
+                   std::string progress_info,
                    bool active = false);
-    ProgressBar &operator++() override;
+    ProgressBar& operator+=(std::size_t) override;
 };
 
-class TBBProgressBar : public ProgressBar, tbb::task_scheduler_observer {
+class TBBProgressBar {
 public:
-    std::atomic<int> num_threads;
     TBBProgressBar(std::size_t expected_count,
-                   const std::string &progress_info,
+                   std::string progress_info,
                    bool active = false);
-    ProgressBar &operator++() override;
+    void Reset(std::size_t expected_count,
+               std::string progress_info,
+               bool active);
+    TBBProgressBar& operator++();
+    TBBProgressBar& operator+=(std::size_t n);
+    void SetCurrentCount(std::size_t n);
+    void UpdateBar();
+    inline std::size_t GetCurrentCount() const {
+        return current_count_ & ~flag_bit_mask;
+    }
+
 
-    void on_scheduler_entry(bool is_worker) override;
-    void on_scheduler_exit(bool is_worker) override;
+protected:
+    static constexpr std::size_t flag_bit_mask = ~(~std::size_t{} >> 1);
+    static constexpr std::size_t resolution_ = 40;
+    std::atomic<std::size_t> current_count_;
+    tbb::collaborative_once_flag finalized;
+    std::size_t expected_count_;
+    std::string progress_info_;
+    mutable std::size_t progress_pixel_;
+    bool active_;
 };
 
 }  // namespace utility
diff --git a/cpp/tests/utility/ProgressBar.cpp b/cpp/tests/utility/ProgressBar.cpp
index f94f9bd55d8..da08565c1b9 100644
--- a/cpp/tests/utility/ProgressBar.cpp
+++ b/cpp/tests/utility/ProgressBar.cpp
@@ -34,28 +34,42 @@ TEST(ProgressBar, OMPProgressBar) {
     utility::OMPProgressBar progress_bar(iterations,
                                          "OMPProgressBar test: ", true);
 
-#pragma omp parallel for schedule(static) \
+#pragma omp parallel for schedule(static) default(none) \
+        shared(iterations, progress_bar) \
         num_threads(utility::EstimateMaxThreads())
     for (int i = 0; i < iterations; ++i) {
         std::this_thread::sleep_for(std::chrono::milliseconds(10));
         ++progress_bar;
     }
-    EXPECT_TRUE(static_cast<int>(progress_bar.GetCurrentCount()) >= iterations);
+    EXPECT_GE(static_cast<int>(progress_bar.GetCurrentCount()), iterations);
 }
 
 TEST(ProgressBar, TBBProgressBar) {
     int iterations = 1000;
     utility::TBBProgressBar progress_bar(
             iterations, "TBBProgressBar test: ", true);
-    tbb::parallel_for(tbb::blocked_range<int>(
-            0, iterations, utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(tbb::blocked_range<int>(0, iterations, 10),
             [&](const tbb::blocked_range<int>& range) {
         for (int i = range.begin(); i < range.end(); ++i) {
             std::this_thread::sleep_for(std::chrono::milliseconds(10));
             ++progress_bar;
         }
     });
-    EXPECT_TRUE(static_cast<int>(progress_bar.GetCurrentCount()) >= iterations);
+    EXPECT_EQ(static_cast<int>(progress_bar.GetCurrentCount()), iterations);
+}
+
+TEST(ProgressBar, TBBProgressBarBatched) {
+    int iterations = 1000;
+    utility::TBBProgressBar progress_bar(
+            iterations, "Batched TBBProgressBar test: ", true);
+    tbb::parallel_for(tbb::blocked_range<int>(0, iterations, 10),
+            [&](const tbb::blocked_range<int>& range) {
+        for (int i = range.begin(); i < range.end(); ++i) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+        }
+        progress_bar += (range.end() - range.begin());
+    });
+    EXPECT_EQ(static_cast<int>(progress_bar.GetCurrentCount()), iterations);
 }
 
 }  // namespace tests

From bd258e34a24d0cdccdaa91ce306cd297ece6c712 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 16:40:01 -0800
Subject: [PATCH 08/22] Updated ClusterDBSCAN in PointCloudCluster.cpp to bulk
 update the progress bar to limit spinning on the mutex.

---
 cpp/open3d/geometry/PointCloudCluster.cpp | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/cpp/open3d/geometry/PointCloudCluster.cpp b/cpp/open3d/geometry/PointCloudCluster.cpp
index 5166b9ac244..acc42a89350 100644
--- a/cpp/open3d/geometry/PointCloudCluster.cpp
+++ b/cpp/open3d/geometry/PointCloudCluster.cpp
@@ -27,8 +27,8 @@ std::vector<int> PointCloud::ClusterDBSCAN(double eps,
 
     // Precompute all neighbors.
     utility::LogDebug("Precompute neighbors.");
-    utility::ProgressBar progress_bar(points_.size(), "Precompute neighbors.",
-                                      print_progress);
+    utility::ProgressBar progress_bar(points_.size(),
+            "Precompute neighbors.", print_progress);
     std::vector<std::vector<int>> nbs(points_.size());
 
     tbb::spin_mutex mtx;
@@ -39,11 +39,9 @@ std::vector<int> PointCloud::ClusterDBSCAN(double eps,
         for (std::size_t i = range.begin(); i < range.end(); ++i) {
             std::vector<double> dists2;
             kdtree.SearchRadius(points_[i], eps, nbs[i], dists2);
-            {
-                tbb::spin_mutex::scoped_lock lock(mtx);
-                ++progress_bar;
-            }
         }
+        tbb::spin_mutex::scoped_lock lock(mtx);
+        progress_bar += (range.end() - range.begin());
     });
 
     utility::LogDebug("Done Precompute neighbors.");

From 27898178ed417d33416112f2136160910855777b Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Fri, 26 Jan 2024 18:50:25 -0800
Subject: [PATCH 09/22] Applied Open3D style

---
 .../LegacyReconstructionUtil.h                |  84 +++--
 cpp/open3d/core/ParallelFor.h                 |  16 +-
 .../core/hashmap/CPU/CPUHashBackendBuffer.cpp |  17 +-
 cpp/open3d/core/hashmap/CPU/TBBHashBackend.h  | 104 +++---
 cpp/open3d/core/kernel/NonZeroCPU.cpp         |  34 +-
 cpp/open3d/core/kernel/ReductionCPU.cpp       | 123 ++++---
 cpp/open3d/geometry/EstimateNormals.cpp       | 101 +++---
 cpp/open3d/geometry/ISSKeypoints.cpp          |  93 ++---
 cpp/open3d/geometry/Image.cpp                 | 247 ++++++++------
 cpp/open3d/geometry/PointCloud.cpp            | 199 ++++++-----
 cpp/open3d/geometry/PointCloudCluster.cpp     |  29 +-
 .../PointCloudPlanarPatchDetection.cpp        |  22 +-
 .../geometry/PointCloudSegmentation.cpp       | 107 +++---
 cpp/open3d/geometry/TriangleMesh.cpp          | 186 +++++-----
 .../geometry/TriangleMeshDeformation.cpp      | 155 +++++----
 .../sensor/azure_kinect/AzureKinectSensor.cpp |  32 +-
 .../sparse_conv/SparseConvBackpropFilter.h    |   4 +-
 .../SparseConvTransposeBackpropFilter.h       |   4 +-
 .../pipelines/color_map/ColorMapUtils.cpp     | 265 ++++++++-------
 .../pipelines/color_map/NonRigidOptimizer.cpp | 174 +++++-----
 .../pipelines/color_map/RigidOptimizer.cpp    |  89 ++---
 .../integration/UniformTSDFVolume.cpp         | 321 ++++++++++--------
 cpp/open3d/pipelines/odometry/Odometry.cpp    |  85 ++---
 .../registration/FastGlobalRegistration.cpp   |  65 ++--
 cpp/open3d/pipelines/registration/Feature.cpp | 181 +++++-----
 .../pipelines/registration/GeneralizedICP.cpp |  19 +-
 .../pipelines/registration/Registration.cpp   | 194 ++++++-----
 .../pipelines/registration/Registration.h     |   8 +-
 .../t/geometry/kernel/PointCloudCPU.cpp       |  72 ++--
 cpp/open3d/t/io/sensor/RGBDVideoReader.cpp    |  32 +-
 .../pipelines/kernel/FillInLinearSystemImpl.h | 120 +++----
 .../t/pipelines/kernel/RGBDOdometryCPU.cpp    | 304 ++++++++---------
 .../t/pipelines/kernel/RegistrationCPU.cpp    | 208 ++++++------
 .../t/pipelines/registration/Feature.cpp      |  20 +-
 cpp/open3d/utility/Eigen.cpp                  |  60 ++--
 cpp/open3d/utility/Logging.h                  |   8 +-
 cpp/open3d/utility/Parallel.cpp               |   5 +-
 cpp/open3d/utility/ProgressBar.cpp            |  15 +-
 cpp/open3d/utility/ProgressBar.h              |   5 +-
 cpp/open3d/utility/Random.h                   |   1 +
 cpp/tests/utility/ProgressBar.cpp             |  43 +--
 examples/cpp/RealSenseBagReader.cpp           |  27 +-
 examples/cpp/TrimMeshBasedOnPointCloud.cpp    |   4 +-
 43 files changed, 2070 insertions(+), 1812 deletions(-)

diff --git a/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h b/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h
index 4070e808e39..93a92c88930 100644
--- a/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h
+++ b/cpp/apps/OfflineReconstruction/LegacyReconstructionUtil.h
@@ -549,15 +549,17 @@ class ReconstructionPipeline {
 
         const size_t num_pairs = fragment_matching_results.size();
         if (config_["multi_threading"].asBool()) {
-            tbb::parallel_for(tbb::blocked_range<std::size_t>(0, num_pairs),
-                    [&](const tbb::blocked_range<std::size_t>& range){
-                for (std::size_t i = range.begin(); i < range.end(); ++i) {
-                    RegisterFragmentPair(ply_files,
-                                         fragment_matching_results[i].s_,
-                                         fragment_matching_results[i].t_,
-                                         fragment_matching_results[i]);
-                }
-            });
+            tbb::parallel_for(
+                    tbb::blocked_range<std::size_t>(0, num_pairs),
+                    [&](const tbb::blocked_range<std::size_t>& range) {
+                        for (std::size_t i = range.begin(); i < range.end();
+                             ++i) {
+                            RegisterFragmentPair(
+                                    ply_files, fragment_matching_results[i].s_,
+                                    fragment_matching_results[i].t_,
+                                    fragment_matching_results[i]);
+                        }
+                    });
         } else {
             for (size_t i = 0; i < num_pairs; i++) {
                 RegisterFragmentPair(ply_files, fragment_matching_results[i].s_,
@@ -640,16 +642,18 @@ class ReconstructionPipeline {
         }
 
         if (config_["multi_threading"].asBool()) {
-            tbb::parallel_for(tbb::blocked_range<std::size_t>(0,
-                    fragment_matching_results.size()),
-                    [&](const tbb::blocked_range<std::size_t>& range){
-                for (std::size_t i = range.begin(); i < range.end(); ++i) {
-                    const int s = fragment_matching_results[i].s_;
-                    const int t = fragment_matching_results[i].t_;
-                    RefineFragmentPair(ply_files, s, t,
-                                       fragment_matching_results[i]);
-                }
-            });
+            tbb::parallel_for(
+                    tbb::blocked_range<std::size_t>(
+                            0, fragment_matching_results.size()),
+                    [&](const tbb::blocked_range<std::size_t>& range) {
+                        for (std::size_t i = range.begin(); i < range.end();
+                             ++i) {
+                            const int s = fragment_matching_results[i].s_;
+                            const int t = fragment_matching_results[i].t_;
+                            RefineFragmentPair(ply_files, s, t,
+                                               fragment_matching_results[i]);
+                        }
+                    });
         } else {
             for (size_t i = 0; i < fragment_matching_results.size(); i++) {
                 const int s = fragment_matching_results[i].s_;
@@ -748,23 +752,32 @@ class ReconstructionPipeline {
                              const std::vector<std::string>& depth_files_,
                              const camera::PinholeCameraIntrinsic& intrinsic_,
                              const PoseGraphT& pose_graph_)
-                : pipeline(pipeline_), fragment_id(fragment_id_),
-              color_files(color_files_), depth_files(depth_files_),
-              intrinsic(intrinsic_), pose_graph(pose_graph_) {}
+            : pipeline(pipeline_),
+              fragment_id(fragment_id_),
+              color_files(color_files_),
+              depth_files(depth_files_),
+              intrinsic(intrinsic_),
+              pose_graph(pose_graph_) {}
 
         PointCloudIntegrator(PointCloudIntegrator& o, tbb::split)
-                : pipeline(o.pipeline), fragment_id(o.fragment_id),
-              color_files(o.color_files), depth_files(o.depth_files),
-              intrinsic(o.intrinsic), pose_graph(o.pose_graph) {}
+            : pipeline(o.pipeline),
+              fragment_id(o.fragment_id),
+              color_files(o.color_files),
+              depth_files(o.depth_files),
+              intrinsic(o.intrinsic),
+              pose_graph(o.pose_graph) {}
 
         void operator()(const tbb::blocked_range<std::size_t>& range) {
             for (std::size_t i = range.begin(); i < range.end(); ++i) {
-                const int i_abs = fragment_id *
-                    pipeline.config_["n_frames_per_fragment"].asInt() + i;
-                utility::LogInfo("Fragment {:03d} / {:03d} :: "
+                const int i_abs =
+                        fragment_id * pipeline.config_["n_frames_per_fragment"]
+                                              .asInt() +
+                        i;
+                utility::LogInfo(
+                        "Fragment {:03d} / {:03d} :: "
                         "Integrate rgbd frame {:d} ({:d} of {:d}).",
-                        fragment_id, pipeline.n_fragments_ - 1, i_abs,
-                        i + 1, pose_graph.nodes_.size());
+                        fragment_id, pipeline.n_fragments_ - 1, i_abs, i + 1,
+                        pose_graph.nodes_.size());
                 const geometry::RGBDImage rgbd = pipeline.ReadRGBDImage(
                         color_files[i_abs], depth_files[i_abs], false);
                 auto pcd = geometry::PointCloud::CreateFromRGBDImage(
@@ -773,9 +786,7 @@ class ReconstructionPipeline {
             }
         }
 
-        void join(PointCloudIntegrator& rhs) {
-            fragment += rhs.fragment;
-        }
+        void join(PointCloudIntegrator& rhs) { fragment += rhs.fragment; }
     };
 
     void IntegrateFragmentRGBD(
@@ -784,16 +795,15 @@ class ReconstructionPipeline {
             const std::vector<std::string>& depth_files,
             const camera::PinholeCameraIntrinsic& intrinsic,
             const pipelines::registration::PoseGraph& pose_graph) {
-
         const size_t graph_num = pose_graph.nodes_.size();
         PointCloudIntegrator reducer(*this, fragment_id, color_files,
                                      depth_files, intrinsic, pose_graph);
-        tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
-                0, graph_num), reducer);
+        tbb::parallel_reduce(tbb::blocked_range<std::size_t>(0, graph_num),
+                             reducer);
 
         const geometry::PointCloud fragment_down =
                 *reducer.fragment.VoxelDownSample(
-                    config_["tsdf_cubic_size"].asDouble() / 512.0);
+                        config_["tsdf_cubic_size"].asDouble() / 512.0);
         io::WritePointCloud(
                 utility::filesystem::JoinPath(
                         config_["path_dataset"].asString(),
diff --git a/cpp/open3d/core/ParallelFor.h b/cpp/open3d/core/ParallelFor.h
index ba8104fda89..1d09241fe8a 100644
--- a/cpp/open3d/core/ParallelFor.h
+++ b/cpp/open3d/core/ParallelFor.h
@@ -81,17 +81,13 @@ void ParallelForCPU_(const Device& device, int64_t n, const func_t& func) {
         return;
     }
 
-//    #pragma omp parallel for num_threads(utility::EstimateMaxThreads())
-//    for (int64_t i = 0; i < n; ++i) {
-//        func(i);
-//    }
-
     tbb::parallel_for(tbb::blocked_range<int64_t>(0, n, 32),
-            [&func](const tbb::blocked_range<int64_t>& range){
-        for (int64_t i = range.begin(); i < range.end(); ++i) {
-            func(i);
-        }
-    });
+                      [&func](const tbb::blocked_range<int64_t>& range) {
+                          for (int64_t i = range.begin(); i < range.end();
+                               ++i) {
+                              func(i);
+                          }
+                      });
 }
 
 #endif
diff --git a/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp b/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp
index 86a23c7e285..9a8a693e95f 100644
--- a/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp
+++ b/cpp/open3d/core/hashmap/CPU/CPUHashBackendBuffer.cpp
@@ -5,11 +5,11 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+
 #include "open3d/core/hashmap/HashBackendBuffer.h"
 #include "open3d/utility/Parallel.h"
 
-#include <tbb/parallel_for.h>
-
 namespace open3d {
 namespace core {
 void CPUResetHeap(Tensor& heap) {
@@ -17,12 +17,13 @@ void CPUResetHeap(Tensor& heap) {
     int64_t capacity = heap.GetLength();
 
     tbb::parallel_for(tbb::blocked_range<int64_t>(
-            0, capacity, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<int64_t>& range){
-        for (int64_t i = range.begin(); i < range.end(); ++i) {
-            heap_ptr[i] = i;
-        }
-    });
+                              0, capacity, utility::DefaultGrainSizeTBB()),
+                      [&](const tbb::blocked_range<int64_t>& range) {
+                          for (int64_t i = range.begin(); i < range.end();
+                               ++i) {
+                              heap_ptr[i] = i;
+                          }
+                      });
 };
 }  // namespace core
 }  // namespace open3d
diff --git a/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h b/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h
index b72ed98bf3d..cb76c786bb4 100644
--- a/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h
+++ b/cpp/open3d/core/hashmap/CPU/TBBHashBackend.h
@@ -94,17 +94,18 @@ void TBBHashBackend<Key, Hash, Eq>::Find(const void* input_keys,
                                          int64_t count) {
     const Key* input_keys_templated = static_cast<const Key*>(input_keys);
 
-    tbb::parallel_for(tbb::blocked_range<int64_t>(0, count, 64),
-            [=, &impl=impl_](const tbb::blocked_range<int64_t>& range){
-        for (int64_t i = range.begin(); i < range.end(); ++i) {
-            const Key& key = input_keys_templated[i];
-
-            auto iter = impl->find(key);
-            bool flag = (iter != impl->end());
-            output_masks[i] = flag;
-            output_buf_indices[i] = flag ? iter->second : 0;
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<int64_t>(0, count, 64),
+            [=, &impl = impl_](const tbb::blocked_range<int64_t>& range) {
+                for (int64_t i = range.begin(); i < range.end(); ++i) {
+                    const Key& key = input_keys_templated[i];
+
+                    auto iter = impl->find(key);
+                    bool flag = (iter != impl->end());
+                    output_masks[i] = flag;
+                    output_buf_indices[i] = flag ? iter->second : 0;
+                }
+            });
 }
 
 template <typename Key, typename Hash, typename Eq>
@@ -180,45 +181,50 @@ void TBBHashBackend<Key, Hash, Eq>::Insert(
 
     size_t n_values = input_values_soa.size();
 
-    tbb::parallel_for(tbb::blocked_range<int64_t>(0, count, 64),
-            [&](const tbb::blocked_range<int64_t>& range){
-        for (int64_t i = range.begin(); i < range.end(); ++i) {
-            output_buf_indices[i] = 0;
-            output_masks[i] = false;
-
-            const Key& key = input_keys_templated[i];
-
-            // Try to insert a dummy buffer index.
-            auto res = impl_->insert({key, 0});
-
-            // Lazy copy key value pair to buffer only if succeeded
-            if (res.second) {
-                buf_index_t buf_index = buffer_accessor_->DeviceAllocate();
-                void* key_ptr = buffer_accessor_->GetKeyPtr(buf_index);
-
-                // Copy templated key to buffer
-                *static_cast<Key*>(key_ptr) = key;
-
-                // Copy/reset non-templated value in buffer
-                for (size_t j = 0; j < n_values; ++j) {
-                    uint8_t* dst_value = static_cast<uint8_t*>(
-                            buffer_accessor_->GetValuePtr(buf_index, j));
-
-                    const uint8_t* src_value =
-                            static_cast<const uint8_t*>(input_values_soa[j]) +
-                            this->value_dsizes_[j] * i;
-                    std::memcpy(dst_value, src_value, this->value_dsizes_[j]);
+    tbb::parallel_for(
+            tbb::blocked_range<int64_t>(0, count, 64),
+            [&](const tbb::blocked_range<int64_t>& range) {
+                for (int64_t i = range.begin(); i < range.end(); ++i) {
+                    output_buf_indices[i] = 0;
+                    output_masks[i] = false;
+
+                    const Key& key = input_keys_templated[i];
+
+                    // Try to insert a dummy buffer index.
+                    auto res = impl_->insert({key, 0});
+
+                    // Lazy copy key value pair to buffer only if succeeded
+                    if (res.second) {
+                        buf_index_t buf_index =
+                                buffer_accessor_->DeviceAllocate();
+                        void* key_ptr = buffer_accessor_->GetKeyPtr(buf_index);
+
+                        // Copy templated key to buffer
+                        *static_cast<Key*>(key_ptr) = key;
+
+                        // Copy/reset non-templated value in buffer
+                        for (size_t j = 0; j < n_values; ++j) {
+                            uint8_t* dst_value = static_cast<uint8_t*>(
+                                    buffer_accessor_->GetValuePtr(buf_index,
+                                                                  j));
+
+                            const uint8_t* src_value =
+                                    static_cast<const uint8_t*>(
+                                            input_values_soa[j]) +
+                                    this->value_dsizes_[j] * i;
+                            std::memcpy(dst_value, src_value,
+                                        this->value_dsizes_[j]);
+                        }
+
+                        // Update from dummy 0
+                        res.first->second = buf_index;
+
+                        // Write to return variables
+                        output_buf_indices[i] = buf_index;
+                        output_masks[i] = true;
+                    }
                 }
-
-                // Update from dummy 0
-                res.first->second = buf_index;
-
-                // Write to return variables
-                output_buf_indices[i] = buf_index;
-                output_masks[i] = true;
-            }
-        }
-    });
+            });
 }
 
 template <typename Key, typename Hash, typename Eq>
diff --git a/cpp/open3d/core/kernel/NonZeroCPU.cpp b/cpp/open3d/core/kernel/NonZeroCPU.cpp
index 67618aca5d0..68ffff32a60 100644
--- a/cpp/open3d/core/kernel/NonZeroCPU.cpp
+++ b/cpp/open3d/core/kernel/NonZeroCPU.cpp
@@ -5,6 +5,8 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+
 #include <numeric>
 
 #include "open3d/core/Indexer.h"
@@ -12,8 +14,6 @@
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Parallel.h"
 
-#include <tbb/parallel_for.h>
-
 namespace open3d {
 namespace core {
 namespace kernel {
@@ -48,19 +48,23 @@ Tensor NonZeroCPU(const Tensor& src) {
 
     std::vector<std::vector<int64_t>> non_zero_indices_by_dimensions(
             num_dims, std::vector<int64_t>(num_non_zeros, 0));
-    tbb::parallel_for(tbb::blocked_range<int64_t>(
-            0, num_non_zeros, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<int64_t>& range){
-        for (int64_t i = range.begin(); i < range.end(); i++) {
-            int64_t non_zero_index = non_zero_indices[i];
-            for (int64_t dim = num_dims - 1; dim >= 0; dim--) {
-                void* result_ptr = result_iter.GetPtr(dim * num_non_zeros + i);
-                OPEN3D_ASSERT(result_ptr != nullptr && "Internal error.");
-                *static_cast<int64_t*>(result_ptr) = non_zero_index % shape[dim];
-                non_zero_index = non_zero_index / shape[dim];
-            }
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<int64_t>(0, num_non_zeros,
+                                        utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<int64_t>& range) {
+                for (int64_t i = range.begin(); i < range.end(); i++) {
+                    int64_t non_zero_index = non_zero_indices[i];
+                    for (int64_t dim = num_dims - 1; dim >= 0; dim--) {
+                        void* result_ptr =
+                                result_iter.GetPtr(dim * num_non_zeros + i);
+                        OPEN3D_ASSERT(result_ptr != nullptr &&
+                                      "Internal error.");
+                        *static_cast<int64_t*>(result_ptr) =
+                                non_zero_index % shape[dim];
+                        non_zero_index = non_zero_index / shape[dim];
+                    }
+                }
+            });
 
     return result;
 }
diff --git a/cpp/open3d/core/kernel/ReductionCPU.cpp b/cpp/open3d/core/kernel/ReductionCPU.cpp
index 05fe7c4fab9..bdde3acfec6 100644
--- a/cpp/open3d/core/kernel/ReductionCPU.cpp
+++ b/cpp/open3d/core/kernel/ReductionCPU.cpp
@@ -5,6 +5,9 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
 #include <limits>
 
 #include "open3d/core/Dispatch.h"
@@ -14,9 +17,6 @@
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Parallel.h"
 
-#include <tbb/parallel_reduce.h>
-#include <tbb/parallel_for.h>
-
 namespace open3d {
 namespace core {
 namespace kernel {
@@ -113,18 +113,20 @@ class CPUReductionEngine {
                     "single-output reduction ops.");
         }
         scalar_t& dst = *reinterpret_cast<scalar_t*>(indexer.GetOutputPtr(0));
-        dst = tbb::parallel_reduce(tbb::blocked_range<int64_t>(
-                0, indexer.NumWorkloads(), utility::DefaultGrainSizeTBB()),
-                identity, [&](const tbb::blocked_range<int64_t>& range,
-                                           scalar_t so_far){
-            for (int64_t workload_idx = range.begin();
-                    workload_idx < range.end(); ++workload_idx) {
-                scalar_t* src = reinterpret_cast<scalar_t*>(
-                        indexer.GetInputPtr(0, workload_idx));
-                so_far = element_kernel(*src, so_far);
-            }
-            return so_far;
-        }, element_kernel);
+        dst = tbb::parallel_reduce(
+                tbb::blocked_range<int64_t>(0, indexer.NumWorkloads(),
+                                            utility::DefaultGrainSizeTBB()),
+                identity,
+                [&](const tbb::blocked_range<int64_t>& range, scalar_t so_far) {
+                    for (int64_t workload_idx = range.begin();
+                         workload_idx < range.end(); ++workload_idx) {
+                        scalar_t* src = reinterpret_cast<scalar_t*>(
+                                indexer.GetInputPtr(0, workload_idx));
+                        so_far = element_kernel(*src, so_far);
+                    }
+                    return so_far;
+                },
+                element_kernel);
     }
 
     template <typename scalar_t, typename func_t>
@@ -155,16 +157,18 @@ class CPUReductionEngine {
                     "LaunchReductionKernelTwoPass instead.");
         }
 
-        // TODO: could theoretically do inner reductions in parallel too with TBB
-        tbb::parallel_for(tbb::blocked_range<int64_t>(
-                0, indexer_shape[best_dim], 1),
-                [&](const tbb::blocked_range<int64_t>& range){
-            for (int64_t i = range.begin(); i < range.end(); ++i) {
-                Indexer sub_indexer(indexer);
-                sub_indexer.ShrinkDim(best_dim, i, 1);
-                LaunchReductionKernelSerial<scalar_t>(sub_indexer, element_kernel);
-            }
-        });
+        // TODO: could theoretically do inner reductions in parallel too with
+        // TBB
+        tbb::parallel_for(
+                tbb::blocked_range<int64_t>(0, indexer_shape[best_dim], 1),
+                [&](const tbb::blocked_range<int64_t>& range) {
+                    for (int64_t i = range.begin(); i < range.end(); ++i) {
+                        Indexer sub_indexer(indexer);
+                        sub_indexer.ShrinkDim(best_dim, i, 1);
+                        LaunchReductionKernelSerial<scalar_t>(sub_indexer,
+                                                              element_kernel);
+                    }
+                });
     }
 
 private:
@@ -185,37 +189,48 @@ class CPUArgReductionEngine {
         // sub-iteration.
         int64_t num_output_elements = indexer_.NumOutputElements();
 
-        tbb::parallel_for(tbb::blocked_range<int64_t>(
-                0, num_output_elements, 1),
-                [&](const tbb::blocked_range<int64_t>& range){
-            for (int64_t output_idx = range.begin();
-                    output_idx < range.end(); ++output_idx) {
-                // sub_indexer.NumWorkloads() == ipo.
-                // sub_indexer's workload_idx is indexer_'s ipo_idx.
-                Indexer sub_indexer = indexer_.GetPerOutputIndexer(output_idx);
-                using result_t = std::pair<int64_t, scalar_t>;
-                result_t val_idx{-1, identity};
-                val_idx = tbb::parallel_deterministic_reduce(
-                        tbb::blocked_range<int64_t>(0, sub_indexer.NumWorkloads(),
-                        utility::DefaultGrainSizeTBB()), val_idx,
-                        [&](const tbb::blocked_range<int64_t>& range,
-                            result_t so_far){
-                    for (int64_t workload_idx = range.begin();
-                            workload_idx < range.end(); ++workload_idx) {
-                        scalar_t& src_val = *reinterpret_cast<scalar_t*>(
-                            sub_indexer.GetInputPtr(0, workload_idx));
-                        so_far = reduce_func(workload_idx, src_val,
-                            std::get<0>(so_far), std::get<1>(so_far));
+        tbb::parallel_for(
+                tbb::blocked_range<int64_t>(0, num_output_elements, 1),
+                [&](const tbb::blocked_range<int64_t>& range) {
+                    for (int64_t output_idx = range.begin();
+                         output_idx < range.end(); ++output_idx) {
+                        // sub_indexer.NumWorkloads() == ipo.
+                        // sub_indexer's workload_idx is indexer_'s ipo_idx.
+                        Indexer sub_indexer =
+                                indexer_.GetPerOutputIndexer(output_idx);
+                        using result_t = std::pair<int64_t, scalar_t>;
+                        result_t val_idx{-1, identity};
+                        val_idx = tbb::parallel_deterministic_reduce(
+                                tbb::blocked_range<int64_t>(
+                                        0, sub_indexer.NumWorkloads(),
+                                        utility::DefaultGrainSizeTBB()),
+                                val_idx,
+                                [&](const tbb::blocked_range<int64_t>& range,
+                                    result_t so_far) {
+                                    for (int64_t workload_idx = range.begin();
+                                         workload_idx < range.end();
+                                         ++workload_idx) {
+                                        scalar_t& src_val =
+                                                *reinterpret_cast<scalar_t*>(
+                                                        sub_indexer.GetInputPtr(
+                                                                0,
+                                                                workload_idx));
+                                        so_far = reduce_func(
+                                                workload_idx, src_val,
+                                                std::get<0>(so_far),
+                                                std::get<1>(so_far));
+                                    }
+                                    return so_far;
+                                },
+                                [&reduce_func](result_t a, result_t b) {
+                                    return reduce_func(
+                                            std::get<0>(a), std::get<1>(a),
+                                            std::get<0>(b), std::get<1>(b));
+                                });
+                        *reinterpret_cast<int64_t*>(sub_indexer.GetOutputPtr(
+                                0)) = std::get<0>(val_idx);
                     }
-                    return so_far;
-                }, [&reduce_func](result_t a, result_t b) {
-                    return reduce_func(std::get<0>(a), std::get<1>(a),
-                            std::get<0>(b), std::get<1>(b));
                 });
-                *reinterpret_cast<int64_t *>(sub_indexer.GetOutputPtr(0)) =
-                        std::get<0>(val_idx);
-            }
-        });
     }
 
 private:
diff --git a/cpp/open3d/geometry/EstimateNormals.cpp b/cpp/open3d/geometry/EstimateNormals.cpp
index 3f1ddd6bda2..00b96ca8fb4 100644
--- a/cpp/open3d/geometry/EstimateNormals.cpp
+++ b/cpp/open3d/geometry/EstimateNormals.cpp
@@ -5,10 +5,11 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+
 #include <Eigen/Eigenvalues>
 #include <queue>
 #include <tuple>
-#include <tbb/parallel_for.h>
 
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
@@ -300,24 +301,26 @@ void PointCloud::EstimateNormals(
         covariances = covariances_;
     }
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, covariances.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range){
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            auto normal = ComputeNormal(covariances[i], fast_normal_computation);
-            if (normal.norm() == 0.0) {
-                if (has_normal) {
-                    normal = normals_[i];
-                } else {
-                    normal = Eigen::Vector3d(0.0, 0.0, 1.0);
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, covariances.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    auto normal = ComputeNormal(covariances[i],
+                                                fast_normal_computation);
+                    if (normal.norm() == 0.0) {
+                        if (has_normal) {
+                            normal = normals_[i];
+                        } else {
+                            normal = Eigen::Vector3d(0.0, 0.0, 1.0);
+                        }
+                    }
+                    if (has_normal && normal.dot(normals_[i]) < 0.0) {
+                        normal *= -1.0;
+                    }
+                    normals_[i] = normal;
                 }
-            }
-            if (has_normal && normal.dot(normals_[i]) < 0.0) {
-                normal *= -1.0;
-            }
-            normals_[i] = normal;
-        }
-    });
+            });
 }
 
 void PointCloud::OrientNormalsToAlignWithDirection(
@@ -327,18 +330,19 @@ void PointCloud::OrientNormalsToAlignWithDirection(
         utility::LogError(
                 "No normals in the PointCloud. Call EstimateNormals() first.");
     }
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range){
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            auto &normal = normals_[i];
-            if (normal.norm() == 0.0) {
-                normal = orientation_reference;
-            } else if (normal.dot(orientation_reference) < 0.0) {
-                normal *= -1.0;
-            }
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    auto &normal = normals_[i];
+                    if (normal.norm() == 0.0) {
+                        normal = orientation_reference;
+                    } else if (normal.dot(orientation_reference) < 0.0) {
+                        normal *= -1.0;
+                    }
+                }
+            });
 }
 
 void PointCloud::OrientNormalsTowardsCameraLocation(
@@ -348,25 +352,26 @@ void PointCloud::OrientNormalsTowardsCameraLocation(
                 "No normals in the PointCloud. Call EstimateNormals() first.");
     }
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range){
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            Eigen::Vector3d orientation_reference =
-                    camera_location - points_[i];
-            auto &normal = normals_[i];
-            if (normal.norm() == 0.0) {
-                normal = orientation_reference;
-                if (normal.norm() == 0.0) {
-                    normal = Eigen::Vector3d(0.0, 0.0, 1.0);
-                } else {
-                    normal.normalize();
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    Eigen::Vector3d orientation_reference =
+                            camera_location - points_[i];
+                    auto &normal = normals_[i];
+                    if (normal.norm() == 0.0) {
+                        normal = orientation_reference;
+                        if (normal.norm() == 0.0) {
+                            normal = Eigen::Vector3d(0.0, 0.0, 1.0);
+                        } else {
+                            normal.normalize();
+                        }
+                    } else if (normal.dot(orientation_reference) < 0.0) {
+                        normal *= -1.0;
+                    }
                 }
-            } else if (normal.dot(orientation_reference) < 0.0) {
-                normal *= -1.0;
-            }
-        }
-    });
+            });
 }
 
 void PointCloud::OrientNormalsConsistentTangentPlane(
diff --git a/cpp/open3d/geometry/ISSKeypoints.cpp b/cpp/open3d/geometry/ISSKeypoints.cpp
index db4f633fc1a..b8d16f736a7 100644
--- a/cpp/open3d/geometry/ISSKeypoints.cpp
+++ b/cpp/open3d/geometry/ISSKeypoints.cpp
@@ -9,6 +9,9 @@
 // Copyright (c) 2020 Ignacio Vizzo, Cyrill Stachniss, University of Bonn.
 // ----------------------------------------------------------------------------
 
+#include <tbb/concurrent_vector.h>
+#include <tbb/parallel_for.h>
+
 #include <Eigen/Core>
 #include <Eigen/Eigenvalues>
 #include <algorithm>
@@ -18,9 +21,6 @@
 #include <tuple>
 #include <vector>
 
-#include <tbb/concurrent_vector.h>
-#include <tbb/parallel_for.h>
-
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/Keypoint.h"
 #include "open3d/geometry/PointCloud.h"
@@ -86,55 +86,56 @@ std::shared_ptr<PointCloud> ComputeISSKeypoints(
     }
 
     std::vector<double> third_eigen_values(points.size());
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points.size(), utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points.size(),
+                                            utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> indices;
-            std::vector<double> dist;
-            int nb_neighbors =
-                    kdtree.SearchRadius(points[i],
-                        salient_radius, indices, dist);
-            if (nb_neighbors < min_neighbors) {
-                continue;
-            }
-
-            Eigen::Matrix3d cov = utility::ComputeCovariance(points, indices);
-            if (cov.isZero()) {
-                continue;
-            }
-
-            Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> solver(cov);
-            const double& e1c = solver.eigenvalues()[2];
-            const double& e2c = solver.eigenvalues()[1];
-            const double& e3c = solver.eigenvalues()[0];
-
-            if ((e2c / e1c) < gamma_21 && e3c / e2c < gamma_32) {
-                third_eigen_values[i] = e3c;
-            }
-        }
-    });
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> indices;
+                    std::vector<double> dist;
+                    int nb_neighbors = kdtree.SearchRadius(
+                            points[i], salient_radius, indices, dist);
+                    if (nb_neighbors < min_neighbors) {
+                        continue;
+                    }
+
+                    Eigen::Matrix3d cov =
+                            utility::ComputeCovariance(points, indices);
+                    if (cov.isZero()) {
+                        continue;
+                    }
+
+                    Eigen::SelfAdjointEigenSolver<Eigen::Matrix3d> solver(cov);
+                    const double& e1c = solver.eigenvalues()[2];
+                    const double& e2c = solver.eigenvalues()[1];
+                    const double& e3c = solver.eigenvalues()[0];
+
+                    if ((e2c / e1c) < gamma_21 && e3c / e2c < gamma_32) {
+                        third_eigen_values[i] = e3c;
+                    }
+                }
+            });
 
     tbb::concurrent_vector<std::size_t> kp_indices;
     kp_indices.reserve(points.size());
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points.size(), utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points.size(),
+                                            utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            if (third_eigen_values[i] > 0.0) {
-                std::vector<int> nn_indices;
-                std::vector<double> dist;
-                int nb_neighbors = kdtree.SearchRadius(
-                        points[i], non_max_radius, nn_indices, dist);
-
-                if (nb_neighbors >= min_neighbors &&
-                    IsLocalMaxima(i, nn_indices, third_eigen_values)) {
-                    kp_indices.emplace_back(i);
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    if (third_eigen_values[i] > 0.0) {
+                        std::vector<int> nn_indices;
+                        std::vector<double> dist;
+                        int nb_neighbors = kdtree.SearchRadius(
+                                points[i], non_max_radius, nn_indices, dist);
+
+                        if (nb_neighbors >= min_neighbors &&
+                            IsLocalMaxima(i, nn_indices, third_eigen_values)) {
+                            kp_indices.emplace_back(i);
+                        }
+                    }
                 }
-            }
-        }
-    });
-
+            });
 
     utility::LogDebug("[ComputeISSKeypoints] Extracted {} keypoints",
                       kp_indices.size());
diff --git a/cpp/open3d/geometry/Image.cpp b/cpp/open3d/geometry/Image.cpp
index f8d6b265fb5..8995d05d11a 100644
--- a/cpp/open3d/geometry/Image.cpp
+++ b/cpp/open3d/geometry/Image.cpp
@@ -138,21 +138,24 @@ std::shared_ptr<Image> Image::Downsample() const {
     // Integer division always rounds towards zero
     output->Prepare(width_ / 2, height_ / 2, 1, 4);
 
-    tbb::parallel_for(tbb::blocked_range2d<int, int>(
-            0, output->height_, utility::DefaultGrainSizeTBB(),
-            0, output->width_, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range2d<int, int>& range){
-        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
-            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
-                float *p1 = PointerAt<float>(x * 2, y * 2);
-                float *p2 = PointerAt<float>(x * 2 + 1, y * 2);
-                float *p3 = PointerAt<float>(x * 2, y * 2 + 1);
-                float *p4 = PointerAt<float>(x * 2 + 1, y * 2 + 1);
-                float *p = output->PointerAt<float>(x, y);
-                *p = (*p1 + *p2 + *p3 + *p4) / 4.0f;
-            }
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range2d<int, int>(
+                    0, output->height_, utility::DefaultGrainSizeTBB(), 0,
+                    output->width_, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int> &range) {
+                for (int y = range.rows().begin(); y < range.rows().end();
+                     ++y) {
+                    for (int x = range.cols().begin(); x < range.cols().end();
+                         ++x) {
+                        float *p1 = PointerAt<float>(x * 2, y * 2);
+                        float *p2 = PointerAt<float>(x * 2 + 1, y * 2);
+                        float *p3 = PointerAt<float>(x * 2, y * 2 + 1);
+                        float *p4 = PointerAt<float>(x * 2 + 1, y * 2 + 1);
+                        float *p = output->PointerAt<float>(x, y);
+                        *p = (*p1 + *p2 + *p3 + *p4) / 4.0f;
+                    }
+                }
+            });
     return output;
 }
 
@@ -170,28 +173,33 @@ std::shared_ptr<Image> Image::FilterHorizontal(
     const int half_kernel_size = (int)(floor((double)kernel.size() / 2.0));
     // Define the grain size based on the width of the memory read
     // Want no more than 1/4 of the read to be the halo cells
-    const std::size_t multiple = (utility::DefaultGrainSizeTBB() +
-        4 * (kernel.size() - 1)) / utility::DefaultGrainSizeTBB();
-    const std::size_t grain_size = multiple * utility::DefaultGrainSizeTBB()
-        + 1 - kernel.size();
-    tbb::parallel_for(tbb::blocked_range2d<int, int>(
-            0, height_, grain_size, 0, width_, grain_size),
-            [&](const tbb::blocked_range2d<int, int>& range){
-        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
-            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
-                float *po = output->PointerAt<float>(x, y, 0);
-                double temp = 0;
-                for (int i = -half_kernel_size; i <= half_kernel_size; ++i) {
-                    int x_shift = x + i;
-                    if (x_shift < 0) x_shift = 0;
-                    if (x_shift > width_ - 1) x_shift = width_ - 1;
-                    float *pi = PointerAt<float>(x_shift, y, 0);
-                    temp += (*pi * (float)kernel[i + half_kernel_size]);
+    const std::size_t multiple =
+            (utility::DefaultGrainSizeTBB() + 4 * (kernel.size() - 1)) /
+            utility::DefaultGrainSizeTBB();
+    const std::size_t grain_size =
+            multiple * utility::DefaultGrainSizeTBB() + 1 - kernel.size();
+    tbb::parallel_for(
+            tbb::blocked_range2d<int, int>(0, height_, grain_size, 0, width_,
+                                           grain_size),
+            [&](const tbb::blocked_range2d<int, int> &range) {
+                for (int y = range.rows().begin(); y < range.rows().end();
+                     ++y) {
+                    for (int x = range.cols().begin(); x < range.cols().end();
+                         ++x) {
+                        float *po = output->PointerAt<float>(x, y, 0);
+                        double temp = 0;
+                        for (int i = -half_kernel_size; i <= half_kernel_size;
+                             ++i) {
+                            int x_shift = x + i;
+                            if (x_shift < 0) x_shift = 0;
+                            if (x_shift > width_ - 1) x_shift = width_ - 1;
+                            float *pi = PointerAt<float>(x_shift, y, 0);
+                            temp += (*pi * (float)kernel[i + half_kernel_size]);
+                        }
+                        *po = (float)temp;
+                    }
                 }
-                *po = (float)temp;
-            }
-        }
-    });
+            });
     return output;
 }
 
@@ -257,20 +265,24 @@ std::shared_ptr<Image> Image::Transpose() const {
     int bytes_per_pixel = num_of_channels_ * bytes_per_channel_;
 
     tbb::parallel_for(tbb::blocked_range2d<int, int>(
-            0, height_, utility::DefaultGrainSizeTBB(),
-            0, width_, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range2d<int, int>& range){
-        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
-            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
-                std::copy(
-                    data_.data() + y * in_bytes_per_line + x * bytes_per_pixel,
-                    data_.data() + y * in_bytes_per_line +
-                            (x + 1) * bytes_per_pixel,
-                    output->data_.data() + x * out_bytes_per_line +
-                            y * bytes_per_pixel);
-            }
-        }
-    });
+                              0, height_, utility::DefaultGrainSizeTBB(), 0,
+                              width_, utility::DefaultGrainSizeTBB()),
+                      [&](const tbb::blocked_range2d<int, int> &range) {
+                          for (int y = range.rows().begin();
+                               y < range.rows().end(); ++y) {
+                              for (int x = range.cols().begin();
+                                   x < range.cols().end(); ++x) {
+                                  std::copy(
+                                          data_.data() + y * in_bytes_per_line +
+                                                  x * bytes_per_pixel,
+                                          data_.data() + y * in_bytes_per_line +
+                                                  (x + 1) * bytes_per_pixel,
+                                          output->data_.data() +
+                                                  x * out_bytes_per_line +
+                                                  y * bytes_per_pixel);
+                              }
+                          }
+                      });
 
     return output;
 }
@@ -280,15 +292,16 @@ std::shared_ptr<Image> Image::FlipVertical() const {
     output->Prepare(width_, height_, num_of_channels_, bytes_per_channel_);
 
     int bytes_per_line = BytesPerLine();
-    tbb::parallel_for(tbb::blocked_range<int>(
-            0, height_, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<int>& range){
-        for (int y = range.begin(); y < range.end(); ++y) {
-            std::copy(data_.data() + y * bytes_per_line,
-                      data_.data() + (y + 1) * bytes_per_line,
-                      output->data_.data() + (height_ - y - 1) * bytes_per_line);
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<int>(0, height_, utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<int> &range) {
+                for (int y = range.begin(); y < range.end(); ++y) {
+                    std::copy(data_.data() + y * bytes_per_line,
+                              data_.data() + (y + 1) * bytes_per_line,
+                              output->data_.data() +
+                                      (height_ - y - 1) * bytes_per_line);
+                }
+            });
     return output;
 }
 
@@ -299,22 +312,24 @@ std::shared_ptr<Image> Image::FlipHorizontal() const {
     int bytes_per_line = BytesPerLine();
     int bytes_per_pixel = num_of_channels_ * bytes_per_channel_;
 
-    tbb::parallel_for(tbb::blocked_range2d<int, int>(
-            0, height_, utility::DefaultGrainSizeTBB(),
-            0, width_, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range2d<int, int>& range){
-        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
-            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
-                std::copy(
-                    data_.data() + y * bytes_per_line + x * bytes_per_pixel,
-                    data_.data() + y * bytes_per_line +
-                        (x + 1) * bytes_per_pixel,
-                    output->data_.data() + y * bytes_per_line +
-                        (width_ - x - 1) * bytes_per_pixel);
-            }
-        }
-    });
-
+    tbb::parallel_for(
+            tbb::blocked_range2d<int, int>(
+                    0, height_, utility::DefaultGrainSizeTBB(), 0, width_,
+                    utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int> &range) {
+                for (int y = range.rows().begin(); y < range.rows().end();
+                     ++y) {
+                    for (int x = range.cols().begin(); x < range.cols().end();
+                         ++x) {
+                        std::copy(data_.data() + y * bytes_per_line +
+                                          x * bytes_per_pixel,
+                                  data_.data() + y * bytes_per_line +
+                                          (x + 1) * bytes_per_pixel,
+                                  output->data_.data() + y * bytes_per_line +
+                                          (width_ - x - 1) * bytes_per_pixel);
+                    }
+                }
+            });
 
     return output;
 }
@@ -326,28 +341,37 @@ std::shared_ptr<Image> Image::Dilate(int half_kernel_size /* = 1 */) const {
     }
     output->Prepare(width_, height_, 1, 1);
 
-    tbb::parallel_for(tbb::blocked_range2d<int, int>(
-            0, height_, utility::DefaultGrainSizeTBB() - (2 * half_kernel_size),
-            0, width_, utility::DefaultGrainSizeTBB() - (2 * half_kernel_size)),
-            [&](const tbb::blocked_range2d<int, int>& range){
-        for (int y = range.rows().begin(); y < range.rows().end(); ++y) {
-            for (int x = range.cols().begin(); x < range.cols().end(); ++x) {
-                for (int yy = -half_kernel_size; yy <= half_kernel_size; ++yy) {
-                    for (int xx = -half_kernel_size; xx <= half_kernel_size; ++xx) {
-                        unsigned char *pi;
-                        if (TestImageBoundary(x + xx, y + yy)) {
-                            pi = PointerAt<unsigned char>(x + xx, y + yy);
-                            if (*pi == 255) {
-                                *output->PointerAt<unsigned char>(x, y, 0) = 255;
-                                xx = half_kernel_size;
-                                yy = half_kernel_size;
+    tbb::parallel_for(
+            tbb::blocked_range2d<int, int>(
+                    0, height_,
+                    utility::DefaultGrainSizeTBB() - (2 * half_kernel_size), 0,
+                    width_,
+                    utility::DefaultGrainSizeTBB() - (2 * half_kernel_size)),
+            [&](const tbb::blocked_range2d<int, int> &range) {
+                for (int y = range.rows().begin(); y < range.rows().end();
+                     ++y) {
+                    for (int x = range.cols().begin(); x < range.cols().end();
+                         ++x) {
+                        for (int yy = -half_kernel_size; yy <= half_kernel_size;
+                             ++yy) {
+                            for (int xx = -half_kernel_size;
+                                 xx <= half_kernel_size; ++xx) {
+                                unsigned char *pi;
+                                if (TestImageBoundary(x + xx, y + yy)) {
+                                    pi = PointerAt<unsigned char>(x + xx,
+                                                                  y + yy);
+                                    if (*pi == 255) {
+                                        *output->PointerAt<unsigned char>(
+                                                x, y, 0) = 255;
+                                        xx = half_kernel_size;
+                                        yy = half_kernel_size;
+                                    }
+                                }
                             }
                         }
                     }
                 }
-            }
-        }
-    });
+            });
 
     return output;
 }
@@ -365,23 +389,28 @@ std::shared_ptr<Image> Image::CreateDepthBoundaryMask(
     auto mask = std::make_shared<Image>();
     mask->Prepare(width, height, 1, 1);
 
-    tbb::parallel_for(tbb::blocked_range2d<int, int>(
-            0, height, utility::DefaultGrainSizeTBB(),
-            0, width, utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range2d<int, int>& range){
-        for (int v = range.rows().begin(); v < range.rows().end(); v++) {
-            for (int u = range.cols().begin(); u < range.cols().end(); u++) {
-                double dx = *depth_image_gradient_dx->PointerAt<float>(u, v);
-                double dy = *depth_image_gradient_dy->PointerAt<float>(u, v);
-                double mag = sqrt(dx * dx + dy * dy);
-                if (mag > depth_threshold_for_discontinuity_check) {
-                    *mask->PointerAt<unsigned char>(u, v) = 255;
-                } else {
-                    *mask->PointerAt<unsigned char>(u, v) = 0;
+    tbb::parallel_for(
+            tbb::blocked_range2d<int, int>(
+                    0, height, utility::DefaultGrainSizeTBB(), 0, width,
+                    utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range2d<int, int> &range) {
+                for (int v = range.rows().begin(); v < range.rows().end();
+                     v++) {
+                    for (int u = range.cols().begin(); u < range.cols().end();
+                         u++) {
+                        double dx = *depth_image_gradient_dx->PointerAt<float>(
+                                u, v);
+                        double dy = *depth_image_gradient_dy->PointerAt<float>(
+                                u, v);
+                        double mag = sqrt(dx * dx + dy * dy);
+                        if (mag > depth_threshold_for_discontinuity_check) {
+                            *mask->PointerAt<unsigned char>(u, v) = 255;
+                        } else {
+                            *mask->PointerAt<unsigned char>(u, v) = 0;
+                        }
+                    }
                 }
-            }
-        }
-    });
+            });
 
     if (half_dilation_kernel_size_for_discontinuity_map >= 1) {
         auto mask_dilated =
diff --git a/cpp/open3d/geometry/PointCloud.cpp b/cpp/open3d/geometry/PointCloud.cpp
index 0f7f4826de0..fb28f3a4e5e 100644
--- a/cpp/open3d/geometry/PointCloud.cpp
+++ b/cpp/open3d/geometry/PointCloud.cpp
@@ -7,13 +7,13 @@
 
 #include "open3d/geometry/PointCloud.h"
 
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
 #include <Eigen/Dense>
 #include <algorithm>
 #include <numeric>
 
-#include <tbb/parallel_for.h>
-#include <tbb/parallel_reduce.h>
-
 #include "open3d/geometry/BoundingVolume.h"
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/Qhull.h"
@@ -130,22 +130,24 @@ std::vector<double> PointCloud::ComputePointCloudDistance(
     std::vector<double> distances(points_.size());
     KDTreeFlann kdtree;
     kdtree.SetGeometry(target);
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> indices(1);
-            std::vector<double> dists(1);
-            if (kdtree.SearchKNN(points_[i], 1, indices, dists) == 0) {
-                utility::LogDebug(
-                        "[ComputePointCloudToPointCloudDistance] Found a point "
-                        "without neighbors.");
-                distances[i] = 0.0;
-            } else {
-                distances[i] = std::sqrt(dists[0]);
-            }
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> indices(1);
+                    std::vector<double> dists(1);
+                    if (kdtree.SearchKNN(points_[i], 1, indices, dists) == 0) {
+                        utility::LogDebug(
+                                "[ComputePointCloudToPointCloudDistance] Found "
+                                "a point "
+                                "without neighbors.");
+                        distances[i] = 0.0;
+                    } else {
+                        distances[i] = std::sqrt(dists[0]);
+                    }
+                }
+            });
     return distances;
 }
 
@@ -582,18 +584,19 @@ PointCloud::RemoveRadiusOutliers(size_t nb_points,
     utility::TBBProgressBar progress_bar(
             points_.size(), "Remove radius outliers: ", print_progress);
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> tmp_indices;
-            std::vector<double> dist;
-            size_t nb_neighbors = kdtree.SearchRadius(points_[i],
-                search_radius, tmp_indices, dist);
-            mask[i] = (nb_neighbors > nb_points);
-            ++progress_bar;
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> tmp_indices;
+                    std::vector<double> dist;
+                    size_t nb_neighbors = kdtree.SearchRadius(
+                            points_[i], search_radius, tmp_indices, dist);
+                    mask[i] = (nb_neighbors > nb_points);
+                    ++progress_bar;
+                }
+            });
 
     std::vector<size_t> indices;
     for (size_t i = 0; i < mask.size(); i++) {
@@ -627,26 +630,29 @@ PointCloud::RemoveStatisticalOutliers(size_t nb_neighbors,
 
     size_t valid_distances = tbb::parallel_reduce(
             tbb::blocked_range<std::size_t>(0, points_.size(),
-            utility::DefaultGrainSizeTBB()), std::size_t{0},
-            [&](const tbb::blocked_range<std::size_t>& range,
-                    std::size_t valid_so_far){
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> tmp_indices;
-            std::vector<double> dist;
-            kdtree.SearchKNN(points_[i], int(nb_neighbors), tmp_indices, dist);
-            double mean = -1.0;
-            if (dist.size() > 0u) {
-                ++valid_so_far;
-                std::for_each(dist.begin(), dist.end(),
-                              [](double &d) { d = std::sqrt(d); });
-                mean = std::accumulate(dist.begin(), dist.end(),
-                                       0.0) / dist.size();
-            }
-            avg_distances[i] = mean;
-            ++progress_bar;
-        }
-        return valid_so_far;
-    }, std::plus<std::size_t>{});
+                                            utility::DefaultGrainSizeTBB()),
+            std::size_t{0},
+            [&](const tbb::blocked_range<std::size_t> &range,
+                std::size_t valid_so_far) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> tmp_indices;
+                    std::vector<double> dist;
+                    kdtree.SearchKNN(points_[i], int(nb_neighbors), tmp_indices,
+                                     dist);
+                    double mean = -1.0;
+                    if (dist.size() > 0u) {
+                        ++valid_so_far;
+                        std::for_each(dist.begin(), dist.end(),
+                                      [](double &d) { d = std::sqrt(d); });
+                        mean = std::accumulate(dist.begin(), dist.end(), 0.0) /
+                               dist.size();
+                    }
+                    avg_distances[i] = mean;
+                    ++progress_bar;
+                }
+                return valid_so_far;
+            },
+            std::plus<std::size_t>{});
 
     if (valid_distances == 0) {
         return std::make_tuple(std::make_shared<PointCloud>(),
@@ -663,8 +669,8 @@ PointCloud::RemoveStatisticalOutliers(size_t nb_neighbors,
                 return x > 0 ? (x - cloud_mean) * (y - cloud_mean) : 0;
             });
     // Bessel's correction
-    double std_dev = std::sqrt(sq_sum /
-        static_cast<double>(valid_distances - 1));
+    double std_dev =
+            std::sqrt(sq_sum / static_cast<double>(valid_distances - 1));
     double distance_threshold = cloud_mean + std_ratio * std_dev;
     for (size_t i = 0; i < avg_distances.size(); i++) {
         if (avg_distances[i] > 0 && avg_distances[i] < distance_threshold) {
@@ -683,23 +689,28 @@ std::vector<Eigen::Matrix3d> PointCloud::EstimatePerPointCovariances(
 
     KDTreeFlann kdtree;
     kdtree.SetGeometry(input);
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(0, points.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> indices;
-            std::vector<double> distance2;
-            if (kdtree.Search(points[i], search_param, indices, distance2) >= 3) {
-                auto covariance = utility::ComputeCovariance(points, indices);
-                if (input.HasCovariances() && covariance.isIdentity(1e-4)) {
-                    covariances[i] = input.covariances_[i];
-                } else {
-                    covariances[i] = covariance;
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> indices;
+                    std::vector<double> distance2;
+                    if (kdtree.Search(points[i], search_param, indices,
+                                      distance2) >= 3) {
+                        auto covariance =
+                                utility::ComputeCovariance(points, indices);
+                        if (input.HasCovariances() &&
+                            covariance.isIdentity(1e-4)) {
+                            covariances[i] = input.covariances_[i];
+                        } else {
+                            covariances[i] = covariance;
+                        }
+                    } else {
+                        covariances[i] = Eigen::Matrix3d::Identity();
+                    }
                 }
-            } else {
-                covariances[i] = Eigen::Matrix3d::Identity();
-            }
-        }
-    });
+            });
     return covariances;
 }
 void PointCloud::EstimateCovariances(
@@ -724,14 +735,15 @@ std::vector<double> PointCloud::ComputeMahalanobisDistance() const {
     Eigen::Matrix3d covariance;
     std::tie(mean, covariance) = ComputeMeanAndCovariance();
     Eigen::Matrix3d cov_inv = covariance.inverse();
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            Eigen::Vector3d p = points_[i] - mean;
-            mahalanobis[i] = std::sqrt(p.transpose() * cov_inv * p);
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    Eigen::Vector3d p = points_[i] - mean;
+                    mahalanobis[i] = std::sqrt(p.transpose() * cov_inv * p);
+                }
+            });
     return mahalanobis;
 }
 
@@ -742,22 +754,23 @@ std::vector<double> PointCloud::ComputeNearestNeighborDistance() const {
 
     std::vector<double> nn_dis(points_.size());
     KDTreeFlann kdtree(*this);
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> indices(2);
-            std::vector<double> dists(2);
-            if (kdtree.SearchKNN(points_[i], 2, indices, dists) <= 1) {
-                utility::LogDebug(
-                        "[ComputePointCloudNearestNeighborDistance] "
-                        "Found a point without neighbors.");
-                nn_dis[i] = 0.0;
-            } else {
-                nn_dis[i] = std::sqrt(dists[1]);
-            }
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> indices(2);
+                    std::vector<double> dists(2);
+                    if (kdtree.SearchKNN(points_[i], 2, indices, dists) <= 1) {
+                        utility::LogDebug(
+                                "[ComputePointCloudNearestNeighborDistance] "
+                                "Found a point without neighbors.");
+                        nn_dis[i] = 0.0;
+                    } else {
+                        nn_dis[i] = std::sqrt(dists[1]);
+                    }
+                }
+            });
     return nn_dis;
 }
 
diff --git a/cpp/open3d/geometry/PointCloudCluster.cpp b/cpp/open3d/geometry/PointCloudCluster.cpp
index acc42a89350..334e54476b3 100644
--- a/cpp/open3d/geometry/PointCloudCluster.cpp
+++ b/cpp/open3d/geometry/PointCloudCluster.cpp
@@ -5,12 +5,12 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+#include <tbb/spin_mutex.h>
+
 #include <Eigen/Dense>
 #include <unordered_set>
 
-#include <tbb/spin_mutex.h>
-#include <tbb/parallel_for.h>
-
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/utility/Logging.h"
@@ -27,22 +27,23 @@ std::vector<int> PointCloud::ClusterDBSCAN(double eps,
 
     // Precompute all neighbors.
     utility::LogDebug("Precompute neighbors.");
-    utility::ProgressBar progress_bar(points_.size(),
-            "Precompute neighbors.", print_progress);
+    utility::ProgressBar progress_bar(points_.size(), "Precompute neighbors.",
+                                      print_progress);
     std::vector<std::vector<int>> nbs(points_.size());
 
     tbb::spin_mutex mtx;
     tbb::profiling::set_name(mtx, "ClusterDBSCAN");
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<double> dists2;
-            kdtree.SearchRadius(points_[i], eps, nbs[i], dists2);
-        }
-        tbb::spin_mutex::scoped_lock lock(mtx);
-        progress_bar += (range.end() - range.begin());
-    });
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<double> dists2;
+                    kdtree.SearchRadius(points_[i], eps, nbs[i], dists2);
+                }
+                tbb::spin_mutex::scoped_lock lock(mtx);
+                progress_bar += (range.end() - range.begin());
+            });
 
     utility::LogDebug("Done Precompute neighbors.");
 
diff --git a/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp b/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp
index ff435589937..88c1b602dc2 100644
--- a/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp
+++ b/cpp/open3d/geometry/PointCloudPlanarPatchDetection.cpp
@@ -5,6 +5,8 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+
 #include <Eigen/Dense>
 #include <algorithm>
 #include <cstdint>
@@ -14,8 +16,6 @@
 #include <unordered_map>
 #include <unordered_set>
 
-#include <tbb/parallel_for.h>
-
 #include "libqhullcpp/PointCoordinates.h"
 #include "libqhullcpp/Qhull.h"
 #include "libqhullcpp/QhullVertex.h"
@@ -997,15 +997,17 @@ PointCloud::DetectPlanarPatches(
     std::vector<std::vector<int>> neighbors;
     neighbors.resize(points_.size());
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, points_.size(), utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            std::vector<int> indices;
-            std::vector<double> distance2;
-            kdtree.Search(points_[i], search_param, neighbors[i], distance2);
-        }
-    });
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    std::vector<int> indices;
+                    std::vector<double> distance2;
+                    kdtree.Search(points_[i], search_param, neighbors[i],
+                                  distance2);
+                }
+            });
 
     const double normal_similarity_rad = normal_similarity_deg * M_PI / 180.0;
     const double coplanarity_rad = coplanarity_deg * M_PI / 180.0;
diff --git a/cpp/open3d/geometry/PointCloudSegmentation.cpp b/cpp/open3d/geometry/PointCloudSegmentation.cpp
index 48ae5b22bd3..74c63236dd5 100644
--- a/cpp/open3d/geometry/PointCloudSegmentation.cpp
+++ b/cpp/open3d/geometry/PointCloudSegmentation.cpp
@@ -5,6 +5,9 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+#include <tbb/spin_rw_mutex.h>
+
 #include <Eigen/Dense>
 #include <algorithm>
 #include <iostream>
@@ -12,9 +15,6 @@
 #include <numeric>
 #include <unordered_set>
 
-#include <tbb/spin_rw_mutex.h>
-#include <tbb/parallel_for.h>
-
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/geometry/TriangleMesh.h"
 #include "open3d/utility/Logging.h"
@@ -63,9 +63,9 @@ class RANSACResult {
     RANSACResult() : fitness_(0), inlier_rmse_(0) {}
     ~RANSACResult() {}
 
-    bool IsBetterRANSACThan(const RANSACResult& other) {
-        return fitness_ > other.fitness_ ||
-         (fitness_ == other.fitness_ && inlier_rmse_ < other.inlier_rmse_);
+    bool IsBetterRANSACThan(const RANSACResult &other) {
+        return fitness_ > other.fitness_ || (fitness_ == other.fitness_ &&
+                                             inlier_rmse_ < other.inlier_rmse_);
     }
 
 public:
@@ -190,57 +190,62 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
     std::atomic<std::size_t> iteration_count = 0;
 
     tbb::spin_rw_mutex mtx;
-    tbb::parallel_for(tbb::blocked_range<int>(0, num_iterations, 1),
-            [&](const tbb::blocked_range<int>& range) {
-        for (int i = range.begin(); i < range.end(); ++i) {
-            if (iteration_count > break_iteration) {
-                continue;
-            }
+    tbb::parallel_for(
+            tbb::blocked_range<int>(0, num_iterations, 1),
+            [&](const tbb::blocked_range<int> &range) {
+                for (int i = range.begin(); i < range.end(); ++i) {
+                    if (iteration_count > break_iteration) {
+                        continue;
+                    }
 
-            const std::vector<size_t> sampled_indices = sampler(ransac_n);
-            std::vector<size_t> inliers = sampled_indices;
-
-            // Fit model to num_model_parameters randomly selected points
-            // among the inliers.
-            Eigen::Vector4d plane_model;
-            if (ransac_n == 3) {
-                plane_model = TriangleMesh::ComputeTrianglePlane(
-                        points_[inliers[0]], points_[inliers[1]],
-                        points_[inliers[2]]);
-            } else {
-                plane_model = GetPlaneFromPoints(points_, inliers);
-            }
+                    const std::vector<size_t> sampled_indices =
+                            sampler(ransac_n);
+                    std::vector<size_t> inliers = sampled_indices;
+
+                    // Fit model to num_model_parameters randomly selected
+                    // points among the inliers.
+                    Eigen::Vector4d plane_model;
+                    if (ransac_n == 3) {
+                        plane_model = TriangleMesh::ComputeTrianglePlane(
+                                points_[inliers[0]], points_[inliers[1]],
+                                points_[inliers[2]]);
+                    } else {
+                        plane_model = GetPlaneFromPoints(points_, inliers);
+                    }
 
-            if (plane_model.isZero(0)) {
-                continue;
-            }
+                    if (plane_model.isZero(0)) {
+                        continue;
+                    }
 
-            inliers.clear();
-            auto this_result = EvaluateRANSACBasedOnDistance(
-                    points_, plane_model, inliers, distance_threshold);
-            {
-                tbb::spin_rw_mutex::scoped_lock lock(mtx);
-                if (this_result.IsBetterRANSACThan(result)) {
-                    lock.upgrade_to_writer();
-                    // Have to recheck after upgrading lock because
-                    // upgrading requires releasing the lock
-                    if (this_result.IsBetterRANSACThan(result)) {
-                        result = this_result;
-                        best_plane_model = plane_model;
-                        if (result.fitness_ < 1.0) {
-                            break_iteration = std::min(log(1 - probability) /
-                                    log(1 - pow(result.fitness_, ransac_n)),
-                                    (double)num_iterations);
-                        } else {
-                            // Set break_iteration to 0 so we break the loop.
-                            break_iteration = 0;
+                    inliers.clear();
+                    auto this_result = EvaluateRANSACBasedOnDistance(
+                            points_, plane_model, inliers, distance_threshold);
+                    {
+                        tbb::spin_rw_mutex::scoped_lock lock(mtx);
+                        if (this_result.IsBetterRANSACThan(result)) {
+                            lock.upgrade_to_writer();
+                            // Have to recheck after upgrading lock because
+                            // upgrading requires releasing the lock
+                            if (this_result.IsBetterRANSACThan(result)) {
+                                result = this_result;
+                                best_plane_model = plane_model;
+                                if (result.fitness_ < 1.0) {
+                                    break_iteration = std::min(
+                                            log(1 - probability) /
+                                                    log(1 - pow(result.fitness_,
+                                                                ransac_n)),
+                                            (double)num_iterations);
+                                } else {
+                                    // Set break_iteration to 0 so we break the
+                                    // loop.
+                                    break_iteration = 0;
+                                }
+                            }
                         }
                     }
+                    iteration_count++;
                 }
-            }
-            iteration_count++;
-        }
-    });
+            });
 
     // Find the final inliers using best_plane_model.
     std::vector<size_t> final_inliers;
diff --git a/cpp/open3d/geometry/TriangleMesh.cpp b/cpp/open3d/geometry/TriangleMesh.cpp
index 154c1bb6957..96b87028695 100644
--- a/cpp/open3d/geometry/TriangleMesh.cpp
+++ b/cpp/open3d/geometry/TriangleMesh.cpp
@@ -7,11 +7,14 @@
 
 #include "open3d/geometry/TriangleMesh.h"
 
+#include <tbb/concurrent_vector.h>
+#include <tbb/parallel_for.h>
+#include <tbb/parallel_reduce.h>
+
 #include <Eigen/Dense>
 #include <numeric>
 #include <queue>
 #include <tuple>
-#include <tbb/tbb.h>
 
 #include "open3d/geometry/BoundingVolume.h"
 #include "open3d/geometry/IntersectionTest.h"
@@ -929,14 +932,16 @@ TriangleMesh &TriangleMesh::MergeCloseVertices(double eps) {
     utility::LogDebug("Precompute Neighbours");
     std::vector<std::vector<int>> nbs(vertices_.size());
     utility::LogDebug("Done Precompute Neighbours");
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, vertices_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range){
-        std::vector<double> dists2;
-        for (std::size_t idx = range.begin(); idx < range.end(); ++idx) {
-            kdtree.SearchRadius(vertices_[idx], eps, nbs[idx], dists2);
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, vertices_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                std::vector<double> dists2;
+                for (std::size_t idx = range.begin(); idx < range.end();
+                     ++idx) {
+                    kdtree.SearchRadius(vertices_[idx], eps, nbs[idx], dists2);
+                }
+            });
 
     bool has_vertex_normals = HasVertexNormals();
     bool has_vertex_colors = HasVertexColors();
@@ -1212,22 +1217,26 @@ double TriangleMesh::GetVolume() const {
                 "computed.");
     }
 
-    return std::abs(tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
-            0, triangles_.size(), utility::DefaultGrainSizeTBB()), 0.0,
-            [&](const tbb::blocked_range<std::size_t>& range, double volume){
-        for (std::size_t tidx = range.begin(); tidx < range.end(); ++tidx) {
-            // Computes the signed volume of the tetrahedron defined by
-            // the three triangle vertices and the origin. The sign is determined by
-            // checking if the origin is at the same side as the normal with respect to
-            // the triangle.
-            const Eigen::Vector3i &triangle = triangles_[tidx];
-            const Eigen::Vector3d &vertex0 = vertices_[triangle(0)];
-            const Eigen::Vector3d &vertex1 = vertices_[triangle(1)];
-            const Eigen::Vector3d &vertex2 = vertices_[triangle(2)];
-            volume += vertex0.dot(vertex1.cross(vertex2)) / 6.0;
-        }
-        return volume;
-    }, std::plus<double>{}));
+    return std::abs(tbb::parallel_reduce(
+            tbb::blocked_range<std::size_t>(0, triangles_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            0.0,
+            [&](const tbb::blocked_range<std::size_t> &range, double volume) {
+                for (std::size_t tidx = range.begin(); tidx < range.end();
+                     ++tidx) {
+                    // Computes the signed volume of the tetrahedron defined by
+                    // the three triangle vertices and the origin. The sign is
+                    // determined by checking if the origin is at the same side
+                    // as the normal with respect to the triangle.
+                    const Eigen::Vector3i &triangle = triangles_[tidx];
+                    const Eigen::Vector3d &vertex0 = vertices_[triangle(0)];
+                    const Eigen::Vector3d &vertex1 = vertices_[triangle(1)];
+                    const Eigen::Vector3d &vertex2 = vertices_[triangle(2)];
+                    volume += vertex0.dot(vertex1.cross(vertex2)) / 6.0;
+                }
+                return volume;
+            },
+            std::plus<double>{}));
 }
 
 Eigen::Vector4d TriangleMesh::ComputeTrianglePlane(const Eigen::Vector3d &p0,
@@ -1359,48 +1368,53 @@ bool TriangleMesh::IsVertexManifold() const {
 std::vector<Eigen::Vector2i> TriangleMesh::GetSelfIntersectingTriangles()
         const {
     tbb::concurrent_vector<Eigen::Vector2i> self_intersecting_triangles;
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, triangles_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range){
-        for (std::size_t tidx0 = range.begin(); tidx0 < range.end(); ++tidx0) {
-            const Eigen::Vector3i &tria_p = triangles_[tidx0];
-            const Eigen::Vector3d &p0 = vertices_[tria_p(0)];
-            const Eigen::Vector3d &p1 = vertices_[tria_p(1)];
-            const Eigen::Vector3d &p2 = vertices_[tria_p(2)];
-
-            const Eigen::Vector3d bb_min1 =
-                    p0.array().min(p1.array().min(p2.array()));
-            const Eigen::Vector3d bb_max1 =
-                    p0.array().max(p1.array().max(p2.array()));
-
-            for (size_t tidx1 = tidx0 + 1; tidx1 < triangles_.size(); ++tidx1) {
-                const Eigen::Vector3i &tria_q = triangles_[tidx1];
-                // check if neighbour triangle
-                if (tria_p(0) == tria_q(0) || tria_p(0) == tria_q(1) ||
-                    tria_p(0) == tria_q(2) || tria_p(1) == tria_q(0) ||
-                    tria_p(1) == tria_q(1) || tria_p(1) == tria_q(2) ||
-                    tria_p(2) == tria_q(0) || tria_p(2) == tria_q(1) ||
-                    tria_p(2) == tria_q(2)) {
-                    continue;
-                }
-
-                // check for intersection
-                const Eigen::Vector3d &q0 = vertices_[tria_q(0)];
-                const Eigen::Vector3d &q1 = vertices_[tria_q(1)];
-                const Eigen::Vector3d &q2 = vertices_[tria_q(2)];
-
-                const Eigen::Vector3d bb_min2 =
-                        q0.array().min(q1.array().min(q2.array()));
-                const Eigen::Vector3d bb_max2 =
-                        q0.array().max(q1.array().max(q2.array()));
-                if (IntersectionTest::AABBAABB(bb_min1, bb_max1, bb_min2, bb_max2) &&
-                    IntersectionTest::TriangleTriangle3d(p0, p1, p2, q0, q1, q2)) {
-                    self_intersecting_triangles.push_back(
-                            Eigen::Vector2i(tidx0, tidx1));
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, triangles_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t tidx0 = range.begin(); tidx0 < range.end();
+                     ++tidx0) {
+                    const Eigen::Vector3i &tria_p = triangles_[tidx0];
+                    const Eigen::Vector3d &p0 = vertices_[tria_p(0)];
+                    const Eigen::Vector3d &p1 = vertices_[tria_p(1)];
+                    const Eigen::Vector3d &p2 = vertices_[tria_p(2)];
+
+                    const Eigen::Vector3d bb_min1 =
+                            p0.array().min(p1.array().min(p2.array()));
+                    const Eigen::Vector3d bb_max1 =
+                            p0.array().max(p1.array().max(p2.array()));
+
+                    for (size_t tidx1 = tidx0 + 1; tidx1 < triangles_.size();
+                         ++tidx1) {
+                        const Eigen::Vector3i &tria_q = triangles_[tidx1];
+                        // check if neighbour triangle
+                        if (tria_p(0) == tria_q(0) || tria_p(0) == tria_q(1) ||
+                            tria_p(0) == tria_q(2) || tria_p(1) == tria_q(0) ||
+                            tria_p(1) == tria_q(1) || tria_p(1) == tria_q(2) ||
+                            tria_p(2) == tria_q(0) || tria_p(2) == tria_q(1) ||
+                            tria_p(2) == tria_q(2)) {
+                            continue;
+                        }
+
+                        // check for intersection
+                        const Eigen::Vector3d &q0 = vertices_[tria_q(0)];
+                        const Eigen::Vector3d &q1 = vertices_[tria_q(1)];
+                        const Eigen::Vector3d &q2 = vertices_[tria_q(2)];
+
+                        const Eigen::Vector3d bb_min2 =
+                                q0.array().min(q1.array().min(q2.array()));
+                        const Eigen::Vector3d bb_max2 =
+                                q0.array().max(q1.array().max(q2.array()));
+                        if (IntersectionTest::AABBAABB(bb_min1, bb_max1,
+                                                       bb_min2, bb_max2) &&
+                            IntersectionTest::TriangleTriangle3d(p0, p1, p2, q0,
+                                                                 q1, q2)) {
+                            self_intersecting_triangles.push_back(
+                                    Eigen::Vector2i(tidx0, tidx1));
+                        }
+                    }
                 }
-            }
-        }
-    });
+            });
     return {self_intersecting_triangles.begin(),
             self_intersecting_triangles.end()};
 }
@@ -1445,25 +1459,27 @@ TriangleMesh::ClusterConnectedTriangles() const {
     utility::LogDebug("[ClusterConnectedTriangles] Compute triangle adjacency");
     auto edges_to_triangles = GetEdgeToTrianglesMap();
     std::vector<std::unordered_set<int>> adjacency_list(triangles_.size());
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, triangles_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range){
-        for (std::size_t tidx = range.begin(); tidx < range.end(); ++tidx) {
-            const auto &triangle = triangles_[tidx];
-            for (auto tnb :
-                 edges_to_triangles[GetOrderedEdge(triangle(0), triangle(1))]) {
-                adjacency_list[tidx].insert(tnb);
-            }
-            for (auto tnb :
-                 edges_to_triangles[GetOrderedEdge(triangle(0), triangle(2))]) {
-                adjacency_list[tidx].insert(tnb);
-            }
-            for (auto tnb :
-                 edges_to_triangles[GetOrderedEdge(triangle(1), triangle(2))]) {
-                adjacency_list[tidx].insert(tnb);
-            }
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, triangles_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t tidx = range.begin(); tidx < range.end();
+                     ++tidx) {
+                    const auto &triangle = triangles_[tidx];
+                    for (auto tnb : edges_to_triangles[GetOrderedEdge(
+                                 triangle(0), triangle(1))]) {
+                        adjacency_list[tidx].insert(tnb);
+                    }
+                    for (auto tnb : edges_to_triangles[GetOrderedEdge(
+                                 triangle(0), triangle(2))]) {
+                        adjacency_list[tidx].insert(tnb);
+                    }
+                    for (auto tnb : edges_to_triangles[GetOrderedEdge(
+                                 triangle(1), triangle(2))]) {
+                        adjacency_list[tidx].insert(tnb);
+                    }
+                }
+            });
     utility::LogDebug(
             "[ClusterConnectedTriangles] Done computing triangle adjacency");
 
diff --git a/cpp/open3d/geometry/TriangleMeshDeformation.cpp b/cpp/open3d/geometry/TriangleMeshDeformation.cpp
index 4541c0fd7c4..460fc2614a3 100644
--- a/cpp/open3d/geometry/TriangleMeshDeformation.cpp
+++ b/cpp/open3d/geometry/TriangleMeshDeformation.cpp
@@ -5,12 +5,12 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
+#include <tbb/parallel_for.h>
+
 #include <Eigen/Dense>
 #include <Eigen/Sparse>
 #include <algorithm>
 
-#include <tbb/tbb.h>
-
 #include "open3d/geometry/TriangleMesh.h"
 #include "open3d/utility/Logging.h"
 #include "open3d/utility/Parallel.h"
@@ -19,8 +19,8 @@ namespace open3d {
 namespace geometry {
 
 std::shared_ptr<TriangleMesh> TriangleMesh::DeformAsRigidAsPossible(
-        const std::vector<int> &constraint_vertex_indices,
-        const std::vector<Eigen::Vector3d> &constraint_vertex_positions,
+        const std::vector<int>& constraint_vertex_indices,
+        const std::vector<Eigen::Vector3d>& constraint_vertex_positions,
         size_t max_iter,
         DeformAsRigidAsPossibleEnergy energy_model,
         double smoothed_alpha) const {
@@ -95,77 +95,86 @@ std::shared_ptr<TriangleMesh> TriangleMesh::DeformAsRigidAsPossible(
             std::swap(Rs, Rs_old);
         }
 
-        tbb::parallel_for(tbb::blocked_range<int>(
-                0, vertices_.size(), utility::DefaultGrainSizeTBB()),
-                [&](const tbb::blocked_range<int>& range){
-            for (int i = range.begin(); i < range.end(); ++i) {
-                // Update rotations
-                Eigen::Matrix3d S = Eigen::Matrix3d::Zero();
-                Eigen::Matrix3d R = Eigen::Matrix3d::Zero();
-                int n_nbs = 0;
-                for (int j : prime->adjacency_list_[i]) {
-                    Eigen::Vector3d e0 = vertices_[i] - vertices_[j];
-                    Eigen::Vector3d e1 = prime->vertices_[i] - prime->vertices_[j];
-                    double w = edge_weights[GetOrderedEdge(i, j)];
-                    S += w * (e0 * e1.transpose());
-                    if (energy_model == DeformAsRigidAsPossibleEnergy::Smoothed) {
-                        R += Rs_old[j];
+        tbb::parallel_for(
+                tbb::blocked_range<int>(0, vertices_.size(),
+                                        utility::DefaultGrainSizeTBB()),
+                [&](const tbb::blocked_range<int>& range) {
+                    for (int i = range.begin(); i < range.end(); ++i) {
+                        // Update rotations
+                        Eigen::Matrix3d S = Eigen::Matrix3d::Zero();
+                        Eigen::Matrix3d R = Eigen::Matrix3d::Zero();
+                        int n_nbs = 0;
+                        for (int j : prime->adjacency_list_[i]) {
+                            Eigen::Vector3d e0 = vertices_[i] - vertices_[j];
+                            Eigen::Vector3d e1 =
+                                    prime->vertices_[i] - prime->vertices_[j];
+                            double w = edge_weights[GetOrderedEdge(i, j)];
+                            S += w * (e0 * e1.transpose());
+                            if (energy_model ==
+                                DeformAsRigidAsPossibleEnergy::Smoothed) {
+                                R += Rs_old[j];
+                            }
+                            n_nbs++;
+                        }
+                        if (energy_model ==
+                                    DeformAsRigidAsPossibleEnergy::Smoothed &&
+                            iter > 0 && n_nbs > 0) {
+                            S = 2 * S +
+                                (4 * smoothed_alpha * surface_area / n_nbs) *
+                                        R.transpose();
+                        }
+                        Eigen::JacobiSVD<Eigen::Matrix3d> svd(
+                                S, Eigen::ComputeFullU | Eigen::ComputeFullV);
+                        Eigen::Matrix3d U = svd.matrixU();
+                        Eigen::Matrix3d V = svd.matrixV();
+                        Eigen::Vector3d D(1, 1,
+                                          (V * U.transpose()).determinant());
+                        // ensure rotation:
+                        // http://graphics.stanford.edu/~smr/ICP/comparison/eggert_comparison_mva97.pdf
+                        Rs[i] = V * D.asDiagonal() * U.transpose();
+                        if (Rs[i].determinant() <= 0) {
+                            utility::LogError(
+                                    "something went wrong with updating R");
+                        }
                     }
-                    n_nbs++;
-                }
-                if (energy_model == DeformAsRigidAsPossibleEnergy::Smoothed &&
-                    iter > 0 && n_nbs > 0) {
-                    S = 2 * S + (4 * smoothed_alpha * surface_area / n_nbs)
-                        * R.transpose();
-                }
-                Eigen::JacobiSVD<Eigen::Matrix3d> svd(
-                        S, Eigen::ComputeFullU | Eigen::ComputeFullV);
-                Eigen::Matrix3d U = svd.matrixU();
-                Eigen::Matrix3d V = svd.matrixV();
-                Eigen::Vector3d D(1, 1, (V * U.transpose()).determinant());
-                // ensure rotation:
-                // http://graphics.stanford.edu/~smr/ICP/comparison/eggert_comparison_mva97.pdf
-                Rs[i] = V * D.asDiagonal() * U.transpose();
-                if (Rs[i].determinant() <= 0) {
-                    utility::LogError("something went wrong with updating R");
-                }
-            }
-        });
-
-        tbb::parallel_for(tbb::blocked_range<int>(
-                0, vertices_.size(), utility::DefaultGrainSizeTBB()),
-                [&](const tbb::blocked_range<int>& range){
-            for (int i = range.begin(); i < range.end(); ++i) {
-                // Update Positions
-                Eigen::Vector3d bi(0, 0, 0);
-                if (constraints.count(i) > 0) {
-                    bi = constraints[i];
-                } else {
-                    for (int j : prime->adjacency_list_[i]) {
-                        double w = edge_weights[GetOrderedEdge(i, j)];
-                        bi += w / 2 *
-                              ((Rs[i] + Rs[j]) * (vertices_[i] - vertices_[j]));
+                });
+
+        tbb::parallel_for(
+                tbb::blocked_range<int>(0, vertices_.size(),
+                                        utility::DefaultGrainSizeTBB()),
+                [&](const tbb::blocked_range<int>& range) {
+                    for (int i = range.begin(); i < range.end(); ++i) {
+                        // Update Positions
+                        Eigen::Vector3d bi(0, 0, 0);
+                        if (constraints.count(i) > 0) {
+                            bi = constraints[i];
+                        } else {
+                            for (int j : prime->adjacency_list_[i]) {
+                                double w = edge_weights[GetOrderedEdge(i, j)];
+                                bi += w / 2 *
+                                      ((Rs[i] + Rs[j]) *
+                                       (vertices_[i] - vertices_[j]));
+                            }
+                        }
+                        b[0](i) = bi(0);
+                        b[1](i) = bi(1);
+                        b[2](i) = bi(2);
                     }
-                }
-                b[0](i) = bi(0);
-                b[1](i) = bi(1);
-                b[2](i) = bi(2);
-            }
-        });
-
-        tbb::parallel_for(tbb::blocked_range<int>(0, 3, 1),
-                [&](const tbb::blocked_range<int>& range){
-            for (int comp = range.begin(); comp < range.end(); ++comp) {
-                Eigen::VectorXd p_prime = solver.solve(b[comp]);
-                if (solver.info() != Eigen::Success) {
-                    utility::LogError("Cholesky solve failed");
-                }
-                for (int i = 0; i < int(vertices_.size()); ++i) {
-                    prime->vertices_[i](comp) = p_prime(i);
-                }
-            }
-        });
-
+                });
+
+        tbb::parallel_for(
+                tbb::blocked_range<int>(0, 3, 1),
+                [&](const tbb::blocked_range<int>& range) {
+                    for (int comp = range.begin(); comp < range.end(); ++comp) {
+                        Eigen::VectorXd p_prime = solver.solve(b[comp]);
+                        if (solver.info() != Eigen::Success) {
+                            utility::LogError("Cholesky solve failed");
+                        }
+                        for (int i = 0; i < int(vertices_.size()); ++i) {
+                            prime->vertices_[i](comp) = p_prime(i);
+                        }
+                    }
+                });
 
         // Compute energy and log
         double energy = 0;
diff --git a/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp b/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp
index e137fa57933..535f0295b84 100644
--- a/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp
+++ b/cpp/open3d/io/sensor/azure_kinect/AzureKinectSensor.cpp
@@ -9,10 +9,9 @@
 
 #include <k4a/k4a.h>
 #include <k4arecord/record.h>
-#include <turbojpeg.h>
-
-#include <tbb/parallel_for.h>
 #include <tbb/blocked_range2d.h>
+#include <tbb/parallel_for.h>
+#include <turbojpeg.h>
 
 #include <memory>
 
@@ -164,18 +163,21 @@ void ConvertBGRAToRGB(geometry::Image &bgra, geometry::Image &rgb) {
     }
 
     tbb::parallel_for(tbb::blocked_range2d<int>(
-            0, bgra.height_, utility::DefaultGrainSizeTBB2D(),
-            0, bgra.width_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range2d<int>& range){
-        for (int v = range.rows().begin(); v < range.rows().end(); ++v) {
-            for (int u = range.cols().begin(); u < range.cols().end(); ++u) {
-                for (int c = 0; c < 3; ++c) {
-                    *rgb.PointerAt<uint8_t>(u, v, c) =
-                            *bgra.PointerAt<uint8_t>(u, v, 2 - c);
-                }
-            }
-        }
-    });
+                              0, bgra.height_, utility::DefaultGrainSizeTBB2D(),
+                              0, bgra.width_, utility::DefaultGrainSizeTBB2D()),
+                      [&](const tbb::blocked_range2d<int> &range) {
+                          for (int v = range.rows().begin();
+                               v < range.rows().end(); ++v) {
+                              for (int u = range.cols().begin();
+                                   u < range.cols().end(); ++u) {
+                                  for (int c = 0; c < 3; ++c) {
+                                      *rgb.PointerAt<uint8_t>(u, v, c) =
+                                              *bgra.PointerAt<uint8_t>(u, v,
+                                                                       2 - c);
+                                  }
+                              }
+                          }
+                      });
 }
 
 bool AzureKinectSensor::PrintFirmware(k4a_device_t device) {
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h
index 68e31e43e9a..a8a0617ded3 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvBackpropFilter.h
@@ -7,11 +7,11 @@
 
 #pragma once
 
-#include <Eigen/Core>
-
 #include <tbb/mutex.h>
 #include <tbb/parallel_for.h>
 
+#include <Eigen/Core>
+
 namespace open3d {
 namespace ml {
 namespace impl {
diff --git a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h
index 6266a178afb..db54d4be9c8 100644
--- a/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h
+++ b/cpp/open3d/ml/impl/sparse_conv/SparseConvTransposeBackpropFilter.h
@@ -7,11 +7,11 @@
 
 #pragma once
 
-#include <Eigen/Core>
-
 #include <tbb/mutex.h>
 #include <tbb/parallel_for.h>
 
+#include <Eigen/Core>
+
 namespace open3d {
 namespace ml {
 namespace impl {
diff --git a/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp b/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp
index 10539c4a12b..91d00a6b83d 100644
--- a/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp
+++ b/cpp/open3d/pipelines/color_map/ColorMapUtils.cpp
@@ -7,9 +7,9 @@
 
 #include "open3d/pipelines/color_map/ColorMapUtils.h"
 
+#include <tbb/concurrent_vector.h>
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_for_each.h>
-#include <tbb/concurrent_vector.h>
 
 #include "open3d/camera/PinholeCameraTrajectory.h"
 #include "open3d/geometry/Image.h"
@@ -133,49 +133,55 @@ CreateVertexAndImageVisibility(
     std::vector<std::vector<int>> visibility_image_to_vertex;
     visibility_image_to_vertex.resize(n_camera);
 
-    tbb::parallel_for(tbb::blocked_range<int>(0, n_camera, 1),
+    tbb::parallel_for(
+            tbb::blocked_range<int>(0, n_camera, 1),
             [&](const tbb::blocked_range<int>& range) {
-        for (int camera_id = range.begin();
-                               camera_id < range.end(); ++camera_id) {
-            for (int vertex_id = 0; vertex_id < n_vertex; vertex_id++) {
-                Eigen::Vector3d X = mesh.vertices_[vertex_id];
-                float u, v, d;
-                std::tie(u, v, d) = Project3DPointAndGetUVDepth(
-                        X, camera_trajectory.parameters_[camera_id]);
-                int u_d = int(round(u));
-                int v_d = int(round(v));
-                // Skip if vertex in image boundary.
-                if (d < 0.0 ||
-                    !images_depth[camera_id].TestImageBoundary(u_d, v_d)) {
-                    continue;
-                }
-                // Skip if vertex's depth is too large (e.g. background).
-                float d_sensor =
-                        *images_depth[camera_id].PointerAt<float>(u_d, v_d);
-                if (d_sensor > maximum_allowable_depth) {
-                    continue;
-                }
-                // Check depth boundary mask. If a vertex is located at the boundary
-                // of an object, its color will be highly diverse from different
-                // viewing angles.
-                if (*images_mask[camera_id].PointerAt<uint8_t>(u_d, v_d) == 255) {
-                    continue;
-                }
-                // Check depth errors.
-                if (std::fabs(d - d_sensor) >=
-                    depth_threshold_for_visibility_check) {
-                    continue;
+                for (int camera_id = range.begin(); camera_id < range.end();
+                     ++camera_id) {
+                    for (int vertex_id = 0; vertex_id < n_vertex; vertex_id++) {
+                        Eigen::Vector3d X = mesh.vertices_[vertex_id];
+                        float u, v, d;
+                        std::tie(u, v, d) = Project3DPointAndGetUVDepth(
+                                X, camera_trajectory.parameters_[camera_id]);
+                        int u_d = int(round(u));
+                        int v_d = int(round(v));
+                        // Skip if vertex in image boundary.
+                        if (d < 0.0 ||
+                            !images_depth[camera_id].TestImageBoundary(u_d,
+                                                                       v_d)) {
+                            continue;
+                        }
+                        // Skip if vertex's depth is too large (e.g.
+                        // background).
+                        float d_sensor =
+                                *images_depth[camera_id].PointerAt<float>(u_d,
+                                                                          v_d);
+                        if (d_sensor > maximum_allowable_depth) {
+                            continue;
+                        }
+                        // Check depth boundary mask. If a vertex is located at
+                        // the boundary of an object, its color will be highly
+                        // diverse from different viewing angles.
+                        if (*images_mask[camera_id].PointerAt<uint8_t>(
+                                    u_d, v_d) == 255) {
+                            continue;
+                        }
+                        // Check depth errors.
+                        if (std::fabs(d - d_sensor) >=
+                            depth_threshold_for_visibility_check) {
+                            continue;
+                        }
+                        visibility_image_to_vertex[camera_id].push_back(
+                                vertex_id);
+                    }
                 }
-                visibility_image_to_vertex[camera_id].push_back(vertex_id);
-            }
-        }
-    });
+            });
 
     // visibility_vertex_to_image[v]: cameras that can see vertex v.
     std::vector<std::vector<int>> visibility_vertex_to_image;
     visibility_vertex_to_image.resize(n_vertex);
     for (int i = 0; i < n_camera; ++i) {
-        for (int vidx: visibility_image_to_vertex[i]) {
+        for (int vidx : visibility_image_to_vertex[i]) {
             visibility_vertex_to_image[vidx].emplace_back(i);
         }
     }
@@ -203,39 +209,42 @@ void SetProxyIntensityForVertex(
     auto n_vertex = mesh.vertices_.size();
     proxy_intensity.resize(n_vertex);
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, n_vertex, utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, n_vertex,
+                                            utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            proxy_intensity[i] = 0.0;
-            float sum = 0.0;
-            for (size_t iter = 0; iter < visibility_vertex_to_image[i].size();
-                 iter++) {
-                int j = visibility_vertex_to_image[i][iter];
-                float gray;
-                bool valid = false;
-                if (warping_fields.has_value()) {
-                    std::tie(valid, gray) = QueryImageIntensity<float>(
-                            images_gray[j], warping_fields.value()[j],
-                            mesh.vertices_[i], camera_trajectory.parameters_[j],
-                            utility::nullopt, image_boundary_margin);
-                } else {
-                    std::tie(valid, gray) = QueryImageIntensity<float>(
-                            images_gray[j], utility::nullopt, mesh.vertices_[i],
-                            camera_trajectory.parameters_[j], utility::nullopt,
-                            image_boundary_margin);
-                }
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    proxy_intensity[i] = 0.0;
+                    float sum = 0.0;
+                    for (size_t iter = 0;
+                         iter < visibility_vertex_to_image[i].size(); iter++) {
+                        int j = visibility_vertex_to_image[i][iter];
+                        float gray;
+                        bool valid = false;
+                        if (warping_fields.has_value()) {
+                            std::tie(valid, gray) = QueryImageIntensity<float>(
+                                    images_gray[j], warping_fields.value()[j],
+                                    mesh.vertices_[i],
+                                    camera_trajectory.parameters_[j],
+                                    utility::nullopt, image_boundary_margin);
+                        } else {
+                            std::tie(valid, gray) = QueryImageIntensity<float>(
+                                    images_gray[j], utility::nullopt,
+                                    mesh.vertices_[i],
+                                    camera_trajectory.parameters_[j],
+                                    utility::nullopt, image_boundary_margin);
+                        }
 
-                if (valid) {
-                    sum += 1.0;
-                    proxy_intensity[i] += gray;
+                        if (valid) {
+                            sum += 1.0;
+                            proxy_intensity[i] += gray;
+                        }
+                    }
+                    if (sum > 0) {
+                        proxy_intensity[i] /= sum;
+                    }
                 }
-            }
-            if (sum > 0) {
-                proxy_intensity[i] /= sum;
-            }
-        }
-    });
+            });
 }
 
 void SetGeometryColorAverage(
@@ -252,70 +261,76 @@ void SetGeometryColorAverage(
     tbb::concurrent_vector<std::size_t> valid_vertices;
     tbb::concurrent_vector<std::size_t> invalid_vertices;
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, n_vertex, utility::DefaultGrainSizeTBB()),
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, n_vertex,
+                                            utility::DefaultGrainSizeTBB()),
             [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            mesh.vertex_colors_[i] = Eigen::Vector3d::Zero();
-            double sum = 0.0;
-            for (size_t iter = 0; iter < visibility_vertex_to_image[i].size();
-                 iter++) {
-                int j = visibility_vertex_to_image[i][iter];
-                uint8_t r_temp, g_temp, b_temp;
-                bool valid = false;
-                utility::optional<ImageWarpingField> optional_warping_field;
-                if (warping_fields.has_value()) {
-                    optional_warping_field = warping_fields.value()[j];
-                } else {
-                    optional_warping_field = utility::nullopt;
-                }
-                std::tie(valid, r_temp) = QueryImageIntensity<uint8_t>(
-                        images_color[j], optional_warping_field,
-                        mesh.vertices_[i],camera_trajectory.parameters_[j],
-                        0, image_boundary_margin);
-                std::tie(valid, g_temp) = QueryImageIntensity<uint8_t>(
-                        images_color[j], optional_warping_field,
-                        mesh.vertices_[i], camera_trajectory.parameters_[j],
-                        1, image_boundary_margin);
-                std::tie(valid, b_temp) = QueryImageIntensity<uint8_t>(
-                        images_color[j], optional_warping_field,
-                        mesh.vertices_[i], camera_trajectory.parameters_[j],
-                        2, image_boundary_margin);
-                float r = (float)r_temp / 255.0f;
-                float g = (float)g_temp / 255.0f;
-                float b = (float)b_temp / 255.0f;
-                if (valid) {
-                    mesh.vertex_colors_[i] += Eigen::Vector3d(r, g, b);
-                    sum += 1.0;
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    mesh.vertex_colors_[i] = Eigen::Vector3d::Zero();
+                    double sum = 0.0;
+                    for (size_t iter = 0;
+                         iter < visibility_vertex_to_image[i].size(); iter++) {
+                        int j = visibility_vertex_to_image[i][iter];
+                        uint8_t r_temp, g_temp, b_temp;
+                        bool valid = false;
+                        utility::optional<ImageWarpingField>
+                                optional_warping_field;
+                        if (warping_fields.has_value()) {
+                            optional_warping_field = warping_fields.value()[j];
+                        } else {
+                            optional_warping_field = utility::nullopt;
+                        }
+                        std::tie(valid, r_temp) = QueryImageIntensity<uint8_t>(
+                                images_color[j], optional_warping_field,
+                                mesh.vertices_[i],
+                                camera_trajectory.parameters_[j], 0,
+                                image_boundary_margin);
+                        std::tie(valid, g_temp) = QueryImageIntensity<uint8_t>(
+                                images_color[j], optional_warping_field,
+                                mesh.vertices_[i],
+                                camera_trajectory.parameters_[j], 1,
+                                image_boundary_margin);
+                        std::tie(valid, b_temp) = QueryImageIntensity<uint8_t>(
+                                images_color[j], optional_warping_field,
+                                mesh.vertices_[i],
+                                camera_trajectory.parameters_[j], 2,
+                                image_boundary_margin);
+                        float r = (float)r_temp / 255.0f;
+                        float g = (float)g_temp / 255.0f;
+                        float b = (float)b_temp / 255.0f;
+                        if (valid) {
+                            mesh.vertex_colors_[i] += Eigen::Vector3d(r, g, b);
+                            sum += 1.0;
+                        }
+                    }
+                    if (sum > 0.0) {
+                        mesh.vertex_colors_[i] /= sum;
+                        valid_vertices.push_back(i);
+                    } else {
+                        invalid_vertices.push_back(i);
+                    }
                 }
-            }
-            if (sum > 0.0) {
-                mesh.vertex_colors_[i] /= sum;
-                valid_vertices.push_back(i);
-            } else {
-                invalid_vertices.push_back(i);
-            }
-        }
-    });
+            });
     if (invisible_vertex_color_knn > 0) {
         std::shared_ptr<geometry::TriangleMesh> valid_mesh = mesh.SelectByIndex(
                 {valid_vertices.begin(), valid_vertices.end()});
         geometry::KDTreeFlann kd_tree(*valid_mesh);
-        tbb::parallel_for_each(invalid_vertices,
-                [&](std::size_t invalid_vertex){
-            std::vector<int> indices;  // indices to valid_mesh
-            std::vector<double> dists;
-            kd_tree.SearchKNN(mesh.vertices_[invalid_vertex],
-                              invisible_vertex_color_knn, indices, dists);
-            Eigen::Vector3d new_color(0, 0, 0);
-            for (const int& index : indices) {
-                new_color += valid_mesh->vertex_colors_[index];
-            }
-            if (indices.empty()) {
-                new_color /= static_cast<double>(indices.size());
-            }
-            mesh.vertex_colors_[invalid_vertex] = new_color;
-        });
+        tbb::parallel_for_each(
+                invalid_vertices, [&](std::size_t invalid_vertex) {
+                    std::vector<int> indices;  // indices to valid_mesh
+                    std::vector<double> dists;
+                    kd_tree.SearchKNN(mesh.vertices_[invalid_vertex],
+                                      invisible_vertex_color_knn, indices,
+                                      dists);
+                    Eigen::Vector3d new_color(0, 0, 0);
+                    for (const int& index : indices) {
+                        new_color += valid_mesh->vertex_colors_[index];
+                    }
+                    if (indices.empty()) {
+                        new_color /= static_cast<double>(indices.size());
+                    }
+                    mesh.vertex_colors_[invalid_vertex] = new_color;
+                });
     }
 }
 
diff --git a/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp b/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
index d93cc16f4b4..7a0861bbce7 100644
--- a/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
+++ b/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
@@ -7,13 +7,13 @@
 
 #include "open3d/pipelines/color_map/NonRigidOptimizer.h"
 
-#include <memory>
-#include <vector>
-
 #include <tbb/blocked_range.h>
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_reduce.h>
 
+#include <memory>
+#include <vector>
+
 #include "open3d/io/ImageIO.h"
 #include "open3d/io/ImageWarpingFieldIO.h"
 #include "open3d/io/PinholeCameraTrajectoryIO.h"
@@ -47,12 +47,14 @@ static std::vector<ImageWarpingField> CreateWarpingFields(
     return fields;
 }
 
-template <typename MatOutType, typename VecOutType,
-          typename VecInTypeDouble, typename VecInTypeInt>
+template <typename MatOutType,
+          typename VecOutType,
+          typename VecInTypeDouble,
+          typename VecInTypeInt>
 struct JTJandJTrNonRigidReducer {
     // Global Data
-    using FuncType = std::function<void(int,
-        VecInTypeDouble&, double&, VecInTypeInt&)>;
+    using FuncType =
+            std::function<void(int, VecInTypeDouble&, double&, VecInTypeInt&)>;
     FuncType& f;
     // Local Data
     MatOutType JTJ;
@@ -60,11 +62,13 @@ struct JTJandJTrNonRigidReducer {
     double r2_sum = 0.0;
 
     JTJandJTrNonRigidReducer(FuncType& f_, int nonrigidval)
-            : f(f_), JTJ(MatOutType::Zero(6 + nonrigidval, 6 + nonrigidval)),
-            JTr(VecOutType::Zero(6 + nonrigidval)) {};
+        : f(f_),
+          JTJ(MatOutType::Zero(6 + nonrigidval, 6 + nonrigidval)),
+          JTr(VecOutType::Zero(6 + nonrigidval)){};
 
     JTJandJTrNonRigidReducer(JTJandJTrNonRigidReducer& o, tbb::split)
-            : f(o.f), JTJ(MatOutType::Zero(o.JTJ.rows(), o.JTJ.cols())),
+        : f(o.f),
+          JTJ(MatOutType::Zero(o.JTJ.rows(), o.JTJ.cols())),
           JTr(VecOutType::Zero(o.JTr.rows(), o.JTr.cols())) {}
 
     void operator()(const tbb::blocked_range<int>& range) {
@@ -112,13 +116,17 @@ static std::tuple<MatOutType, VecOutType, double> ComputeJTJandJTrNonRigid(
         int nonrigidval,
         bool verbose /*=true*/) {
     JTJandJTrNonRigidReducer<MatOutType, VecOutType, VecInTypeDouble,
-            VecInTypeInt> reducer(f, nonrigidval);
-    tbb::parallel_reduce(tbb::blocked_range<int>(0, iteration_num,
-        utility::DefaultGrainSizeTBB()), reducer);
+                             VecInTypeInt>
+            reducer(f, nonrigidval);
+    tbb::parallel_reduce(
+            tbb::blocked_range<int>(0, iteration_num,
+                                    utility::DefaultGrainSizeTBB()),
+            reducer);
     auto out = std::move(reducer).as_tuple();
     if (verbose) {
         utility::LogDebug("Residual : {:.2e} (# of elements : {:d})",
-            std::get<2>(out) / (double)iteration_num, iteration_num);
+                          std::get<2>(out) / (double)iteration_num,
+                          iteration_num);
     }
     return out;
 }
@@ -228,7 +236,8 @@ inline void atomic_sum(std::atomic<double>& total, const double& val) {
     total.fetch_add(val);
 #else
     double expected = total;
-    while(!total.compare_exchange_weak(expected, expected + val));
+    while (!total.compare_exchange_weak(expected, expected + val))
+        ;
 #endif
 }
 
@@ -316,72 +325,81 @@ RunNonRigidOptimizer(const geometry::TriangleMesh& mesh,
         utility::LogDebug("[Iteration {:04d}] ", itr + 1);
         std::atomic<double> residual = 0.0;
         std::atomic<double> residual_reg = 0.0;
-        tbb::parallel_for(tbb::blocked_range<int>(0, n_camera, 1),
+        tbb::parallel_for(
+                tbb::blocked_range<int>(0, n_camera, 1),
                 [&](const tbb::blocked_range<int>& range) {
-            for (int c = range.begin(); c < range.end(); ++c) {
-                int nonrigidval = warping_fields[c].anchor_w_ *
-                                  warping_fields[c].anchor_h_ * 2;
-                double rr_reg = 0.0;
-
-                Eigen::Matrix4d pose;
-                pose = opt_camera_trajectory.parameters_[c].extrinsic_;
-
-                auto intrinsic = opt_camera_trajectory.parameters_[c]
-                                         .intrinsic_.intrinsic_matrix_;
-                auto extrinsic = opt_camera_trajectory.
-                                 parameters_[c].extrinsic_;
-                Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
-                intr.block<3, 3>(0, 0) = intrinsic;
-                intr(3, 3) = 1.0;
-
-                auto f_lambda = [&](int i, Eigen::Vector14d& J_r, double& r,
-                                    Eigen::Vector14i& pattern) {
-                    ComputeJacobianAndResidualNonRigid(
-                            i, J_r, r, pattern, opt_mesh, proxy_intensity,
-                            images_gray[c], images_dx[c], images_dy[c],
-                            warping_fields[c], intr, extrinsic,
-                            visibility_image_to_vertex[c],
-                            option.image_boundary_margin_);
-                };
-                Eigen::MatrixXd JTJ;
-                Eigen::VectorXd JTr;
-                double r2;
-                std::tie(JTJ, JTr, r2) =
-                        ComputeJTJandJTrNonRigid<Eigen::Vector14d, Eigen::Vector14i,
-                                                 Eigen::MatrixXd, Eigen::VectorXd>(
-                                f_lambda, int(visibility_image_to_vertex[c].size()),
+                    for (int c = range.begin(); c < range.end(); ++c) {
+                        int nonrigidval = warping_fields[c].anchor_w_ *
+                                          warping_fields[c].anchor_h_ * 2;
+                        double rr_reg = 0.0;
+
+                        Eigen::Matrix4d pose;
+                        pose = opt_camera_trajectory.parameters_[c].extrinsic_;
+
+                        auto intrinsic = opt_camera_trajectory.parameters_[c]
+                                                 .intrinsic_.intrinsic_matrix_;
+                        auto extrinsic =
+                                opt_camera_trajectory.parameters_[c].extrinsic_;
+                        Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
+                        intr.block<3, 3>(0, 0) = intrinsic;
+                        intr(3, 3) = 1.0;
+
+                        auto f_lambda = [&](int i, Eigen::Vector14d& J_r,
+                                            double& r,
+                                            Eigen::Vector14i& pattern) {
+                            ComputeJacobianAndResidualNonRigid(
+                                    i, J_r, r, pattern, opt_mesh,
+                                    proxy_intensity, images_gray[c],
+                                    images_dx[c], images_dy[c],
+                                    warping_fields[c], intr, extrinsic,
+                                    visibility_image_to_vertex[c],
+                                    option.image_boundary_margin_);
+                        };
+                        Eigen::MatrixXd JTJ;
+                        Eigen::VectorXd JTr;
+                        double r2;
+                        std::tie(JTJ, JTr, r2) = ComputeJTJandJTrNonRigid<
+                                Eigen::Vector14d, Eigen::Vector14i,
+                                Eigen::MatrixXd, Eigen::VectorXd>(
+                                f_lambda,
+                                int(visibility_image_to_vertex[c].size()),
                                 nonrigidval, false);
 
-                double weight = option.non_rigid_anchor_point_weight_ *
-                                visibility_image_to_vertex[c].size() / n_vertex;
-                for (int j = 0; j < nonrigidval; j++) {
-                    double r = weight * (warping_fields[c].flow_(j) -
-                                         warping_fields_init[c].flow_(j));
-                    JTJ(6 + j, 6 + j) += weight * weight;
-                    JTr(6 + j) += weight * r;
-                    rr_reg += r * r;
-                }
-
-                bool success;
-                Eigen::VectorXd result;
-                std::tie(success, result) = utility::SolveLinearSystemPSD(
-                        JTJ, -JTr, /*prefer_sparse=*/false,
-                        /*check_symmetric=*/false,
-                        /*check_det=*/false, /*check_psd=*/false);
-                Eigen::Vector6d result_pose;
-                result_pose << result.block(0, 0, 6, 1);
-                auto delta = utility::TransformVector6dToMatrix4d(result_pose);
-                pose = delta * pose;
-
-                for (int j = 0; j < nonrigidval; j++) {
-                    warping_fields[c].flow_(j) += result(6 + j);
-                }
-                opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
-
-                atomic_sum(residual, r2);
-                atomic_sum(residual_reg, rr_reg);
-            }
-        });
+                        double weight = option.non_rigid_anchor_point_weight_ *
+                                        visibility_image_to_vertex[c].size() /
+                                        n_vertex;
+                        for (int j = 0; j < nonrigidval; j++) {
+                            double r =
+                                    weight * (warping_fields[c].flow_(j) -
+                                              warping_fields_init[c].flow_(j));
+                            JTJ(6 + j, 6 + j) += weight * weight;
+                            JTr(6 + j) += weight * r;
+                            rr_reg += r * r;
+                        }
+
+                        bool success;
+                        Eigen::VectorXd result;
+                        std::tie(success, result) =
+                                utility::SolveLinearSystemPSD(
+                                        JTJ, -JTr, /*prefer_sparse=*/false,
+                                        /*check_symmetric=*/false,
+                                        /*check_det=*/false,
+                                        /*check_psd=*/false);
+                        Eigen::Vector6d result_pose;
+                        result_pose << result.block(0, 0, 6, 1);
+                        auto delta = utility::TransformVector6dToMatrix4d(
+                                result_pose);
+                        pose = delta * pose;
+
+                        for (int j = 0; j < nonrigidval; j++) {
+                            warping_fields[c].flow_(j) += result(6 + j);
+                        }
+                        opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
+
+                        atomic_sum(residual, r2);
+                        atomic_sum(residual_reg, rr_reg);
+                    }
+                });
         utility::LogDebug("Residual error : {:.6f}, reg : {:.6f}", residual,
                           residual_reg);
         SetProxyIntensityForVertex(opt_mesh, images_gray, warping_fields,
diff --git a/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp b/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
index f507e996098..91636d39e50 100644
--- a/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
+++ b/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
@@ -7,11 +7,11 @@
 
 #include "open3d/pipelines/color_map/RigidOptimizer.h"
 
+#include <tbb/parallel_for.h>
+
 #include <memory>
 #include <vector>
 
-#include <tbb/parallel_for.h>
-
 #include "open3d/io/ImageIO.h"
 #include "open3d/io/PinholeCameraTrajectoryIO.h"
 #include "open3d/io/TriangleMeshIO.h"
@@ -77,7 +77,8 @@ inline void atomic_sum(std::atomic<double>& total, const double& val) {
     total.fetch_add(val);
 #else
     double expected = total;
-    while(!total.compare_exchange_weak(expected, expected + val));
+    while (!total.compare_exchange_weak(expected, expected + val))
+        ;
 #endif
 }
 
@@ -157,46 +158,50 @@ RunRigidOptimizer(const geometry::TriangleMesh& mesh,
         utility::LogDebug("[Iteration {:04d}] ", itr + 1);
         std::atomic<double> residual = 0.0;
         total_num_ = 0;
-        tbb::parallel_for(tbb::blocked_range<int>(0, n_camera, 1),
+        tbb::parallel_for(
+                tbb::blocked_range<int>(0, n_camera, 1),
                 [&](const tbb::blocked_range<int>& range) {
-            for (int c = range.begin(); c < range.end(); ++c) {
-                Eigen::Matrix4d pose;
-                pose = opt_camera_trajectory.parameters_[c].extrinsic_;
-
-                auto intrinsic = opt_camera_trajectory.
-                                 parameters_[c].intrinsic_.intrinsic_matrix_;
-                auto extrinsic = opt_camera_trajectory.
-                                 parameters_[c].extrinsic_;
-                Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
-                intr.block<3, 3>(0, 0) = intrinsic;
-                intr(3, 3) = 1.0;
-
-                auto f_lambda = [&](int i, Eigen::Vector6d& J_r,
-                                    double& r, double& w) {
-                    w = 1.0;  // Dummy.
-                    ComputeJacobianAndResidualRigid(
-                            i, J_r, r, w, opt_mesh, proxy_intensity,
-                            images_gray[c], images_dx[c], images_dy[c], intr,
-                            extrinsic, visibility_image_to_vertex[c],
-                            option.image_boundary_margin_);
-                };
-                Eigen::Matrix6d JTJ;
-                Eigen::Vector6d JTr;
-                double r2;
-                std::tie(JTJ, JTr, r2) = utility::ComputeJTJandJTr<
-                        Eigen::Matrix6d, Eigen::Vector6d>(f_lambda,
-                            int(visibility_image_to_vertex[c].size()), false);
-
-                Eigen::Matrix4d delta;
-                std::tie(std::ignore, delta) =
-                        utility::SolveJacobianSystemAndObtainExtrinsicMatrix(
-                                JTJ, JTr);
-                pose = delta * pose;
-                opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
-                atomic_sum(residual, r2);
-                total_num_ += int(visibility_image_to_vertex[c].size());
-            }
-        });
+                    for (int c = range.begin(); c < range.end(); ++c) {
+                        Eigen::Matrix4d pose;
+                        pose = opt_camera_trajectory.parameters_[c].extrinsic_;
+
+                        auto intrinsic = opt_camera_trajectory.parameters_[c]
+                                                 .intrinsic_.intrinsic_matrix_;
+                        auto extrinsic =
+                                opt_camera_trajectory.parameters_[c].extrinsic_;
+                        Eigen::Matrix4d intr = Eigen::Matrix4d::Zero();
+                        intr.block<3, 3>(0, 0) = intrinsic;
+                        intr(3, 3) = 1.0;
+
+                        auto f_lambda = [&](int i, Eigen::Vector6d& J_r,
+                                            double& r, double& w) {
+                            w = 1.0;  // Dummy.
+                            ComputeJacobianAndResidualRigid(
+                                    i, J_r, r, w, opt_mesh, proxy_intensity,
+                                    images_gray[c], images_dx[c], images_dy[c],
+                                    intr, extrinsic,
+                                    visibility_image_to_vertex[c],
+                                    option.image_boundary_margin_);
+                        };
+                        Eigen::Matrix6d JTJ;
+                        Eigen::Vector6d JTr;
+                        double r2;
+                        std::tie(JTJ, JTr, r2) = utility::ComputeJTJandJTr<
+                                Eigen::Matrix6d, Eigen::Vector6d>(
+                                f_lambda,
+                                int(visibility_image_to_vertex[c].size()),
+                                false);
+
+                        Eigen::Matrix4d delta;
+                        std::tie(std::ignore, delta) = utility::
+                                SolveJacobianSystemAndObtainExtrinsicMatrix(
+                                        JTJ, JTr);
+                        pose = delta * pose;
+                        opt_camera_trajectory.parameters_[c].extrinsic_ = pose;
+                        atomic_sum(residual, r2);
+                        total_num_ += int(visibility_image_to_vertex[c].size());
+                    }
+                });
         if (total_num_ > 0) {
             utility::LogDebug("Residual error : {:.6f} (avg : {:.6f})",
                               residual, residual / total_num_);
diff --git a/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp b/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp
index 6698fc149d5..a4d27970b4c 100644
--- a/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp
+++ b/cpp/open3d/pipelines/integration/UniformTSDFVolume.cpp
@@ -7,14 +7,14 @@
 
 #include "open3d/pipelines/integration/UniformTSDFVolume.h"
 
+#include <tbb/blocked_range2d.h>
+#include <tbb/blocked_range3d.h>
+#include <tbb/parallel_for.h>
+
 #include <iostream>
 #include <thread>
 #include <unordered_map>
 
-#include <tbb/parallel_for.h>
-#include <tbb/blocked_range2d.h>
-#include <tbb/blocked_range3d.h>
-
 #include "open3d/geometry/VoxelGrid.h"
 #include "open3d/pipelines/integration/MarchingCubesConst.h"
 #include "open3d/utility/Helper.h"
@@ -261,28 +261,34 @@ std::shared_ptr<geometry::VoxelGrid> UniformTSDFVolume::ExtractVoxelGrid()
     voxel_grid->voxel_size_ = voxel_length_;
     voxel_grid->origin_ = origin_;
 
-    tbb::parallel_for(tbb::blocked_range3d<int>(
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range3d<int>& range){
-        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
-            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
-                for (int z = range.cols().begin(); z < range.cols().end(); z++){
-                    const int ind = IndexOf(x, y, z);
-                    const float w = voxels_[ind].weight_;
-                    const float f = voxels_[ind].tsdf_;
-                    if (w != 0.0f && f < 0.98f && f >= -0.98f) {
-                        double c = (f + 1.0) * 0.5;
-                        Eigen::Vector3d color = Eigen::Vector3d(c, c, c);
-                        Eigen::Vector3i index = Eigen::Vector3i(x, y, z);
-                        voxel_grid->voxels_[index] =
-                                geometry::Voxel(index, color);
+    tbb::parallel_for(
+            tbb::blocked_range3d<int>(
+                    0, resolution_, utility::DefaultGrainSizeTBB2D(), 0,
+                    resolution_, utility::DefaultGrainSizeTBB2D(), 0,
+                    resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range3d<int> &range) {
+                for (int x = range.pages().begin(); x < range.pages().end();
+                     x++) {
+                    for (int y = range.rows().begin(); y < range.rows().end();
+                         y++) {
+                        for (int z = range.cols().begin();
+                             z < range.cols().end(); z++) {
+                            const int ind = IndexOf(x, y, z);
+                            const float w = voxels_[ind].weight_;
+                            const float f = voxels_[ind].tsdf_;
+                            if (w != 0.0f && f < 0.98f && f >= -0.98f) {
+                                double c = (f + 1.0) * 0.5;
+                                Eigen::Vector3d color =
+                                        Eigen::Vector3d(c, c, c);
+                                Eigen::Vector3i index =
+                                        Eigen::Vector3i(x, y, z);
+                                voxel_grid->voxels_[index] =
+                                        geometry::Voxel(index, color);
+                            }
+                        }
                     }
                 }
-            }
-        }
-    });
+            });
     return voxel_grid;
 }
 
@@ -291,21 +297,25 @@ std::vector<Eigen::Vector2d> UniformTSDFVolume::ExtractVolumeTSDF() const {
     sharedvoxels_.resize(voxel_num_);
 
     tbb::parallel_for(tbb::blocked_range3d<int>(
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range3d<int>& range){
-        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
-            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
-                for (int z = range.cols().begin(); z < range.cols().end(); z++){
-                    const int ind = IndexOf(x, y, z);
-                    const float f = voxels_[ind].tsdf_;
-                    const float w = voxels_[ind].weight_;
-                    sharedvoxels_[ind] = Eigen::Vector2d(f, w);
-                }
-            }
-        }
-    });
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D()),
+                      [&](const tbb::blocked_range3d<int> &range) {
+                          for (int x = range.pages().begin();
+                               x < range.pages().end(); x++) {
+                              for (int y = range.rows().begin();
+                                   y < range.rows().end(); y++) {
+                                  for (int z = range.cols().begin();
+                                       z < range.cols().end(); z++) {
+                                      const int ind = IndexOf(x, y, z);
+                                      const float f = voxels_[ind].tsdf_;
+                                      const float w = voxels_[ind].weight_;
+                                      sharedvoxels_[ind] =
+                                              Eigen::Vector2d(f, w);
+                                  }
+                              }
+                          }
+                      });
     return sharedvoxels_;
 }
 
@@ -314,57 +324,67 @@ std::vector<Eigen::Vector3d> UniformTSDFVolume::ExtractVolumeColor() const {
     sharedcolors_.resize(voxel_num_);
 
     tbb::parallel_for(tbb::blocked_range3d<int>(
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range3d<int>& range){
-        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
-            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
-                for (int z = range.cols().begin(); z < range.cols().end(); z++){
-                    const int ind = IndexOf(x, y, z);
-                    sharedcolors_[ind] = voxels_[ind].color_;
-                }
-            }
-        }
-    });
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D()),
+                      [&](const tbb::blocked_range3d<int> &range) {
+                          for (int x = range.pages().begin();
+                               x < range.pages().end(); x++) {
+                              for (int y = range.rows().begin();
+                                   y < range.rows().end(); y++) {
+                                  for (int z = range.cols().begin();
+                                       z < range.cols().end(); z++) {
+                                      const int ind = IndexOf(x, y, z);
+                                      sharedcolors_[ind] = voxels_[ind].color_;
+                                  }
+                              }
+                          }
+                      });
     return sharedcolors_;
 }
 
 void UniformTSDFVolume::InjectVolumeTSDF(
         const std::vector<Eigen::Vector2d> &sharedvoxels) {
     tbb::parallel_for(tbb::blocked_range3d<int>(
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range3d<int>& range){
-        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
-            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
-                for (int z = range.cols().begin(); z < range.cols().end(); z++){
-                    const int ind = IndexOf(x, y, z);
-                    voxels_[ind].tsdf_ = sharedvoxels[ind](0);
-                    voxels_[ind].weight_ = sharedvoxels[ind](1);
-                }
-            }
-        }
-    });
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D()),
+                      [&](const tbb::blocked_range3d<int> &range) {
+                          for (int x = range.pages().begin();
+                               x < range.pages().end(); x++) {
+                              for (int y = range.rows().begin();
+                                   y < range.rows().end(); y++) {
+                                  for (int z = range.cols().begin();
+                                       z < range.cols().end(); z++) {
+                                      const int ind = IndexOf(x, y, z);
+                                      voxels_[ind].tsdf_ = sharedvoxels[ind](0);
+                                      voxels_[ind].weight_ =
+                                              sharedvoxels[ind](1);
+                                  }
+                              }
+                          }
+                      });
 }
 
 void UniformTSDFVolume::InjectVolumeColor(
         const std::vector<Eigen::Vector3d> &sharedcolors) {
     tbb::parallel_for(tbb::blocked_range3d<int>(
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range3d<int>& range){
-        for (int x = range.pages().begin(); x < range.pages().end(); x++) {
-            for (int y = range.rows().begin(); y < range.rows().end(); y++) {
-                for (int z = range.cols().begin(); z < range.cols().end(); z++){
-                    const int ind = IndexOf(x, y, z);
-                    voxels_[ind].color_ = sharedcolors[ind];
-                }
-            }
-        }
-    });
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D(),
+                              0, resolution_, utility::DefaultGrainSizeTBB2D()),
+                      [&](const tbb::blocked_range3d<int> &range) {
+                          for (int x = range.pages().begin();
+                               x < range.pages().end(); x++) {
+                              for (int y = range.rows().begin();
+                                   y < range.rows().end(); y++) {
+                                  for (int z = range.cols().begin();
+                                       z < range.cols().end(); z++) {
+                                      const int ind = IndexOf(x, y, z);
+                                      voxels_[ind].color_ = sharedcolors[ind];
+                                  }
+                              }
+                          }
+                      });
 }
 
 void UniformTSDFVolume::IntegrateWithDepthToCameraDistanceMultiplier(
@@ -385,75 +405,88 @@ void UniformTSDFVolume::IntegrateWithDepthToCameraDistanceMultiplier(
     const float safe_width_f = intrinsic.width_ - 0.0001f;
     const float safe_height_f = intrinsic.height_ - 0.0001f;
 
-    tbb::parallel_for(tbb::blocked_range2d<int>(
-            0, resolution_, utility::DefaultGrainSizeTBB2D(),
-            0, resolution_, utility::DefaultGrainSizeTBB2D()),
-            [&](const tbb::blocked_range2d<int>& range) {
-        for (int x = range.rows().begin(); x < range.rows().end(); ++x) {
-            for (int y = range.cols().begin(); y < range.cols().end(); ++y) {
-                Eigen::Vector4f pt_3d_homo(
-                    float(half_voxel_length_f + voxel_length_f * x + origin_(0)),
-                    float(half_voxel_length_f + voxel_length_f * y + origin_(1)),
-                    float(half_voxel_length_f + origin_(2)), 1.f);
-                Eigen::Vector4f pt_camera = extrinsic_f * pt_3d_homo;
-                for (int z = 0; z < resolution_; z++,
-                         pt_camera(0) += extrinsic_scaled_f(0, 2),
-                         pt_camera(1) += extrinsic_scaled_f(1, 2),
-                         pt_camera(2) += extrinsic_scaled_f(2, 2)) {
-                    // Skip if negative depth after projection
-                    if (pt_camera(2) <= 0) {
-                        continue;
-                    }
-                    // Skip if x-y coordinate not in range
-                    float u_f = pt_camera(0) * fx / pt_camera(2) + cx + 0.5f;
-                    float v_f = pt_camera(1) * fy / pt_camera(2) + cy + 0.5f;
-                    if (!(u_f >= 0.0001f && u_f < safe_width_f && v_f >= 0.0001f &&
-                          v_f < safe_height_f)) {
-                        continue;
-                    }
-                    // Skip if negative depth in depth image
-                    int u = (int)u_f;
-                    int v = (int)v_f;
-                    float d = *image.depth_.PointerAt<float>(u, v);
-                    if (d <= 0.0f) {
-                        continue;
-                    }
+    tbb::parallel_for(
+            tbb::blocked_range2d<int>(
+                    0, resolution_, utility::DefaultGrainSizeTBB2D(), 0,
+                    resolution_, utility::DefaultGrainSizeTBB2D()),
+            [&](const tbb::blocked_range2d<int> &range) {
+                for (int x = range.rows().begin(); x < range.rows().end();
+                     ++x) {
+                    for (int y = range.cols().begin(); y < range.cols().end();
+                         ++y) {
+                        Eigen::Vector4f pt_3d_homo(
+                                float(half_voxel_length_f + voxel_length_f * x +
+                                      origin_(0)),
+                                float(half_voxel_length_f + voxel_length_f * y +
+                                      origin_(1)),
+                                float(half_voxel_length_f + origin_(2)), 1.f);
+                        Eigen::Vector4f pt_camera = extrinsic_f * pt_3d_homo;
+                        for (int z = 0; z < resolution_; z++,
+                                 pt_camera(0) += extrinsic_scaled_f(0, 2),
+                                 pt_camera(1) += extrinsic_scaled_f(1, 2),
+                                 pt_camera(2) += extrinsic_scaled_f(2, 2)) {
+                            // Skip if negative depth after projection
+                            if (pt_camera(2) <= 0) {
+                                continue;
+                            }
+                            // Skip if x-y coordinate not in range
+                            float u_f = pt_camera(0) * fx / pt_camera(2) + cx +
+                                        0.5f;
+                            float v_f = pt_camera(1) * fy / pt_camera(2) + cy +
+                                        0.5f;
+                            if (!(u_f >= 0.0001f && u_f < safe_width_f &&
+                                  v_f >= 0.0001f && v_f < safe_height_f)) {
+                                continue;
+                            }
+                            // Skip if negative depth in depth image
+                            int u = (int)u_f;
+                            int v = (int)v_f;
+                            float d = *image.depth_.PointerAt<float>(u, v);
+                            if (d <= 0.0f) {
+                                continue;
+                            }
 
-                    int v_ind = IndexOf(x, y, z);
-                    float sdf = (d - pt_camera(2)) *
-                                (*depth_to_camera_distance_multiplier
-                                          .PointerAt<float>(u, v));
-                    if (sdf > -sdf_trunc_f) {
-                        // integrate
-                        float tsdf = std::min(1.0f, sdf * sdf_trunc_inv_f);
-                        voxels_[v_ind].tsdf_ =
-                                (voxels_[v_ind].tsdf_ * voxels_[v_ind].weight_ +
-                                 tsdf) /
-                                (voxels_[v_ind].weight_ + 1.0f);
-                        if (color_type_ == TSDFVolumeColorType::RGB8) {
-                            const uint8_t *rgb =
-                                    image.color_.PointerAt<uint8_t>(u, v, 0);
-                            Eigen::Vector3d rgb_f(rgb[0], rgb[1], rgb[2]);
-                            voxels_[v_ind].color_ =
-                                    (voxels_[v_ind].color_ *
-                                             voxels_[v_ind].weight_ +
-                                     rgb_f) /
-                                    (voxels_[v_ind].weight_ + 1.0f);
-                        } else if (color_type_ == TSDFVolumeColorType::Gray32) {
-                            const float *intensity =
-                                    image.color_.PointerAt<float>(u, v, 0);
-                            voxels_[v_ind].color_ =
-                                    (voxels_[v_ind].color_.array() *
-                                             voxels_[v_ind].weight_ +
-                                     (*intensity)) /
-                                    (voxels_[v_ind].weight_ + 1.0f);
+                            int v_ind = IndexOf(x, y, z);
+                            float sdf = (d - pt_camera(2)) *
+                                        (*depth_to_camera_distance_multiplier
+                                                  .PointerAt<float>(u, v));
+                            if (sdf > -sdf_trunc_f) {
+                                // integrate
+                                float tsdf =
+                                        std::min(1.0f, sdf * sdf_trunc_inv_f);
+                                voxels_[v_ind].tsdf_ =
+                                        (voxels_[v_ind].tsdf_ *
+                                                 voxels_[v_ind].weight_ +
+                                         tsdf) /
+                                        (voxels_[v_ind].weight_ + 1.0f);
+                                if (color_type_ == TSDFVolumeColorType::RGB8) {
+                                    const uint8_t *rgb =
+                                            image.color_.PointerAt<uint8_t>(
+                                                    u, v, 0);
+                                    Eigen::Vector3d rgb_f(rgb[0], rgb[1],
+                                                          rgb[2]);
+                                    voxels_[v_ind].color_ =
+                                            (voxels_[v_ind].color_ *
+                                                     voxels_[v_ind].weight_ +
+                                             rgb_f) /
+                                            (voxels_[v_ind].weight_ + 1.0f);
+                                } else if (color_type_ ==
+                                           TSDFVolumeColorType::Gray32) {
+                                    const float *intensity =
+                                            image.color_.PointerAt<float>(u, v,
+                                                                          0);
+                                    voxels_[v_ind].color_ =
+                                            (voxels_[v_ind].color_.array() *
+                                                     voxels_[v_ind].weight_ +
+                                             (*intensity)) /
+                                            (voxels_[v_ind].weight_ + 1.0f);
+                                }
+                                voxels_[v_ind].weight_ += 1.0f;
+                            }
                         }
-                        voxels_[v_ind].weight_ += 1.0f;
                     }
                 }
-            }
-        }
-    });
+            });
 }
 
 Eigen::Vector3d UniformTSDFVolume::GetNormalAt(const Eigen::Vector3d &p) {
diff --git a/cpp/open3d/pipelines/odometry/Odometry.cpp b/cpp/open3d/pipelines/odometry/Odometry.cpp
index d326c74f283..a5a77fd0f4a 100644
--- a/cpp/open3d/pipelines/odometry/Odometry.cpp
+++ b/cpp/open3d/pipelines/odometry/Odometry.cpp
@@ -17,8 +17,8 @@
 #include "open3d/geometry/RGBDImage.h"
 #include "open3d/pipelines/odometry/RGBDOdometryJacobian.h"
 #include "open3d/utility/Eigen.h"
-#include "open3d/utility/Timer.h"
 #include "open3d/utility/Parallel.h"
+#include "open3d/utility/Timer.h"
 
 namespace open3d {
 namespace pipelines {
@@ -26,28 +26,34 @@ namespace odometry {
 
 struct CorrespondenceReduction {
     // Global
-    const geometry::Image& depth_s;
-    const geometry::Image& depth_t;
-    const Eigen::Matrix3d& KRK_inv;
-    const Eigen::Vector3d& Kt;
+    const geometry::Image &depth_s;
+    const geometry::Image &depth_t;
+    const Eigen::Matrix3d &KRK_inv;
+    const Eigen::Vector3d &Kt;
     const double depth_diff_max;
     // Local
     geometry::Image correspondence_map;
     geometry::Image depth_buffer;
 
-    CorrespondenceReduction(const geometry::Image& depth_s_,
-                            const geometry::Image& depth_t_,
-                            const Eigen::Matrix3d& KRK_inv_,
-                            const Eigen::Vector3d& Kt_,
+    CorrespondenceReduction(const geometry::Image &depth_s_,
+                            const geometry::Image &depth_t_,
+                            const Eigen::Matrix3d &KRK_inv_,
+                            const Eigen::Vector3d &Kt_,
                             const double depth_diff_max_)
-            : depth_s(depth_s_), depth_t(depth_t_), KRK_inv(KRK_inv_),
-            Kt(Kt_), depth_diff_max(depth_diff_max_) {
+        : depth_s(depth_s_),
+          depth_t(depth_t_),
+          KRK_inv(KRK_inv_),
+          Kt(Kt_),
+          depth_diff_max(depth_diff_max_) {
         InitializeCorrespondenceMap();
     }
 
-    CorrespondenceReduction(CorrespondenceReduction& o, tbb::split)
-            : depth_s(o.depth_s), depth_t(o.depth_t), KRK_inv(o.KRK_inv),
-            Kt(o.Kt), depth_diff_max(o.depth_diff_max) {
+    CorrespondenceReduction(CorrespondenceReduction &o, tbb::split)
+        : depth_s(o.depth_s),
+          depth_t(o.depth_t),
+          KRK_inv(o.KRK_inv),
+          Kt(o.Kt),
+          depth_diff_max(o.depth_diff_max) {
         InitializeCorrespondenceMap();
     }
 
@@ -83,7 +89,7 @@ struct CorrespondenceReduction {
         }
     }
 
-    void MergeCorrespondenceMaps(const CorrespondenceReduction& rhs) {
+    void MergeCorrespondenceMaps(const CorrespondenceReduction &rhs) {
         for (int v_s = 0; v_s < correspondence_map.height_; v_s++) {
             for (int u_s = 0; u_s < correspondence_map.width_; u_s++) {
                 int u_t = *rhs.correspondence_map.PointerAt<int>(u_s, v_s, 0);
@@ -91,14 +97,14 @@ struct CorrespondenceReduction {
                 if (u_t != -1 && v_t != -1) {
                     float transformed_d_t =
                             *rhs.depth_buffer.PointerAt<float>(u_s, v_s);
-                    AddElementToCorrespondenceMap(
-                            u_s, v_s, u_t, v_t, transformed_d_t);
+                    AddElementToCorrespondenceMap(u_s, v_s, u_t, v_t,
+                                                  transformed_d_t);
                 }
             }
         }
     }
 
-    void operator()(const tbb::blocked_range2d<int>&r) {
+    void operator()(const tbb::blocked_range2d<int> &r) {
         for (int v_s = r.rows().begin(); v_s < r.rows().end(); ++v_s) {
             for (int u_s = r.cols().begin(); u_s < r.cols().end(); u_s++) {
                 double d_s = *depth_s.PointerAt<float>(u_s, v_s);
@@ -112,10 +118,9 @@ struct CorrespondenceReduction {
                         v_t < depth_t.height_) {
                         double d_t = *depth_t.PointerAt<float>(u_t, v_t);
                         if (!std::isnan(d_t) &&
-                            std::abs(transformed_d_s - d_t) <=
-                                    depth_diff_max) {
-                            AddElementToCorrespondenceMap(
-                                    u_s, v_s, u_t, v_t, (float)d_s);
+                            std::abs(transformed_d_s - d_t) <= depth_diff_max) {
+                            AddElementToCorrespondenceMap(u_s, v_s, u_t, v_t,
+                                                          (float)d_s);
                         }
                     }
                 }
@@ -123,7 +128,7 @@ struct CorrespondenceReduction {
         }
     }
 
-    inline void join(const CorrespondenceReduction& rhs) {
+    inline void join(const CorrespondenceReduction &rhs) {
         MergeCorrespondenceMaps(rhs);
     }
 
@@ -169,11 +174,13 @@ CorrespondenceSetPixelWise ComputeCorrespondence(
     const Eigen::Matrix3d KRK_inv = K * R * K_inv;
     const Eigen::Vector3d Kt = K * extrinsic.block<3, 1>(0, 3);
 
-    CorrespondenceReduction reducer(depth_s, depth_t,
-        KRK_inv, Kt, option.depth_diff_max_);
-    tbb::parallel_reduce(tbb::blocked_range2d<int>(
-            0, depth_s.height_, utility::DefaultGrainSizeTBB2D(),
-            0, depth_s.width_, utility::DefaultGrainSizeTBB2D()), reducer);
+    CorrespondenceReduction reducer(depth_s, depth_t, KRK_inv, Kt,
+                                    option.depth_diff_max_);
+    tbb::parallel_reduce(
+            tbb::blocked_range2d<int>(
+                    0, depth_s.height_, utility::DefaultGrainSizeTBB2D(), 0,
+                    depth_s.width_, utility::DefaultGrainSizeTBB2D()),
+            reducer);
 
     return reducer.correspondences();
 }
@@ -223,19 +230,19 @@ static std::vector<Eigen::Matrix3d> CreateCameraMatrixPyramid(
 
 struct InformationMatrixReducer {
     // Globals
-    const CorrespondenceSetPixelWise& corres;
-    const geometry::Image& xyz_t;
+    const CorrespondenceSetPixelWise &corres;
+    const geometry::Image &xyz_t;
     // Locals
     Eigen::Matrix6d GTG;
 
-    InformationMatrixReducer(const CorrespondenceSetPixelWise& corres_,
-                             const geometry::Image& xyz_t_)
+    InformationMatrixReducer(const CorrespondenceSetPixelWise &corres_,
+                             const geometry::Image &xyz_t_)
         : corres(corres_), xyz_t(xyz_t_), GTG(Eigen::Matrix6d::Zero()) {}
 
-    InformationMatrixReducer(InformationMatrixReducer& o, tbb::split)
+    InformationMatrixReducer(InformationMatrixReducer &o, tbb::split)
         : corres(o.corres), xyz_t(o.xyz_t), GTG(Eigen::Matrix6d::Zero()) {}
 
-    void operator()(const tbb::blocked_range<std::size_t>& range) {
+    void operator()(const tbb::blocked_range<std::size_t> &range) {
         // write q^*
         // see http://redwood-data.org/indoor/registration.html
         // note: I comes first in this implementation
@@ -264,9 +271,7 @@ struct InformationMatrixReducer {
         }
     }
 
-    void join(InformationMatrixReducer& other) {
-        GTG += other.GTG;
-    }
+    void join(InformationMatrixReducer &other) { GTG += other.GTG; }
 };
 
 static Eigen::Matrix6d CreateInformationMatrix(
@@ -283,8 +288,10 @@ static Eigen::Matrix6d CreateInformationMatrix(
             depth_t, pinhole_camera_intrinsic.intrinsic_matrix_);
 
     InformationMatrixReducer reducer(correspondence, *xyz_t.get());
-    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(0,
-        correspondence.size(), utility::DefaultGrainSizeTBB()), reducer);
+    tbb::parallel_reduce(
+            tbb::blocked_range<std::size_t>(0, correspondence.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            reducer);
     return std::move(reducer.GTG);
 }
 
diff --git a/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp b/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp
index da9dab27a82..5b9f7a8aab7 100644
--- a/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp
+++ b/cpp/open3d/pipelines/registration/FastGlobalRegistration.cpp
@@ -7,17 +7,18 @@
 
 #include "open3d/pipelines/registration/FastGlobalRegistration.h"
 
-#include <map>
 #include <tbb/parallel_for.h>
 #include <tbb/parallel_invoke.h>
 
+#include <map>
+
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/pipelines/registration/Feature.h"
 #include "open3d/pipelines/registration/Registration.h"
 #include "open3d/utility/Logging.h"
-#include "open3d/utility/Random.h"
 #include "open3d/utility/Parallel.h"
+#include "open3d/utility/Random.h"
 
 namespace open3d {
 namespace pipelines {
@@ -30,33 +31,39 @@ static std::vector<std::pair<int, int>> InitialMatching(
     std::vector<int> corres_ij(src_features.data_.cols(), -1);
     std::vector<int> corres_ji(dst_features.data_.cols(), -1);
 
-    tbb::parallel_invoke([&](){
-        tbb::parallel_for(tbb::blocked_range<int>(0,
-                src_features.data_.cols(), utility::DefaultGrainSizeTBB()),
-                [&](const tbb::blocked_range<int>& range) {
-            for (int i = range.begin(); i < range.end(); ++i) {
-                std::vector<int> corres_tmp(1);
-                std::vector<double> dist_tmp(1);
-                dst_feature_tree.SearchKNN(
-                        Eigen::VectorXd(src_features.data_.col(i)),
-                        1, corres_tmp, dist_tmp);
-                corres_ij[i] = corres_tmp[0];
-            }
-        });
-    }, [&](){
-        tbb::parallel_for(tbb::blocked_range<int>(0,
-                dst_features.data_.cols(), utility::DefaultGrainSizeTBB()),
-                [&](const tbb::blocked_range<int>& range) {
-            for (int j = range.begin(); j < range.end(); ++j) {
-                std::vector<int> corres_tmp(1);
-                std::vector<double> dist_tmp(1);
-                src_feature_tree.SearchKNN(
-                        Eigen::VectorXd(dst_features.data_.col(j)),
-                        1, corres_tmp, dist_tmp);
-                corres_ji[j] = corres_tmp[0];
-            }
-        });
-    });
+    tbb::parallel_invoke(
+            [&]() {
+                tbb::parallel_for(
+                        tbb::blocked_range<int>(0, src_features.data_.cols(),
+                                                utility::DefaultGrainSizeTBB()),
+                        [&](const tbb::blocked_range<int>& range) {
+                            for (int i = range.begin(); i < range.end(); ++i) {
+                                std::vector<int> corres_tmp(1);
+                                std::vector<double> dist_tmp(1);
+                                dst_feature_tree.SearchKNN(
+                                        Eigen::VectorXd(
+                                                src_features.data_.col(i)),
+                                        1, corres_tmp, dist_tmp);
+                                corres_ij[i] = corres_tmp[0];
+                            }
+                        });
+            },
+            [&]() {
+                tbb::parallel_for(
+                        tbb::blocked_range<int>(0, dst_features.data_.cols(),
+                                                utility::DefaultGrainSizeTBB()),
+                        [&](const tbb::blocked_range<int>& range) {
+                            for (int j = range.begin(); j < range.end(); ++j) {
+                                std::vector<int> corres_tmp(1);
+                                std::vector<double> dist_tmp(1);
+                                src_feature_tree.SearchKNN(
+                                        Eigen::VectorXd(
+                                                dst_features.data_.col(j)),
+                                        1, corres_tmp, dist_tmp);
+                                corres_ji[j] = corres_tmp[0];
+                            }
+                        });
+            });
 
     utility::LogDebug("\t[cross check] ");
     std::vector<std::pair<int, int>> corres_cross;
diff --git a/cpp/open3d/pipelines/registration/Feature.cpp b/cpp/open3d/pipelines/registration/Feature.cpp
index 73580575b02..61c3ffe6511 100644
--- a/cpp/open3d/pipelines/registration/Feature.cpp
+++ b/cpp/open3d/pipelines/registration/Feature.cpp
@@ -7,10 +7,10 @@
 
 #include "open3d/pipelines/registration/Feature.h"
 
-#include <Eigen/Dense>
-
 #include <tbb/parallel_for.h>
 
+#include <Eigen/Dense>
+
 #include "open3d/geometry/KDTreeFlann.h"
 #include "open3d/geometry/PointCloud.h"
 #include "open3d/utility/Logging.h"
@@ -60,38 +60,41 @@ static std::shared_ptr<Feature> ComputeSPFHFeature(
         const geometry::KDTreeSearchParam &search_param) {
     auto feature = std::make_shared<Feature>();
     feature->Resize(33, (int)input.points_.size());
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, input.points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            const auto &point = input.points_[i];
-            const auto &normal = input.normals_[i];
-            std::vector<int> indices;
-            std::vector<double> distance2;
-            if (kdtree.Search(point, search_param, indices, distance2) > 1) {
-                // only compute SPFH feature when a point has neighbors
-                double hist_incr = 100.0 / (double)(indices.size() - 1);
-                for (size_t k = 1; k < indices.size(); k++) {
-                    // skip the point itself, compute histogram
-                    auto pf = ComputePairFeatures(point, normal,
-                                                  input.points_[indices[k]],
-                                                  input.normals_[indices[k]]);
-                    int h_index = (int)(floor(11 * (pf(0) + M_PI) / (2.0 * M_PI)));
-                    if (h_index < 0) h_index = 0;
-                    if (h_index >= 11) h_index = 10;
-                    feature->data_(h_index, i) += hist_incr;
-                    h_index = (int)(floor(11 * (pf(1) + 1.0) * 0.5));
-                    if (h_index < 0) h_index = 0;
-                    if (h_index >= 11) h_index = 10;
-                    feature->data_(h_index + 11, i) += hist_incr;
-                    h_index = (int)(floor(11 * (pf(2) + 1.0) * 0.5));
-                    if (h_index < 0) h_index = 0;
-                    if (h_index >= 11) h_index = 10;
-                    feature->data_(h_index + 22, i) += hist_incr;
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, input.points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    const auto &point = input.points_[i];
+                    const auto &normal = input.normals_[i];
+                    std::vector<int> indices;
+                    std::vector<double> distance2;
+                    if (kdtree.Search(point, search_param, indices, distance2) >
+                        1) {
+                        // only compute SPFH feature when a point has neighbors
+                        double hist_incr = 100.0 / (double)(indices.size() - 1);
+                        for (size_t k = 1; k < indices.size(); k++) {
+                            // skip the point itself, compute histogram
+                            auto pf = ComputePairFeatures(
+                                    point, normal, input.points_[indices[k]],
+                                    input.normals_[indices[k]]);
+                            int h_index = (int)(floor(11 * (pf(0) + M_PI) /
+                                                      (2.0 * M_PI)));
+                            if (h_index < 0) h_index = 0;
+                            if (h_index >= 11) h_index = 10;
+                            feature->data_(h_index, i) += hist_incr;
+                            h_index = (int)(floor(11 * (pf(1) + 1.0) * 0.5));
+                            if (h_index < 0) h_index = 0;
+                            if (h_index >= 11) h_index = 10;
+                            feature->data_(h_index + 11, i) += hist_incr;
+                            h_index = (int)(floor(11 * (pf(2) + 1.0) * 0.5));
+                            if (h_index < 0) h_index = 0;
+                            if (h_index >= 11) h_index = 10;
+                            feature->data_(h_index + 22, i) += hist_incr;
+                        }
+                    }
                 }
-            }
-        }
-    });
+            });
     return feature;
 }
 
@@ -110,39 +113,41 @@ std::shared_ptr<Feature> ComputeFPFHFeature(
         utility::LogError("Internal error: SPFH feature is nullptr.");
     }
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, input.points_.size(), utility::DefaultGrainSizeTBB()),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            const auto &point = input.points_[i];
-            std::vector<int> indices;
-            std::vector<double> distance2;
-            if (kdtree.Search(point, search_param, indices, distance2) > 1) {
-                double sum[3] = {0.0, 0.0, 0.0};
-                for (size_t k = 1; k < indices.size(); k++) {
-                    // skip the point itself
-                    double dist = distance2[k];
-                    if (dist == 0.0) continue;
-                    for (int j = 0; j < 33; j++) {
-                        double val = spfh->data_(j, indices[k]) / dist;
-                        sum[j / 11] += val;
-                        feature->data_(j, i) += val;
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, input.points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    const auto &point = input.points_[i];
+                    std::vector<int> indices;
+                    std::vector<double> distance2;
+                    if (kdtree.Search(point, search_param, indices, distance2) >
+                        1) {
+                        double sum[3] = {0.0, 0.0, 0.0};
+                        for (size_t k = 1; k < indices.size(); k++) {
+                            // skip the point itself
+                            double dist = distance2[k];
+                            if (dist == 0.0) continue;
+                            for (int j = 0; j < 33; j++) {
+                                double val = spfh->data_(j, indices[k]) / dist;
+                                sum[j / 11] += val;
+                                feature->data_(j, i) += val;
+                            }
+                        }
+                        for (int j = 0; j < 3; j++)
+                            if (sum[j] != 0.0) sum[j] = 100.0 / sum[j];
+                        for (int j = 0; j < 33; j++) {
+                            feature->data_(j, i) *= sum[j / 11];
+                            // The commented line is the fpfh function in the
+                            // paper. But according to PCL implementation, it is
+                            // skipped. Our initial test shows that the full
+                            // fpfh function in the paper seems to be better
+                            // than PCL implementation. Further test required.
+                            feature->data_(j, i) += spfh->data_(j, i);
+                        }
                     }
                 }
-                for (int j = 0; j < 3; j++)
-                    if (sum[j] != 0.0) sum[j] = 100.0 / sum[j];
-                for (int j = 0; j < 33; j++) {
-                    feature->data_(j, i) *= sum[j / 11];
-                    // The commented line is the fpfh function in the paper.
-                    // But according to PCL implementation, it is skipped.
-                    // Our initial test shows that the full fpfh function in the
-                    // paper seems to be better than PCL implementation. Further
-                    // test required.
-                    feature->data_(j, i) += spfh->data_(j, i);
-                }
-            }
-        }
-    });
+            });
     return feature;
 }
 
@@ -160,29 +165,35 @@ CorrespondenceSet CorrespondencesFromFeatures(const Feature &source_features,
                                int(target_features.data_.cols())};
     std::vector<CorrespondenceSet> corres(num_searches);
 
-    tbb::parallel_for(tbb::blocked_range<int>(0, num_searches, 1),
-            [&](const tbb::blocked_range<int>& range) {
-        for (int k = range.begin(); k < range.end(); ++k) {
-            geometry::KDTreeFlann kdtree(features[1 - k]);
-
-            int num_pts_k = num_pts[k];
-            corres[k] = CorrespondenceSet(num_pts_k);
-            tbb::parallel_for(tbb::blocked_range<int>(
-                    0, num_pts_k, utility::DefaultGrainSizeTBB()),
-                    [&](const tbb::blocked_range<int>& range) {
-                for (int i = range.begin(); i < range.end(); ++i) {
-                    std::vector<int> corres_tmp(1);
-                    std::vector<double> dist_tmp(1);
-
-                    kdtree.SearchKNN(
-                            Eigen::VectorXd(features[k].get().data_.col(i)),
-                            1, corres_tmp, dist_tmp);
-                    int j = corres_tmp[0];
-                    corres[k][i] = Eigen::Vector2i(i, j);
+    tbb::parallel_for(
+            tbb::blocked_range<int>(0, num_searches, 1),
+            [&](const tbb::blocked_range<int> &range) {
+                for (int k = range.begin(); k < range.end(); ++k) {
+                    geometry::KDTreeFlann kdtree(features[1 - k]);
+
+                    int num_pts_k = num_pts[k];
+                    corres[k] = CorrespondenceSet(num_pts_k);
+                    tbb::parallel_for(
+                            tbb::blocked_range<int>(
+                                    0, num_pts_k,
+                                    utility::DefaultGrainSizeTBB()),
+                            [&](const tbb::blocked_range<int> &range) {
+                                for (int i = range.begin(); i < range.end();
+                                     ++i) {
+                                    std::vector<int> corres_tmp(1);
+                                    std::vector<double> dist_tmp(1);
+
+                                    kdtree.SearchKNN(
+                                            Eigen::VectorXd(
+                                                    features[k].get().data_.col(
+                                                            i)),
+                                            1, corres_tmp, dist_tmp);
+                                    int j = corres_tmp[0];
+                                    corres[k][i] = Eigen::Vector2i(i, j);
+                                }
+                            });
                 }
             });
-        }
-    });
 
     // corres[0]: corres_ij, corres[1]: corres_ji
     if (!mutual_filter) return corres[0];
diff --git a/cpp/open3d/pipelines/registration/GeneralizedICP.cpp b/cpp/open3d/pipelines/registration/GeneralizedICP.cpp
index 99911fb0327..02039132f2e 100644
--- a/cpp/open3d/pipelines/registration/GeneralizedICP.cpp
+++ b/cpp/open3d/pipelines/registration/GeneralizedICP.cpp
@@ -12,9 +12,10 @@
 
 #include "open3d/pipelines/registration/GeneralizedICP.h"
 
+#include <tbb/parallel_for.h>
+
 #include <Eigen/Dense>
 #include <unsupported/Eigen/MatrixFunctions>
-#include <tbb/parallel_for.h>
 
 #include "open3d/geometry/KDTreeSearchParam.h"
 #include "open3d/geometry/PointCloud.h"
@@ -66,14 +67,14 @@ std::shared_ptr<geometry::PointCloud> InitializePointCloudForGeneralizedICP(
     output->covariances_.resize(output->points_.size());
     const Eigen::Matrix3d C = Eigen::Vector3d(epsilon, 1, 1).asDiagonal();
 
-    tbb::parallel_for(tbb::blocked_range<std::size_t>(
-            0, output->normals_.size(), 1),
-            [&](const tbb::blocked_range<std::size_t>& range) {
-        for (std::size_t i = range.begin(); i < range.end(); ++i) {
-            const auto Rx = GetRotationFromE1ToX(output->normals_[i]);
-            output->covariances_[i] = Rx * C * Rx.transpose();
-        }
-    });
+    tbb::parallel_for(
+            tbb::blocked_range<std::size_t>(0, output->normals_.size(), 1),
+            [&](const tbb::blocked_range<std::size_t> &range) {
+                for (std::size_t i = range.begin(); i < range.end(); ++i) {
+                    const auto Rx = GetRotationFromE1ToX(output->normals_[i]);
+                    output->covariances_[i] = Rx * C * Rx.transpose();
+                }
+            });
 
     return output;
 }
diff --git a/cpp/open3d/pipelines/registration/Registration.cpp b/cpp/open3d/pipelines/registration/Registration.cpp
index a418ee9038a..a8e0a9b6858 100644
--- a/cpp/open3d/pipelines/registration/Registration.cpp
+++ b/cpp/open3d/pipelines/registration/Registration.cpp
@@ -33,21 +33,26 @@ struct RegistrationReduction {
     RegistrationReduction(const geometry::KDTreeFlann& target_kdtree_,
                           const geometry::PointCloud& source_,
                           double max_correspondence_distance_)
-        : target_kdtree(target_kdtree_), source(source_),
+        : target_kdtree(target_kdtree_),
+          source(source_),
           max_distance(max_correspondence_distance_),
-          correspondences(), error2(0.0) {}
+          correspondences(),
+          error2(0.0) {}
 
     RegistrationReduction(RegistrationReduction& other, tbb::split)
-        : target_kdtree(other.target_kdtree), source(other.source),
-          max_distance(other.max_distance), correspondences(), error2(0.0) {}
+        : target_kdtree(other.target_kdtree),
+          source(other.source),
+          max_distance(other.max_distance),
+          correspondences(),
+          error2(0.0) {}
 
     void operator()(const tbb::blocked_range<std::size_t>& range) {
         for (std::size_t i = range.begin(); i < range.end(); ++i) {
             std::vector<int> indices(1);
             std::vector<double> dists(1);
-            const auto &point = source.points_[i];
-            if (target_kdtree.SearchHybrid(point,
-                    max_distance, 1, indices, dists) > 0) {
+            const auto& point = source.points_[i];
+            if (target_kdtree.SearchHybrid(point, max_distance, 1, indices,
+                                           dists) > 0) {
                 error2 += dists[0];
                 correspondences.emplace_back(i, indices[0]);
             }
@@ -56,7 +61,8 @@ struct RegistrationReduction {
 
     void join(RegistrationReduction& other) {
         correspondences.insert(correspondences.end(),
-            other.correspondences.begin(), other.correspondences.end());
+                               other.correspondences.begin(),
+                               other.correspondences.end());
         error2 += other.error2;
     }
 
@@ -66,24 +72,26 @@ struct RegistrationReduction {
 };
 
 static RegistrationResult GetRegistrationResultAndCorrespondences(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
-        const geometry::KDTreeFlann &target_kdtree,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
+        const geometry::KDTreeFlann& target_kdtree,
         double max_correspondence_distance,
-        const Eigen::Matrix4d &transformation) {
+        const Eigen::Matrix4d& transformation) {
     RegistrationResult result(transformation);
     if (max_correspondence_distance <= 0.0) {
         return result;
     }
 
-    RegistrationReduction reducer(target_kdtree,
-        source, max_correspondence_distance);
-    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
-        0, source.points_.size(), utility::DefaultGrainSizeTBB()), reducer);
+    RegistrationReduction reducer(target_kdtree, source,
+                                  max_correspondence_distance);
+    tbb::parallel_reduce(
+            tbb::blocked_range<std::size_t>(0, source.points_.size(),
+                                            utility::DefaultGrainSizeTBB()),
+            reducer);
 
     double error2;
-    std::tie(result.correspondence_set_, error2)
-            = std::move(reducer).as_tuple();
+    std::tie(result.correspondence_set_, error2) =
+            std::move(reducer).as_tuple();
 
     if (result.correspondence_set_.empty()) {
         result.fitness_ = 0.0;
@@ -97,16 +105,16 @@ static RegistrationResult GetRegistrationResultAndCorrespondences(
 }
 
 static double EvaluateInlierCorrespondenceRatio(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
-        const CorrespondenceSet &corres,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
+        const CorrespondenceSet& corres,
         double max_correspondence_distance,
-        const Eigen::Matrix4d &transformation) {
+        const Eigen::Matrix4d& transformation) {
     RegistrationResult result(transformation);
 
     int inlier_corres = 0;
     double max_dis2 = max_correspondence_distance * max_correspondence_distance;
-    for (const auto &c : corres) {
+    for (const auto& c : corres) {
         double dis2 =
                 (source.points_[c[0]] - target.points_[c[1]]).squaredNorm();
         if (dis2 < max_dis2) {
@@ -118,11 +126,11 @@ static double EvaluateInlierCorrespondenceRatio(
 }
 
 RegistrationResult EvaluateRegistration(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
         double max_correspondence_distance,
-        const Eigen::Matrix4d
-                &transformation /* = Eigen::Matrix4d::Identity()*/) {
+        const Eigen::Matrix4d&
+                transformation /* = Eigen::Matrix4d::Identity()*/) {
     geometry::KDTreeFlann kdtree;
     kdtree.SetGeometry(target);
     geometry::PointCloud pcd = source;
@@ -134,14 +142,14 @@ RegistrationResult EvaluateRegistration(
 }
 
 RegistrationResult RegistrationICP(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
         double max_correspondence_distance,
-        const Eigen::Matrix4d &init /* = Eigen::Matrix4d::Identity()*/,
-        const TransformationEstimation &estimation
+        const Eigen::Matrix4d& init /* = Eigen::Matrix4d::Identity()*/,
+        const TransformationEstimation& estimation
         /* = TransformationEstimationPointToPoint(false)*/,
-        const ICPConvergenceCriteria
-                &criteria /* = ICPConvergenceCriteria()*/) {
+        const ICPConvergenceCriteria&
+                criteria /* = ICPConvergenceCriteria()*/) {
     if (max_correspondence_distance <= 0.0) {
         utility::LogError("Invalid max_correspondence_distance.");
     }
@@ -198,7 +206,8 @@ RegistrationResult RegistrationICP(
 template <typename T>
 void atomic_min(std::atomic<T>& min_val, const T& val) noexcept {
     T prev_val = min_val;
-    while (prev_val > val && min_val.compare_exchange_weak(prev_val, val));
+    while (prev_val > val && min_val.compare_exchange_weak(prev_val, val))
+        ;
 }
 
 struct RANSACCorrespondenceReduction {
@@ -207,8 +216,8 @@ struct RANSACCorrespondenceReduction {
     const geometry::PointCloud& target;
     const CorrespondenceSet& corres;
     const TransformationEstimation& estimation;
-    using CheckerType = std::vector<std::reference_wrapper<
-            const CorrespondenceChecker>>;
+    using CheckerType =
+            std::vector<std::reference_wrapper<const CorrespondenceChecker>>;
     const CheckerType& checkers;
     const geometry::KDTreeFlann& kdtree;
     std::atomic<int>& est_k_global;
@@ -223,7 +232,6 @@ struct RANSACCorrespondenceReduction {
     CorrespondenceSet ransac_corres;
     RegistrationResult best_result;
 
-
     RANSACCorrespondenceReduction(const geometry::PointCloud& source_,
                                   const geometry::PointCloud& target_,
                                   const CorrespondenceSet& corres_,
@@ -236,18 +244,33 @@ struct RANSACCorrespondenceReduction {
                                   double max_dist_,
                                   int ransac_n_,
                                   double confidence)
-        : source(source_), target(target_), corres(corres_),
-          estimation(estimation_), checkers(checkers_), kdtree(kdtree_),
-          est_k_global(est_k_global_), total_validation(total_validation_),
-          rand_gen(rand_gen_), max_distance(max_dist_), ransac_n(ransac_n_),
-          log_confidence(std::log(1.0 - confidence)), ransac_corres(ransac_n) {}
-
-    RANSACCorrespondenceReduction(RANSACCorrespondenceReduction& o,
-        tbb::split): source(o.source), target(o.target), corres(o.corres),
-            estimation(o.estimation), checkers(o.checkers), kdtree(o.kdtree),
-            est_k_global(o.est_k_global), total_validation(o.total_validation),
-            rand_gen(o.rand_gen), max_distance(o.max_distance),
-            ransac_n(o.ransac_n), log_confidence(o.log_confidence) {}
+        : source(source_),
+          target(target_),
+          corres(corres_),
+          estimation(estimation_),
+          checkers(checkers_),
+          kdtree(kdtree_),
+          est_k_global(est_k_global_),
+          total_validation(total_validation_),
+          rand_gen(rand_gen_),
+          max_distance(max_dist_),
+          ransac_n(ransac_n_),
+          log_confidence(std::log(1.0 - confidence)),
+          ransac_corres(ransac_n) {}
+
+    RANSACCorrespondenceReduction(RANSACCorrespondenceReduction& o, tbb::split)
+        : source(o.source),
+          target(o.target),
+          corres(o.corres),
+          estimation(o.estimation),
+          checkers(o.checkers),
+          kdtree(o.kdtree),
+          est_k_global(o.est_k_global),
+          total_validation(o.total_validation),
+          rand_gen(o.rand_gen),
+          max_distance(o.max_distance),
+          ransac_n(o.ransac_n),
+          log_confidence(o.log_confidence) {}
 
     void operator()(const tbb::blocked_range<int>& range) {
         int est_k_local = est_k_global;
@@ -263,8 +286,11 @@ struct RANSACCorrespondenceReduction {
 
                 // Check transformation: inexpensive
                 if (!std::all_of(checkers.begin(), checkers.end(),
-                        [&](const auto& checker){ return checker.get().Check(
-                    source, target, ransac_corres, transformation); })) {
+                                 [&](const auto& checker) {
+                                     return checker.get().Check(source, target,
+                                                                ransac_corres,
+                                                                transformation);
+                                 })) {
                     continue;
                 }
 
@@ -313,16 +339,16 @@ struct RANSACCorrespondenceReduction {
 };
 
 RegistrationResult RegistrationRANSACBasedOnCorrespondence(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
-        const CorrespondenceSet &corres,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
+        const CorrespondenceSet& corres,
         double max_correspondence_distance,
-        const TransformationEstimation &estimation
+        const TransformationEstimation& estimation
         /* = TransformationEstimationPointToPoint(false)*/,
         int ransac_n /* = 3*/,
-        const std::vector<std::reference_wrapper<const CorrespondenceChecker>>
-                &checkers /* = {}*/,
-        const RANSACConvergenceCriteria &criteria
+        const std::vector<std::reference_wrapper<const CorrespondenceChecker>>&
+                checkers /* = {}*/,
+        const RANSACConvergenceCriteria& criteria
         /* = RANSACConvergenceCriteria()*/) {
     if (ransac_n < 3 || (int)corres.size() < ransac_n ||
         max_correspondence_distance <= 0.0) {
@@ -333,11 +359,14 @@ RegistrationResult RegistrationRANSACBasedOnCorrespondence(
     std::atomic<int> est_k_global = criteria.max_iteration_;
     std::atomic<int> total_validation = 0;
     utility::random::UniformIntGenerator<int> rand_gen(0, corres.size() - 1);
-    RANSACCorrespondenceReduction reducer(source, target, corres, estimation,
-        checkers, kdtree, est_k_global, total_validation, rand_gen,
-        max_correspondence_distance, ransac_n, criteria.confidence_);
-    tbb::parallel_reduce(tbb::blocked_range<int>(
-        0, criteria.max_iteration_, utility::DefaultGrainSizeTBB()), reducer);
+    RANSACCorrespondenceReduction reducer(
+            source, target, corres, estimation, checkers, kdtree, est_k_global,
+            total_validation, rand_gen, max_correspondence_distance, ransac_n,
+            criteria.confidence_);
+    tbb::parallel_reduce(
+            tbb::blocked_range<int>(0, criteria.max_iteration_,
+                                    utility::DefaultGrainSizeTBB()),
+            reducer);
     auto best_result = std::move(reducer.best_result);
     utility::LogDebug(
             "RANSAC exits after {:d} validations. Best inlier ratio {:e}, "
@@ -347,19 +376,19 @@ RegistrationResult RegistrationRANSACBasedOnCorrespondence(
 }
 
 RegistrationResult RegistrationRANSACBasedOnFeatureMatching(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
-        const Feature &source_features,
-        const Feature &target_features,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
+        const Feature& source_features,
+        const Feature& target_features,
         bool mutual_filter,
         double max_correspondence_distance,
-        const TransformationEstimation
-                &estimation /* = TransformationEstimationPointToPoint(false)*/,
+        const TransformationEstimation&
+                estimation /* = TransformationEstimationPointToPoint(false)*/,
         int ransac_n /* = 3*/,
-        const std::vector<std::reference_wrapper<const CorrespondenceChecker>>
-                &checkers /* = {}*/,
-        const RANSACConvergenceCriteria
-                &criteria /* = RANSACConvergenceCriteria()*/) {
+        const std::vector<std::reference_wrapper<const CorrespondenceChecker>>&
+                checkers /* = {}*/,
+        const RANSACConvergenceCriteria&
+                criteria /* = RANSACConvergenceCriteria()*/) {
     if (ransac_n < 3 || max_correspondence_distance <= 0.0) {
         return RegistrationResult();
     }
@@ -414,16 +443,14 @@ struct InformationMatrixReducer {
         }
     }
 
-    void join(InformationMatrixReducer& other) {
-        GTG += other.GTG;
-    }
+    void join(InformationMatrixReducer& other) { GTG += other.GTG; }
 };
 
 Eigen::Matrix6d GetInformationMatrixFromPointClouds(
-        const geometry::PointCloud &source,
-        const geometry::PointCloud &target,
+        const geometry::PointCloud& source,
+        const geometry::PointCloud& target,
         double max_correspondence_distance,
-        const Eigen::Matrix4d &transformation) {
+        const Eigen::Matrix4d& transformation) {
     geometry::PointCloud pcd = source;
     if (!transformation.isIdentity()) {
         pcd.Transform(transformation);
@@ -435,11 +462,10 @@ Eigen::Matrix6d GetInformationMatrixFromPointClouds(
             transformation);
 
     InformationMatrixReducer reducer(result.correspondence_set_, target);
-    tbb::parallel_reduce(
-            tbb::blocked_range<std::size_t>(0,
-                result.correspondence_set_.size(),
-                utility::DefaultGrainSizeTBB()),
-            reducer);
+    tbb::parallel_reduce(tbb::blocked_range<std::size_t>(
+                                 0, result.correspondence_set_.size(),
+                                 utility::DefaultGrainSizeTBB()),
+                         reducer);
     return std::move(reducer.GTG);
 }
 
diff --git a/cpp/open3d/pipelines/registration/Registration.h b/cpp/open3d/pipelines/registration/Registration.h
index 6eeabfdf1c7..36a9e0274c0 100644
--- a/cpp/open3d/pipelines/registration/Registration.h
+++ b/cpp/open3d/pipelines/registration/Registration.h
@@ -109,10 +109,10 @@ class RegistrationResult {
                                              inlier_rmse_ < other.inlier_rmse_);
     }
 
-    RegistrationResult(const RegistrationResult&) = default;
-    RegistrationResult(RegistrationResult&&) = default;
-    RegistrationResult& operator=(const RegistrationResult&) = default;
-    RegistrationResult& operator=(RegistrationResult&&) = default;
+    RegistrationResult(const RegistrationResult &) = default;
+    RegistrationResult(RegistrationResult &&) = default;
+    RegistrationResult &operator=(const RegistrationResult &) = default;
+    RegistrationResult &operator=(RegistrationResult &&) = default;
 
 public:
     /// The estimated transformation matrix.
diff --git a/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp b/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp
index 9fea392807d..c648fc7e496 100644
--- a/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp
+++ b/cpp/open3d/t/geometry/kernel/PointCloudCPU.cpp
@@ -5,10 +5,10 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
-#include "open3d/t/geometry/kernel/PointCloudImpl.h"
-
-#include <tbb/spin_mutex.h>
 #include <tbb/parallel_for.h>
+#include <tbb/spin_mutex.h>
+
+#include "open3d/t/geometry/kernel/PointCloudImpl.h"
 
 namespace open3d {
 namespace t {
@@ -42,43 +42,49 @@ void ProjectCPU(
     }
 
     tbb::spin_mutex mtx;
-    core::ParallelFor(core::Device("CPU:0"), points.GetLength(),
+    core::ParallelFor(
+            core::Device("CPU:0"), points.GetLength(),
             [&](int64_t workload_idx) {
-        float x = points_ptr[3 * workload_idx + 0];
-        float y = points_ptr[3 * workload_idx + 1];
-        float z = points_ptr[3 * workload_idx + 2];
+                float x = points_ptr[3 * workload_idx + 0];
+                float y = points_ptr[3 * workload_idx + 1];
+                float z = points_ptr[3 * workload_idx + 2];
 
-        // coordinate in camera (in voxel -> in meter)
-        float xc, yc, zc, u, v;
-        transform_indexer.RigidTransform(x, y, z, &xc, &yc, &zc);
+                // coordinate in camera (in voxel -> in meter)
+                float xc, yc, zc, u, v;
+                transform_indexer.RigidTransform(x, y, z, &xc, &yc, &zc);
 
-        // coordinate in image (in pixel)
-        transform_indexer.Project(xc, yc, zc, &u, &v);
-        u = round(u);
-        v = round(v);
-        if (!depth_indexer.InBoundary(u, v) || zc <= 0 || zc > depth_max) {
-            return;
-        }
+                // coordinate in image (in pixel)
+                transform_indexer.Project(xc, yc, zc, &u, &v);
+                u = round(u);
+                v = round(v);
+                if (!depth_indexer.InBoundary(u, v) || zc <= 0 ||
+                    zc > depth_max) {
+                    return;
+                }
 
-        float* depth_ptr = depth_indexer.GetDataPtr<float>(
-                static_cast<int64_t>(u), static_cast<int64_t>(v));
-        float d = zc * depth_scale;
-        {
-            tbb::spin_mutex::scoped_lock lock(mtx);
-            if (*depth_ptr == 0 || *depth_ptr >= d) {
-                *depth_ptr = d;
+                float* depth_ptr = depth_indexer.GetDataPtr<float>(
+                        static_cast<int64_t>(u), static_cast<int64_t>(v));
+                float d = zc * depth_scale;
+                {
+                    tbb::spin_mutex::scoped_lock lock(mtx);
+                    if (*depth_ptr == 0 || *depth_ptr >= d) {
+                        *depth_ptr = d;
 
-                if (has_colors) {
-                    float* color_ptr = color_indexer.GetDataPtr<float>(
-                            static_cast<int64_t>(u), static_cast<int64_t>(v));
+                        if (has_colors) {
+                            float* color_ptr = color_indexer.GetDataPtr<float>(
+                                    static_cast<int64_t>(u),
+                                    static_cast<int64_t>(v));
 
-                    color_ptr[0] = point_colors_ptr[3 * workload_idx + 0];
-                    color_ptr[1] = point_colors_ptr[3 * workload_idx + 1];
-                    color_ptr[2] = point_colors_ptr[3 * workload_idx + 2];
+                            color_ptr[0] =
+                                    point_colors_ptr[3 * workload_idx + 0];
+                            color_ptr[1] =
+                                    point_colors_ptr[3 * workload_idx + 1];
+                            color_ptr[2] =
+                                    point_colors_ptr[3 * workload_idx + 2];
+                        }
+                    }
                 }
-            }
-        }
-    });
+            });
 }
 
 }  // namespace pointcloud
diff --git a/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp b/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp
index e135caea14d..0c916233938 100644
--- a/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp
+++ b/cpp/open3d/t/io/sensor/RGBDVideoReader.cpp
@@ -7,10 +7,10 @@
 
 #include "open3d/t/io/sensor/RGBDVideoReader.h"
 
-#include <string>
-
 #include <tbb/parallel_invoke.h>
 
+#include <string>
+
 #include "open3d/io/IJsonConvertibleIO.h"
 #include "open3d/io/ImageIO.h"
 #include "open3d/t/io/sensor/realsense/RSBagReader.h"
@@ -55,19 +55,21 @@ void RGBDVideoReader::SaveFrames(const std::string &frame_path,
     open3d::geometry::Image im_color, im_depth;
     for (auto tim_rgbd = NextFrame(); !IsEOF() && GetTimestamp() < end_time;
          ++idx, tim_rgbd = NextFrame())
-    tbb::parallel_invoke([&](){
-            im_color = tim_rgbd.color_.ToLegacy();
-            auto color_file =
-                    fmt::format("{0}/color/{1:05d}.jpg", frame_path, idx);
-            open3d::io::WriteImage(color_file, im_color);
-            utility::LogDebug("Written color image to {}", color_file);
-        }, [&](){
-            im_depth = tim_rgbd.depth_.ToLegacy();
-            auto depth_file =
-                    fmt::format("{0}/depth/{1:05d}.png", frame_path, idx);
-            open3d::io::WriteImage(depth_file, im_depth);
-            utility::LogDebug("Written depth image to {}", depth_file);
-    });
+        tbb::parallel_invoke(
+                [&]() {
+                    im_color = tim_rgbd.color_.ToLegacy();
+                    auto color_file = fmt::format("{0}/color/{1:05d}.jpg",
+                                                  frame_path, idx);
+                    open3d::io::WriteImage(color_file, im_color);
+                    utility::LogDebug("Written color image to {}", color_file);
+                },
+                [&]() {
+                    im_depth = tim_rgbd.depth_.ToLegacy();
+                    auto depth_file = fmt::format("{0}/depth/{1:05d}.png",
+                                                  frame_path, idx);
+                    open3d::io::WriteImage(depth_file, im_depth);
+                    utility::LogDebug("Written depth image to {}", depth_file);
+                });
 
     utility::LogInfo("Written {} depth and color images to {}/{{depth,color}}/",
                      idx, frame_path);
diff --git a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
index 23f3fced488..4d572a4fc41 100644
--- a/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
+++ b/cpp/open3d/t/pipelines/kernel/FillInLinearSystemImpl.h
@@ -56,15 +56,15 @@ void FillInRigidAlignmentTermCPU
 #if !defined(__CUDACC__)
     tbb::spin_mutex fill_alignment_mutex;
     tbb::profiling::set_name(fill_alignment_mutex,
-        "FillInRigidAlignmentTermCPU");
+                             "FillInRigidAlignmentTermCPU");
 #define LOCAL_LAMBDA_CAPTURE =, &fill_alignment_mutex
 #else
 #defined LOCAL_LAMBDA_CAPTURE =
 #endif
 
     core::ParallelFor(
-            AtA.GetDevice(), n, [LOCAL_LAMBDA_CAPTURE]
-                    OPEN3D_DEVICE(int64_t workload_idx) {
+            AtA.GetDevice(), n,
+            [LOCAL_LAMBDA_CAPTURE] OPEN3D_DEVICE(int64_t workload_idx) {
                 const float *p_prime = Ti_ps_ptr + 3 * workload_idx;
                 const float *q_prime = Tj_qs_ptr + 3 * workload_idx;
                 const float *normal_p_prime =
@@ -100,19 +100,19 @@ void FillInRigidAlignmentTermCPU
                 }
                 atomicAdd(residual_ptr, r * r);
 #else
-        {
-            tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
-            for (int i_local = 0; i_local < 12; ++i_local) {
-                for (int j_local = 0; j_local < 12; ++j_local) {
-                    AtA_local_ptr[i_local * 12 + j_local]
-                      += J_ij[i_local] * J_ij[j_local];
-                 }
-                 Atb_local_ptr[i_local] += J_ij[i_local] * r;
-            }
-            *residual_ptr += r * r;
-        }
+                {
+                    tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
+                    for (int i_local = 0; i_local < 12; ++i_local) {
+                        for (int j_local = 0; j_local < 12; ++j_local) {
+                            AtA_local_ptr[i_local * 12 + j_local] +=
+                                    J_ij[i_local] * J_ij[j_local];
+                        }
+                        Atb_local_ptr[i_local] += J_ij[i_local] * r;
+                    }
+                    *residual_ptr += r * r;
+                }
 #endif
-    });
+            });
 #undef LOCAL_LAMBDA_CAPTURE
 
     // Then fill-in the large linear system
@@ -200,15 +200,15 @@ void FillInSLACAlignmentTermCPU
 #if !defined(__CUDACC__)
     tbb::spin_mutex fill_alignment_mutex;
     tbb::profiling::set_name(fill_alignment_mutex,
-        "FillInSLACAlignmentTermCPU");
+                             "FillInSLACAlignmentTermCPU");
 #define LOCAL_LAMBDA_CAPTURE =, &fill_alignment_mutex
 #else
 #defined LOCAL_LAMBDA_CAPTURE =
 #endif
 
     core::ParallelFor(
-            AtA.GetDevice(), n, [LOCAL_LAMBDA_CAPTURE]
-                    OPEN3D_DEVICE(int64_t workload_idx) {
+            AtA.GetDevice(), n,
+            [LOCAL_LAMBDA_CAPTURE] OPEN3D_DEVICE(int64_t workload_idx) {
                 const float *Ti_Cp = Ti_Cps_ptr + 3 * workload_idx;
                 const float *Tj_Cq = Tj_Cqs_ptr + 3 * workload_idx;
                 const float *Cnormal_p = Cnormal_ps_ptr + 3 * workload_idx;
@@ -284,19 +284,19 @@ void FillInSLACAlignmentTermCPU
                 }
                 atomicAdd(residual_ptr, r * r);
 #else
-        {
-            tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
-            for (int ki = 0; ki < 60; ++ki) {
-                for (int kj = 0; kj < 60; ++kj) {
-                    AtA_ptr[idx[ki] * n_vars + idx[kj]]
-                      += J[ki] * J[kj];
-                 }
-                 Atb_ptr[idx[ki]] += J[ki] * r;
-            }
-            *residual_ptr += r * r;
-        }
+                {
+                    tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
+                    for (int ki = 0; ki < 60; ++ki) {
+                        for (int kj = 0; kj < 60; ++kj) {
+                            AtA_ptr[idx[ki] * n_vars + idx[kj]] +=
+                                    J[ki] * J[kj];
+                        }
+                        Atb_ptr[idx[ki]] += J[ki] * r;
+                    }
+                    *residual_ptr += r * r;
+                }
 #endif
-    });
+            });
 #undef LOCAL_LAMBDA_CAPTURE
 }
 
@@ -338,14 +338,15 @@ void FillInSLACRegularizerTermCPU
 #if !defined(__CUDACC__)
     tbb::spin_mutex fill_alignment_mutex;
     tbb::profiling::set_name(fill_alignment_mutex,
-        "FillInSLACRegularizerTermCPU");
+                             "FillInSLACRegularizerTermCPU");
 #define LOCAL_LAMBDA_CAPTURE =, &fill_alignment_mutex
 #else
 #defined LOCAL_LAMBDA_CAPTURE =
 #endif
 
-    core::ParallelFor(AtA.GetDevice(), n, [LOCAL_LAMBDA_CAPTURE]
-                    OPEN3D_DEVICE(int64_t workload_idx) {
+    core::ParallelFor(
+            AtA.GetDevice(), n,
+            [LOCAL_LAMBDA_CAPTURE] OPEN3D_DEVICE(int64_t workload_idx) {
                 // Enumerate 6 neighbors
                 int idx_i = grid_idx_ptr[workload_idx];
 
@@ -477,34 +478,37 @@ void FillInSLACRegularizerTermCPU
                                       -weight * local_r[axis]);
                         }
 #else
-                {
-                    tbb::spin_mutex::scoped_lock lock(fill_alignment_mutex);
-                    // Update residual
-                    *residual_ptr += weight * (local_r[0] * local_r[0] +
-                                               local_r[1] * local_r[1] +
-                                               local_r[2] * local_r[2]);
-
-                    for (int axis = 0; axis < 3; ++axis) {
-                        // Update AtA: 2x2
-                        AtA_ptr[(offset_idx_i + axis) * n_vars +
-                                 offset_idx_i + axis] += weight;
-                        AtA_ptr[(offset_idx_k + axis) * n_vars +
-                                 offset_idx_k + axis] += weight;
-
-                        AtA_ptr[(offset_idx_i + axis) * n_vars +
-                                 offset_idx_k + axis] -= weight;
-                        AtA_ptr[(offset_idx_k + axis) * n_vars +
-                                 offset_idx_i + axis] -= weight;
-
-                        // Update Atb: 2x1
-                        Atb_ptr[offset_idx_i + axis] += weight * local_r[axis];
-                        Atb_ptr[offset_idx_k + axis] -= weight * local_r[axis];
-                    }
-                }
+                        {
+                            tbb::spin_mutex::scoped_lock lock(
+                                    fill_alignment_mutex);
+                            // Update residual
+                            *residual_ptr += weight * (local_r[0] * local_r[0] +
+                                                       local_r[1] * local_r[1] +
+                                                       local_r[2] * local_r[2]);
+
+                            for (int axis = 0; axis < 3; ++axis) {
+                                // Update AtA: 2x2
+                                AtA_ptr[(offset_idx_i + axis) * n_vars +
+                                        offset_idx_i + axis] += weight;
+                                AtA_ptr[(offset_idx_k + axis) * n_vars +
+                                        offset_idx_k + axis] += weight;
+
+                                AtA_ptr[(offset_idx_i + axis) * n_vars +
+                                        offset_idx_k + axis] -= weight;
+                                AtA_ptr[(offset_idx_k + axis) * n_vars +
+                                        offset_idx_i + axis] -= weight;
+
+                                // Update Atb: 2x1
+                                Atb_ptr[offset_idx_i + axis] +=
+                                        weight * local_r[axis];
+                                Atb_ptr[offset_idx_k + axis] -=
+                                        weight * local_r[axis];
+                            }
+                        }
 #endif
                     }
                 }
-    });
+            });
 #undef LOCAL_LAMBDA_CAPTURE
 }
 }  // namespace kernel
diff --git a/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp b/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
index 8ce0634653d..27a098d84dc 100644
--- a/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
+++ b/cpp/open3d/t/pipelines/kernel/RGBDOdometryCPU.cpp
@@ -49,40 +49,40 @@ void ComputeOdometryInformationMatrixCPU(const core::Tensor& source_vertex_map,
     A_1x21 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_21,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-        for (int workload_idx = r.begin(); workload_idx < r.end();
-             workload_idx++) {
-            int y = workload_idx / cols;
-            int x = workload_idx % cols;
-
-            float J_x[6], J_y[6], J_z[6];
-            float rx, ry, rz;
-
-            bool valid = GetJacobianPointToPoint(
-                    x, y, square_dist_thr, source_vertex_indexer,
-                    target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry,
-                    rz);
-
-            if (valid) {
-                for (int i = 0, j = 0; j < 6; j++) {
-                    for (int k = 0; k <= j; k++) {
-                        A_reduction[i] += J_x[j] * J_x[k];
-                        A_reduction[i] += J_y[j] * J_y[k];
-                        A_reduction[i] += J_z[j] * J_z[k];
-                        i++;
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     workload_idx++) {
+                    int y = workload_idx / cols;
+                    int x = workload_idx % cols;
+
+                    float J_x[6], J_y[6], J_z[6];
+                    float rx, ry, rz;
+
+                    bool valid = GetJacobianPointToPoint(
+                            x, y, square_dist_thr, source_vertex_indexer,
+                            target_vertex_indexer, ti, J_x, J_y, J_z, rx, ry,
+                            rz);
+
+                    if (valid) {
+                        for (int i = 0, j = 0; j < 6; j++) {
+                            for (int k = 0; k <= j; k++) {
+                                A_reduction[i] += J_x[j] * J_x[k];
+                                A_reduction[i] += J_y[j] * J_y[k];
+                                A_reduction[i] += J_z[j] * J_z[k];
+                                i++;
+                            }
+                        }
                     }
                 }
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<float> a, std::vector<float> b) {
-        std::vector<float> result(21);
-        for (int j = 0; j < 21; j++) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<float> a, std::vector<float> b) {
+                std::vector<float> result(21);
+                for (int j = 0; j < 21; j++) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
     core::Tensor A_reduction_tensor(A_1x21, {21}, core::Float32, device);
     float* reduction_ptr = A_reduction_tensor.GetDataPtr<float>();
 
@@ -141,47 +141,47 @@ void ComputeOdometryResultIntensityCPU(
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-        for (int workload_idx = r.begin(); workload_idx < r.end();
-             workload_idx++) {
-            int y = workload_idx / cols;
-            int x = workload_idx % cols;
-
-            float J_I[6];
-            float r_I;
-
-            bool valid = GetJacobianIntensity(
-                    x, y, depth_outlier_trunc, source_depth_indexer,
-                    target_depth_indexer, source_intensity_indexer,
-                    target_intensity_indexer,
-                    target_intensity_dx_indexer,
-                    target_intensity_dy_indexer, source_vertex_indexer,
-                    ti, J_I, r_I);
-
-            if (valid) {
-                float d_huber = HuberDeriv(r_I, intensity_huber_delta);
-                float r_huber = HuberLoss(r_I, intensity_huber_delta);
-
-                for (int i = 0, j = 0; j < 6; j++) {
-                    for (int k = 0; k <= j; k++) {
-                        A_reduction[i] += J_I[j] * J_I[k];
-                        i++;
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     workload_idx++) {
+                    int y = workload_idx / cols;
+                    int x = workload_idx % cols;
+
+                    float J_I[6];
+                    float r_I;
+
+                    bool valid = GetJacobianIntensity(
+                            x, y, depth_outlier_trunc, source_depth_indexer,
+                            target_depth_indexer, source_intensity_indexer,
+                            target_intensity_indexer,
+                            target_intensity_dx_indexer,
+                            target_intensity_dy_indexer, source_vertex_indexer,
+                            ti, J_I, r_I);
+
+                    if (valid) {
+                        float d_huber = HuberDeriv(r_I, intensity_huber_delta);
+                        float r_huber = HuberLoss(r_I, intensity_huber_delta);
+
+                        for (int i = 0, j = 0; j < 6; j++) {
+                            for (int k = 0; k <= j; k++) {
+                                A_reduction[i] += J_I[j] * J_I[k];
+                                i++;
+                            }
+                            A_reduction[21 + j] += J_I[j] * d_huber;
+                        }
+                        A_reduction[27] += r_huber;
+                        A_reduction[28] += 1;
                     }
-                    A_reduction[21 + j] += J_I[j] * d_huber;
                 }
-                A_reduction[27] += r_huber;
-                A_reduction[28] += 1;
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<float> a, std::vector<float> b) {
-        std::vector<float> result(29);
-        for (int j = 0; j < 29; j++) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<float> a, std::vector<float> b) {
+                std::vector<float> result(29);
+                for (int j = 0; j < 29; j++) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
 
     core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
@@ -234,54 +234,54 @@ void ComputeOdometryResultHybridCPU(const core::Tensor& source_depth,
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-        for (int workload_idx = r.begin(); workload_idx < r.end();
-             workload_idx++) {
-            int y = workload_idx / cols;
-            int x = workload_idx % cols;
-
-            float J_I[6], J_D[6];
-            float r_I, r_D;
-
-            bool valid = GetJacobianHybrid(
-                    x, y, depth_outlier_trunc, source_depth_indexer,
-                    target_depth_indexer, source_intensity_indexer,
-                    target_intensity_indexer, target_depth_dx_indexer,
-                    target_depth_dy_indexer,
-                    target_intensity_dx_indexer,
-                    target_intensity_dy_indexer, source_vertex_indexer,
-                    ti, J_I, J_D, r_I, r_D);
-
-            if (valid) {
-                float d_huber_I =
-                        HuberDeriv(r_I, intensity_huber_delta);
-                float d_huber_D = HuberDeriv(r_D, depth_huber_delta);
-
-                float r_huber_I = HuberLoss(r_I, intensity_huber_delta);
-                float r_huber_D = HuberLoss(r_D, depth_huber_delta);
-
-                for (int i = 0, j = 0; j < 6; j++) {
-                    for (int k = 0; k <= j; k++) {
-                        A_reduction[i] +=
-                                J_I[j] * J_I[k] + J_D[j] * J_D[k];
-                        i++;
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     workload_idx++) {
+                    int y = workload_idx / cols;
+                    int x = workload_idx % cols;
+
+                    float J_I[6], J_D[6];
+                    float r_I, r_D;
+
+                    bool valid = GetJacobianHybrid(
+                            x, y, depth_outlier_trunc, source_depth_indexer,
+                            target_depth_indexer, source_intensity_indexer,
+                            target_intensity_indexer, target_depth_dx_indexer,
+                            target_depth_dy_indexer,
+                            target_intensity_dx_indexer,
+                            target_intensity_dy_indexer, source_vertex_indexer,
+                            ti, J_I, J_D, r_I, r_D);
+
+                    if (valid) {
+                        float d_huber_I =
+                                HuberDeriv(r_I, intensity_huber_delta);
+                        float d_huber_D = HuberDeriv(r_D, depth_huber_delta);
+
+                        float r_huber_I = HuberLoss(r_I, intensity_huber_delta);
+                        float r_huber_D = HuberLoss(r_D, depth_huber_delta);
+
+                        for (int i = 0, j = 0; j < 6; j++) {
+                            for (int k = 0; k <= j; k++) {
+                                A_reduction[i] +=
+                                        J_I[j] * J_I[k] + J_D[j] * J_D[k];
+                                i++;
+                            }
+                            A_reduction[21 + j] +=
+                                    J_I[j] * d_huber_I + J_D[j] * d_huber_D;
+                        }
+                        A_reduction[27] += r_huber_I + r_huber_D;
+                        A_reduction[28] += 1;
                     }
-                    A_reduction[21 + j] +=
-                            J_I[j] * d_huber_I + J_D[j] * d_huber_D;
                 }
-                A_reduction[27] += r_huber_I + r_huber_D;
-                A_reduction[28] += 1;
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<float> a, std::vector<float> b) {
-        std::vector<float> result(29);
-        for (int j = 0; j < 29; j++) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<float> a, std::vector<float> b) {
+                std::vector<float> result(29);
+                for (int j = 0; j < 29; j++) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
     core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
 }
@@ -318,43 +318,43 @@ void ComputeOdometryResultPointToPlaneCPU(
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<float> A_reduction) {
-        for (int workload_idx = r.begin(); workload_idx < r.end();
-             workload_idx++) {
-            int y = workload_idx / cols;
-            int x = workload_idx % cols;
-
-            float J_ij[6];
-            float r;
-
-            bool valid = GetJacobianPointToPlane(
-                    x, y, depth_outlier_trunc, source_vertex_indexer,
-                    target_vertex_indexer, target_normal_indexer, ti,
-                    J_ij, r);
-
-            if (valid) {
-                float d_huber = HuberDeriv(r, depth_huber_delta);
-                float r_huber = HuberLoss(r, depth_huber_delta);
-                for (int i = 0, j = 0; j < 6; j++) {
-                    for (int k = 0; k <= j; k++) {
-                        A_reduction[i] += J_ij[j] * J_ij[k];
-                        i++;
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     workload_idx++) {
+                    int y = workload_idx / cols;
+                    int x = workload_idx % cols;
+
+                    float J_ij[6];
+                    float r;
+
+                    bool valid = GetJacobianPointToPlane(
+                            x, y, depth_outlier_trunc, source_vertex_indexer,
+                            target_vertex_indexer, target_normal_indexer, ti,
+                            J_ij, r);
+
+                    if (valid) {
+                        float d_huber = HuberDeriv(r, depth_huber_delta);
+                        float r_huber = HuberLoss(r, depth_huber_delta);
+                        for (int i = 0, j = 0; j < 6; j++) {
+                            for (int k = 0; k <= j; k++) {
+                                A_reduction[i] += J_ij[j] * J_ij[k];
+                                i++;
+                            }
+                            A_reduction[21 + j] += J_ij[j] * d_huber;
+                        }
+                        A_reduction[27] += r_huber;
+                        A_reduction[28] += 1;
                     }
-                    A_reduction[21 + j] += J_ij[j] * d_huber;
                 }
-                A_reduction[27] += r_huber;
-                A_reduction[28] += 1;
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<float> a, std::vector<float> b) {
-        std::vector<float> result(29);
-        for (int j = 0; j < 29; j++) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<float> a, std::vector<float> b) {
+                std::vector<float> result(29);
+                for (int j = 0; j < 29; j++) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
 
     core::Tensor A_reduction_tensor(A_1x29, {29}, core::Float32, device);
     DecodeAndSolve6x6(A_reduction_tensor, delta, inlier_residual, inlier_count);
diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
index ba3b440c932..fe302d9cf35 100644
--- a/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
+++ b/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
@@ -44,44 +44,44 @@ static void ComputePosePointToPlaneKernelCPU(
     std::vector<scalar_t> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
-            [&](const tbb::blocked_range<int>& range,
-                    std::vector<scalar_t> A_reduction) {
-        for (int workload_idx = range.begin(); workload_idx < range.end();
-             ++workload_idx) {
-            scalar_t J_ij[6];
-            scalar_t r = 0;
-
-            bool valid = kernel::GetJacobianPointToPlane<scalar_t>(
-                    workload_idx, source_points_ptr, target_points_ptr,
-                    target_normals_ptr, correspondence_indices, J_ij,
-                    r);
-
-            scalar_t w = GetWeightFromRobustKernel(r);
-
-            if (valid) {
-                // Dump J, r into JtJ and Jtr
-                int i = 0;
-                for (int j = 0; j < 6; ++j) {
-                    for (int k = 0; k <= j; ++k) {
-                        A_reduction[i] += J_ij[j] * w * J_ij[k];
-                        ++i;
+            [&](const tbb::blocked_range<int> &range,
+                std::vector<scalar_t> A_reduction) {
+                for (int workload_idx = range.begin();
+                     workload_idx < range.end(); ++workload_idx) {
+                    scalar_t J_ij[6];
+                    scalar_t r = 0;
+
+                    bool valid = kernel::GetJacobianPointToPlane<scalar_t>(
+                            workload_idx, source_points_ptr, target_points_ptr,
+                            target_normals_ptr, correspondence_indices, J_ij,
+                            r);
+
+                    scalar_t w = GetWeightFromRobustKernel(r);
+
+                    if (valid) {
+                        // Dump J, r into JtJ and Jtr
+                        int i = 0;
+                        for (int j = 0; j < 6; ++j) {
+                            for (int k = 0; k <= j; ++k) {
+                                A_reduction[i] += J_ij[j] * w * J_ij[k];
+                                ++i;
+                            }
+                            A_reduction[21 + j] += J_ij[j] * w * r;
+                        }
+                        A_reduction[27] += r;
+                        A_reduction[28] += 1;
                     }
-                    A_reduction[21 + j] += J_ij[j] * w * r;
                 }
-                A_reduction[27] += r;
-                A_reduction[28] += 1;
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
-        std::vector<scalar_t> result(29);
-        for (int j = 0; j < 29; ++j) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
+                std::vector<scalar_t> result(29);
+                for (int j = 0; j < 29; ++j) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
 
     for (int i = 0; i < 29; ++i) {
         global_sum[i] = A_1x29[i];
@@ -143,49 +143,49 @@ static void ComputePoseColoredICPKernelCPU(
     std::vector<scalar_t> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
-            [&](const tbb::blocked_range<int>& r,
+            [&](const tbb::blocked_range<int> &r,
                 std::vector<scalar_t> A_reduction) {
-        for (int workload_idx = r.begin(); workload_idx < r.end();
-             ++workload_idx) {
-            scalar_t J_G[6] = {0}, J_I[6] = {0};
-            scalar_t r_G = 0, r_I = 0;
-
-            bool valid = GetJacobianColoredICP<scalar_t>(
-                    workload_idx, source_points_ptr, source_colors_ptr,
-                    target_points_ptr, target_normals_ptr,
-                    target_colors_ptr, target_color_gradients_ptr,
-                    correspondence_indices, sqrt_lambda_geometric,
-                    sqrt_lambda_photometric, J_G, J_I, r_G, r_I);
-
-            scalar_t w_G = GetWeightFromRobustKernel(r_G);
-            scalar_t w_I = GetWeightFromRobustKernel(r_I);
-
-            if (valid) {
-                // Dump J, r into JtJ and Jtr
-                int i = 0;
-                for (int j = 0; j < 6; ++j) {
-                    for (int k = 0; k <= j; ++k) {
-                        A_reduction[i] += J_G[j] * w_G * J_G[k] +
-                                          J_I[j] * w_I * J_I[k];
-                        ++i;
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     ++workload_idx) {
+                    scalar_t J_G[6] = {0}, J_I[6] = {0};
+                    scalar_t r_G = 0, r_I = 0;
+
+                    bool valid = GetJacobianColoredICP<scalar_t>(
+                            workload_idx, source_points_ptr, source_colors_ptr,
+                            target_points_ptr, target_normals_ptr,
+                            target_colors_ptr, target_color_gradients_ptr,
+                            correspondence_indices, sqrt_lambda_geometric,
+                            sqrt_lambda_photometric, J_G, J_I, r_G, r_I);
+
+                    scalar_t w_G = GetWeightFromRobustKernel(r_G);
+                    scalar_t w_I = GetWeightFromRobustKernel(r_I);
+
+                    if (valid) {
+                        // Dump J, r into JtJ and Jtr
+                        int i = 0;
+                        for (int j = 0; j < 6; ++j) {
+                            for (int k = 0; k <= j; ++k) {
+                                A_reduction[i] += J_G[j] * w_G * J_G[k] +
+                                                  J_I[j] * w_I * J_I[k];
+                                ++i;
+                            }
+                            A_reduction[21 + j] +=
+                                    J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
+                        }
+                        A_reduction[27] += r_G * r_G + r_I * r_I;
+                        A_reduction[28] += 1;
                     }
-                    A_reduction[21 + j] +=
-                            J_G[j] * w_G * r_G + J_I[j] * w_I * r_I;
                 }
-                A_reduction[27] += r_G * r_G + r_I * r_I;
-                A_reduction[28] += 1;
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
-        std::vector<scalar_t> result(29);
-        for (int j = 0; j < 29; ++j) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
+                std::vector<scalar_t> result(29);
+                for (int j = 0; j < 29; ++j) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
 
     for (int i = 0; i < 29; ++i) {
         global_sum[i] = A_1x29[i];
@@ -575,36 +575,36 @@ void ComputeInformationMatrixKernelCPU(const scalar_t *target_points_ptr,
     AtA = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_21,
             [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
-        for (int workload_idx = r.begin(); workload_idx < r.end();
-             ++workload_idx) {
-            scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};
-
-            bool valid = GetInformationJacobians<scalar_t>(
-                    workload_idx, target_points_ptr,
-                    correspondence_indices, J_x, J_y, J_z);
-
-            if (valid) {
-                int i = 0;
-                for (int j = 0; j < 6; ++j) {
-                    for (int k = 0; k <= j; ++k) {
-                        A_reduction[i] += J_x[j] * J_x[k] +
-                                          J_y[j] * J_y[k] +
-                                          J_z[j] * J_z[k];
-                        ++i;
+                for (int workload_idx = r.begin(); workload_idx < r.end();
+                     ++workload_idx) {
+                    scalar_t J_x[6] = {0}, J_y[6] = {0}, J_z[6] = {0};
+
+                    bool valid = GetInformationJacobians<scalar_t>(
+                            workload_idx, target_points_ptr,
+                            correspondence_indices, J_x, J_y, J_z);
+
+                    if (valid) {
+                        int i = 0;
+                        for (int j = 0; j < 6; ++j) {
+                            for (int k = 0; k <= j; ++k) {
+                                A_reduction[i] += J_x[j] * J_x[k] +
+                                                  J_y[j] * J_y[k] +
+                                                  J_z[j] * J_z[k];
+                                ++i;
+                            }
+                        }
                     }
                 }
-            }
-        }
-        return A_reduction;
-    },
-    // TBB: Defining reduction operation.
-    [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
-        std::vector<scalar_t> result(21);
-        for (int j = 0; j < 21; ++j) {
-            result[j] = a[j] + b[j];
-        }
-        return result;
-    });
+                return A_reduction;
+            },
+            // TBB: Defining reduction operation.
+            [&](std::vector<scalar_t> a, std::vector<scalar_t> b) {
+                std::vector<scalar_t> result(21);
+                for (int j = 0; j < 21; ++j) {
+                    result[j] = a[j] + b[j];
+                }
+                return result;
+            });
 
     for (int i = 0; i < 21; ++i) {
         global_sum[i] = AtA[i];
diff --git a/cpp/open3d/t/pipelines/registration/Feature.cpp b/cpp/open3d/t/pipelines/registration/Feature.cpp
index 56cd9a76e81..b2217dee0c9 100644
--- a/cpp/open3d/t/pipelines/registration/Feature.cpp
+++ b/cpp/open3d/t/pipelines/registration/Feature.cpp
@@ -104,16 +104,16 @@ core::Tensor CorrespondencesFromFeatures(const core::Tensor &source_features,
 
     // corres[0]: corres_ij, corres[1]: corres_ji
     tbb::parallel_for(tbb::blocked_range<int>(0, num_searches, 1),
-            [&](const tbb::blocked_range<int>& range){
-        for (int i = range.begin(); i < range.end(); ++i) {
-            core::nns::NearestNeighborSearch nns(
-                    features[1 - i], core::Dtype::Int64);
-            nns.KnnIndex();
-            auto result = nns.KnnSearch(features[i], 1);
-
-            corres[i] = result.first.View({-1});
-        }
-    });
+                      [&](const tbb::blocked_range<int> &range) {
+                          for (int i = range.begin(); i < range.end(); ++i) {
+                              core::nns::NearestNeighborSearch nns(
+                                      features[1 - i], core::Dtype::Int64);
+                              nns.KnnIndex();
+                              auto result = nns.KnnSearch(features[i], 1);
+
+                              corres[i] = result.first.View({-1});
+                          }
+                      });
 
     auto corres_ij = corres[0];
     core::Tensor arange_source =
diff --git a/cpp/open3d/utility/Eigen.cpp b/cpp/open3d/utility/Eigen.cpp
index 292834e0050..db10e65c45c 100644
--- a/cpp/open3d/utility/Eigen.cpp
+++ b/cpp/open3d/utility/Eigen.cpp
@@ -157,7 +157,7 @@ struct JTJandJTrReduceBodyHelper {
     VecType JTr = VecType::Zero();
     double r2_sum = 0.0;
 
-    void join(const JTJandJTrReduceBodyHelper& rhs) {
+    void join(const JTJandJTrReduceBodyHelper &rhs) {
         JTJ += rhs.JTJ;
         JTr += rhs.JTr;
         r2_sum += rhs.r2_sum;
@@ -173,24 +173,24 @@ struct JTJandJTrReduceBody;
 
 // Code specific to the single valued reduction
 template <typename MatType, typename VecType>
-struct JTJandJTrReduceBody<MatType, VecType, false>: public
-        JTJandJTrReduceBodyHelper<MatType, VecType> {
-    using FuncType = std::function<void(int, VecType&, double&, double&)>;
+struct JTJandJTrReduceBody<MatType, VecType, false>
+    : public JTJandJTrReduceBodyHelper<MatType, VecType> {
+    using FuncType = std::function<void(int, VecType &, double &, double &)>;
     // Global data
-    FuncType& f;
+    FuncType &f;
 
     // Local data
     VecType J_r = VecType::Zero();
     double r = 0.0;
     double w = 0.0;
 
-    JTJandJTrReduceBody(FuncType& f_)
+    JTJandJTrReduceBody(FuncType &f_)
         : JTJandJTrReduceBodyHelper<MatType, VecType>(), f(f_) {}
 
-    JTJandJTrReduceBody(JTJandJTrReduceBody& other, tbb::split sp)
+    JTJandJTrReduceBody(JTJandJTrReduceBody &other, tbb::split sp)
         : JTJandJTrReduceBodyHelper<MatType, VecType>(), f(other.f) {}
 
-    void operator()(const tbb::blocked_range<int>& range) {
+    void operator()(const tbb::blocked_range<int> &range) {
         for (int i = range.begin(); i < range.end(); ++i) {
             f(i, J_r, r, w);
             this->JTJ.noalias() += J_r * w * J_r.transpose();
@@ -198,7 +198,6 @@ struct JTJandJTrReduceBody<MatType, VecType, false>: public
             this->r2_sum += r * r;
         }
     }
-
 };
 
 template <typename MatType, typename VecType>
@@ -207,8 +206,10 @@ std::tuple<MatType, VecType, double> ComputeJTJandJTr(
         int iteration_num,
         bool verbose /*=true*/) {
     JTJandJTrReduceBody<MatType, VecType, false> reducer(f);
-    tbb::parallel_reduce(tbb::blocked_range<int>(0, iteration_num,
-            utility::DefaultGrainSizeTBB()), reducer);
+    tbb::parallel_reduce(
+            tbb::blocked_range<int>(0, iteration_num,
+                                    utility::DefaultGrainSizeTBB()),
+            reducer);
     std::tuple<MatType, VecType, double> result = std::move(reducer).as_tuple();
     if (verbose) {
         LogDebug("Residual : {:.2e} (# of elements : {:d})",
@@ -219,28 +220,36 @@ std::tuple<MatType, VecType, double> ComputeJTJandJTr(
 
 // Code specific to the vector valued reduction
 template <typename MatType, typename VecType>
-struct JTJandJTrReduceBody<MatType, VecType, true>: public
-        JTJandJTrReduceBodyHelper<MatType, VecType> {
-    using FuncType = std::function<void(int,
-        std::vector<VecType, Eigen::aligned_allocator<VecType>>&,
-        std::vector<double>&, std::vector<double>&)>;
+struct JTJandJTrReduceBody<MatType, VecType, true>
+    : public JTJandJTrReduceBodyHelper<MatType, VecType> {
+    using FuncType = std::function<void(
+            int,
+            std::vector<VecType, Eigen::aligned_allocator<VecType>> &,
+            std::vector<double> &,
+            std::vector<double> &)>;
     // Global data
-    FuncType& f;
+    FuncType &f;
 
     // Local data
     std::vector<VecType, Eigen::aligned_allocator<VecType>> J_r;
     std::vector<double> r;
     std::vector<double> w;
 
-    JTJandJTrReduceBody(FuncType& f_)
+    JTJandJTrReduceBody(FuncType &f_)
         : JTJandJTrReduceBodyHelper<MatType, VecType>(),
-                f(f_), J_r(), r(), w() {}
+          f(f_),
+          J_r(),
+          r(),
+          w() {}
 
-    JTJandJTrReduceBody(JTJandJTrReduceBody& other, tbb::split sp)
+    JTJandJTrReduceBody(JTJandJTrReduceBody &other, tbb::split sp)
         : JTJandJTrReduceBodyHelper<MatType, VecType>(),
-                f(other.f), J_r(), r(), w() {}
+          f(other.f),
+          J_r(),
+          r(),
+          w() {}
 
-    void operator()(const tbb::blocked_range<int>& range) {
+    void operator()(const tbb::blocked_range<int> &range) {
         for (int i = range.begin(); i < range.end(); ++i) {
             f(i, J_r, r, w);
             for (int j = 0; j < (int)r.size(); j++) {
@@ -250,7 +259,6 @@ struct JTJandJTrReduceBody<MatType, VecType, true>: public
             }
         }
     }
-
 };
 
 template <typename MatType, typename VecType>
@@ -263,8 +271,10 @@ std::tuple<MatType, VecType, double> ComputeJTJandJTr(
         int iteration_num,
         bool verbose /*=true*/) {
     JTJandJTrReduceBody<MatType, VecType, true> reducer(f);
-    tbb::parallel_reduce(tbb::blocked_range<int>(0, iteration_num,
-            utility::DefaultGrainSizeTBB()), reducer);
+    tbb::parallel_reduce(
+            tbb::blocked_range<int>(0, iteration_num,
+                                    utility::DefaultGrainSizeTBB()),
+            reducer);
     std::tuple<MatType, VecType, double> result = std::move(reducer).as_tuple();
     if (verbose) {
         LogDebug("Residual : {:.2e} (# of elements : {:d})",
diff --git a/cpp/open3d/utility/Logging.h b/cpp/open3d/utility/Logging.h
index fdb132f9b06..5dd4d830024 100644
--- a/cpp/open3d/utility/Logging.h
+++ b/cpp/open3d/utility/Logging.h
@@ -143,7 +143,7 @@ class Logger {
                                         int line,
                                         const char *function,
                                         const char *format,
-                                        Args &&... args) {
+                                        Args &&...args) {
         if (sizeof...(Args) > 0) {
             Logger::GetInstance().VError(
                     file, line, function,
@@ -158,7 +158,7 @@ class Logger {
                             int line,
                             const char *function,
                             const char *format,
-                            Args &&... args) {
+                            Args &&...args) {
         if (Logger::GetInstance().GetVerbosityLevel() >=
             VerbosityLevel::Warning) {
             if (sizeof...(Args) > 0) {
@@ -176,7 +176,7 @@ class Logger {
                          int line,
                          const char *function,
                          const char *format,
-                         Args &&... args) {
+                         Args &&...args) {
         if (Logger::GetInstance().GetVerbosityLevel() >= VerbosityLevel::Info) {
             if (sizeof...(Args) > 0) {
                 Logger::GetInstance().VInfo(
@@ -193,7 +193,7 @@ class Logger {
                           int line,
                           const char *function,
                           const char *format,
-                          Args &&... args) {
+                          Args &&...args) {
         if (Logger::GetInstance().GetVerbosityLevel() >=
             VerbosityLevel::Debug) {
             if (sizeof...(Args) > 0) {
diff --git a/cpp/open3d/utility/Parallel.cpp b/cpp/open3d/utility/Parallel.cpp
index 313f80a1756..d4ca3d0eed6 100644
--- a/cpp/open3d/utility/Parallel.cpp
+++ b/cpp/open3d/utility/Parallel.cpp
@@ -9,13 +9,10 @@
 
 #include <tbb/task_arena.h>
 
-
 namespace open3d {
 namespace utility {
 
-int EstimateMaxThreads() {
-    return tbb::this_task_arena::max_concurrency();
-}
+int EstimateMaxThreads() { return tbb::this_task_arena::max_concurrency(); }
 
 std::size_t& DefaultGrainSizeTBB() noexcept {
     static std::size_t GrainSize = 256;
diff --git a/cpp/open3d/utility/ProgressBar.cpp b/cpp/open3d/utility/ProgressBar.cpp
index 7a1efae3a9e..2dea083b385 100644
--- a/cpp/open3d/utility/ProgressBar.cpp
+++ b/cpp/open3d/utility/ProgressBar.cpp
@@ -5,11 +5,12 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
-#include "open3d/utility/Logging.h"
 #include "open3d/utility/ProgressBar.h"
 
 #include <fmt/printf.h>
 
+#include "open3d/utility/Logging.h"
+
 #ifdef _OPENMP
 #include <omp.h>
 #endif
@@ -100,8 +101,8 @@ TBBProgressBar::TBBProgressBar(std::size_t expected_count,
 }
 
 void TBBProgressBar::Reset(std::size_t expected_count,
-                        std::string progress_info,
-                        bool active) noexcept(false) {
+                           std::string progress_info,
+                           bool active) noexcept(false) {
     if (expected_count & flag_bit_mask) {
         utility::LogError("Expected count out of range [0, 2^31)");
     }
@@ -138,13 +139,13 @@ void TBBProgressBar::UpdateBar() {
     // If so set the flag bit and print 100%
     // tmp is created so that compare_exchange doesn't modify expected_count
     if (std::size_t tmp = expected_count_;
-            current_count_.compare_exchange_strong(
-            tmp, expected_count_ | flag_bit_mask)) {
+        current_count_.compare_exchange_strong(
+                tmp, expected_count_ | flag_bit_mask)) {
         fmt::print("{}[{}] 100%\n", progress_info_,
                    std::string(resolution_, '='));
     } else if (tbb::this_task_arena::current_thread_index() != 0) {
-        std::size_t new_progress_pixel = current_count_ *
-                resolution_ / expected_count_;
+        std::size_t new_progress_pixel =
+                current_count_ * resolution_ / expected_count_;
         if (new_progress_pixel > progress_pixel_) {
             progress_pixel_ = new_progress_pixel;
             int percent = int(current_count_ * 100 / expected_count_);
diff --git a/cpp/open3d/utility/ProgressBar.h b/cpp/open3d/utility/ProgressBar.h
index cc68a4f2c6b..9aef731eef3 100644
--- a/cpp/open3d/utility/ProgressBar.h
+++ b/cpp/open3d/utility/ProgressBar.h
@@ -20,9 +20,7 @@ class ProgressBar {
     ProgressBar(size_t expected_count,
                 std::string progress_info,
                 bool active = false);
-    void Reset(size_t expected_count,
-               std::string progress_info,
-               bool active);
+    void Reset(size_t expected_count, std::string progress_info, bool active);
     inline ProgressBar& operator++() { return *this += 1; };
     virtual ProgressBar& operator+=(std::size_t n);
     void SetCurrentCount(size_t n);
@@ -63,7 +61,6 @@ class TBBProgressBar {
         return current_count_ & ~flag_bit_mask;
     }
 
-
 protected:
     static constexpr std::size_t flag_bit_mask = ~(~std::size_t{} >> 1);
     static constexpr std::size_t resolution_ = 40;
diff --git a/cpp/open3d/utility/Random.h b/cpp/open3d/utility/Random.h
index 6b38a2fd02d..c7a5393d7e5 100644
--- a/cpp/open3d/utility/Random.h
+++ b/cpp/open3d/utility/Random.h
@@ -8,6 +8,7 @@
 #pragma once
 
 #include <tbb/spin_mutex.h>
+
 #include <random>
 
 #include "open3d/utility/Logging.h"
diff --git a/cpp/tests/utility/ProgressBar.cpp b/cpp/tests/utility/ProgressBar.cpp
index da08565c1b9..b8e175e433c 100644
--- a/cpp/tests/utility/ProgressBar.cpp
+++ b/cpp/tests/utility/ProgressBar.cpp
@@ -7,11 +7,11 @@
 
 #include "open3d/utility/ProgressBar.h"
 
+#include <tbb/parallel_for.h>
+
 #include <chrono>
 #include <thread>
 
-#include <tbb/parallel_for.h>
-
 #include "open3d/utility/Parallel.h"
 #include "tests/Tests.h"
 
@@ -34,9 +34,8 @@ TEST(ProgressBar, OMPProgressBar) {
     utility::OMPProgressBar progress_bar(iterations,
                                          "OMPProgressBar test: ", true);
 
-#pragma omp parallel for schedule(static) default(none) \
-        shared(iterations, progress_bar) \
-        num_threads(utility::EstimateMaxThreads())
+#pragma omp parallel for schedule(static) default(none) shared( \
+        iterations, progress_bar) num_threads(utility::EstimateMaxThreads())
     for (int i = 0; i < iterations; ++i) {
         std::this_thread::sleep_for(std::chrono::milliseconds(10));
         ++progress_bar;
@@ -46,29 +45,31 @@ TEST(ProgressBar, OMPProgressBar) {
 
 TEST(ProgressBar, TBBProgressBar) {
     int iterations = 1000;
-    utility::TBBProgressBar progress_bar(
-            iterations, "TBBProgressBar test: ", true);
-    tbb::parallel_for(tbb::blocked_range<int>(0, iterations, 10),
+    utility::TBBProgressBar progress_bar(iterations,
+                                         "TBBProgressBar test: ", true);
+    tbb::parallel_for(
+            tbb::blocked_range<int>(0, iterations, 10),
             [&](const tbb::blocked_range<int>& range) {
-        for (int i = range.begin(); i < range.end(); ++i) {
-            std::this_thread::sleep_for(std::chrono::milliseconds(10));
-            ++progress_bar;
-        }
-    });
+                for (int i = range.begin(); i < range.end(); ++i) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+                    ++progress_bar;
+                }
+            });
     EXPECT_EQ(static_cast<int>(progress_bar.GetCurrentCount()), iterations);
 }
 
 TEST(ProgressBar, TBBProgressBarBatched) {
     int iterations = 1000;
-    utility::TBBProgressBar progress_bar(
-            iterations, "Batched TBBProgressBar test: ", true);
-    tbb::parallel_for(tbb::blocked_range<int>(0, iterations, 10),
+    utility::TBBProgressBar progress_bar(iterations,
+                                         "Batched TBBProgressBar test: ", true);
+    tbb::parallel_for(
+            tbb::blocked_range<int>(0, iterations, 10),
             [&](const tbb::blocked_range<int>& range) {
-        for (int i = range.begin(); i < range.end(); ++i) {
-            std::this_thread::sleep_for(std::chrono::milliseconds(10));
-        }
-        progress_bar += (range.end() - range.begin());
-    });
+                for (int i = range.begin(); i < range.end(); ++i) {
+                    std::this_thread::sleep_for(std::chrono::milliseconds(10));
+                }
+                progress_bar += (range.end() - range.begin());
+            });
     EXPECT_EQ(static_cast<int>(progress_bar.GetCurrentCount()), iterations);
 }
 
diff --git a/examples/cpp/RealSenseBagReader.cpp b/examples/cpp/RealSenseBagReader.cpp
index 8c58801b0da..7a7a58d8ba8 100644
--- a/examples/cpp/RealSenseBagReader.cpp
+++ b/examples/cpp/RealSenseBagReader.cpp
@@ -6,6 +6,7 @@
 // ----------------------------------------------------------------------------
 
 #include <json/json.h>
+#include <tbb/parallel_invoke.h>
 
 #include <chrono>
 #include <fstream>
@@ -14,8 +15,6 @@
 #include <string>
 #include <thread>
 
-#include <tbb/parallel_invoke.h>
-
 #include "open3d/Open3D.h"
 
 using namespace open3d;
@@ -206,17 +205,19 @@ int main(int argc, char *argv[]) {
 
             ++idx;
             if (write_image) {
-                tbb::parallel_invoke([&](){
-                    auto color_file = fmt::format("{0}/color/{1:05d}.jpg",
-                                                  output_path, idx);
-                    utility::LogInfo("Writing to {}", color_file);
-                    io::WriteImage(color_file, im_rgbd.color_);
-                }, [&](){
-                    auto depth_file = fmt::format("{0}/depth/{1:05d}.png",
-                                                  output_path, idx);
-                    utility::LogInfo("Writing to {}", depth_file);
-                    io::WriteImage(depth_file, im_rgbd.depth_);
-                });
+                tbb::parallel_invoke(
+                        [&]() {
+                            auto color_file = fmt::format(
+                                    "{0}/color/{1:05d}.jpg", output_path, idx);
+                            utility::LogInfo("Writing to {}", color_file);
+                            io::WriteImage(color_file, im_rgbd.color_);
+                        },
+                        [&]() {
+                            auto depth_file = fmt::format(
+                                    "{0}/depth/{1:05d}.png", output_path, idx);
+                            utility::LogInfo("Writing to {}", depth_file);
+                            io::WriteImage(depth_file, im_rgbd.depth_);
+                        });
             }
             vis.UpdateGeometry();
             vis.UpdateRender();
diff --git a/examples/cpp/TrimMeshBasedOnPointCloud.cpp b/examples/cpp/TrimMeshBasedOnPointCloud.cpp
index ef45c51b801..311e31fb3dd 100644
--- a/examples/cpp/TrimMeshBasedOnPointCloud.cpp
+++ b/examples/cpp/TrimMeshBasedOnPointCloud.cpp
@@ -5,10 +5,10 @@
 // SPDX-License-Identifier: MIT
 // ----------------------------------------------------------------------------
 
-#include "open3d/Open3D.h"
-
 #include <tbb/parallel_for.h>
 
+#include "open3d/Open3D.h"
+
 void PrintHelp() {
     using namespace open3d;
 

From 6399001b9a95cd68413253d11cb3059d8a4dab60 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Mon, 12 Feb 2024 12:51:00 -0800
Subject: [PATCH 10/22] Marked single argument constructors for reductions as
 explicit to please codacy

---
 cpp/open3d/utility/Eigen.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/open3d/utility/Eigen.cpp b/cpp/open3d/utility/Eigen.cpp
index db10e65c45c..6ffefddcd74 100644
--- a/cpp/open3d/utility/Eigen.cpp
+++ b/cpp/open3d/utility/Eigen.cpp
@@ -184,7 +184,7 @@ struct JTJandJTrReduceBody<MatType, VecType, false>
     double r = 0.0;
     double w = 0.0;
 
-    JTJandJTrReduceBody(FuncType &f_)
+    explicit JTJandJTrReduceBody(FuncType &f_)
         : JTJandJTrReduceBodyHelper<MatType, VecType>(), f(f_) {}
 
     JTJandJTrReduceBody(JTJandJTrReduceBody &other, tbb::split sp)
@@ -235,7 +235,7 @@ struct JTJandJTrReduceBody<MatType, VecType, true>
     std::vector<double> r;
     std::vector<double> w;
 
-    JTJandJTrReduceBody(FuncType &f_)
+    explicit JTJandJTrReduceBody(FuncType &f_)
         : JTJandJTrReduceBodyHelper<MatType, VecType>(),
           f(f_),
           J_r(),

From 11c546d4f3d9f9445e4a9cee8a693b2ce3e78cc6 Mon Sep 17 00:00:00 2001
From: Daniel Simon <DanSimon09@gmail.com>
Date: Mon, 12 Feb 2024 12:52:12 -0800
Subject: [PATCH 11/22] Explicitly load atomics in calls to utility::Log*(...)
 because newer versions of format wont automatically convert it to its
 underlying type.

---
 cpp/open3d/geometry/PointCloudSegmentation.cpp       | 2 +-
 cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp | 4 ++--
 cpp/open3d/pipelines/color_map/RigidOptimizer.cpp    | 4 ++--
 cpp/open3d/pipelines/registration/Registration.cpp   | 4 ++--
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/open3d/geometry/PointCloudSegmentation.cpp b/cpp/open3d/geometry/PointCloudSegmentation.cpp
index 74c63236dd5..c36cc4dafb4 100644
--- a/cpp/open3d/geometry/PointCloudSegmentation.cpp
+++ b/cpp/open3d/geometry/PointCloudSegmentation.cpp
@@ -268,7 +268,7 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
             "RANSAC | Inliers: {:d}, Fitness: {:e}, RMSE: {:e}, Iteration: "
             "{:d}",
             final_inliers.size(), result.fitness_, result.inlier_rmse_,
-            iteration_count);
+            iteration_count.load());
     return std::make_tuple(best_plane_model, final_inliers);
 }
 
diff --git a/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp b/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
index 7a0861bbce7..96f35fe9061 100644
--- a/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
+++ b/cpp/open3d/pipelines/color_map/NonRigidOptimizer.cpp
@@ -400,8 +400,8 @@ RunNonRigidOptimizer(const geometry::TriangleMesh& mesh,
                         atomic_sum(residual_reg, rr_reg);
                     }
                 });
-        utility::LogDebug("Residual error : {:.6f}, reg : {:.6f}", residual,
-                          residual_reg);
+        utility::LogDebug("Residual error : {:.6f}, reg : {:.6f}",
+                          residual.load(), residual_reg.load());
         SetProxyIntensityForVertex(opt_mesh, images_gray, warping_fields,
                                    opt_camera_trajectory,
                                    visibility_vertex_to_image, proxy_intensity,
diff --git a/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp b/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
index 91636d39e50..fffe0ea42f6 100644
--- a/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
+++ b/cpp/open3d/pipelines/color_map/RigidOptimizer.cpp
@@ -204,9 +204,9 @@ RunRigidOptimizer(const geometry::TriangleMesh& mesh,
                 });
         if (total_num_ > 0) {
             utility::LogDebug("Residual error : {:.6f} (avg : {:.6f})",
-                              residual, residual / total_num_);
+                              residual.load(), residual / total_num_);
         } else {
-            utility::LogDebug("Residual error : {:.6f}", residual);
+            utility::LogDebug("Residual error : {:.6f}", residual.load());
         }
         SetProxyIntensityForVertex(opt_mesh, images_gray, utility::nullopt,
                                    opt_camera_trajectory,
diff --git a/cpp/open3d/pipelines/registration/Registration.cpp b/cpp/open3d/pipelines/registration/Registration.cpp
index a8e0a9b6858..d9ec1533f42 100644
--- a/cpp/open3d/pipelines/registration/Registration.cpp
+++ b/cpp/open3d/pipelines/registration/Registration.cpp
@@ -370,8 +370,8 @@ RegistrationResult RegistrationRANSACBasedOnCorrespondence(
     auto best_result = std::move(reducer.best_result);
     utility::LogDebug(
             "RANSAC exits after {:d} validations. Best inlier ratio {:e}, "
-            "RMSE {:e}",
-            total_validation, best_result.fitness_, best_result.inlier_rmse_);
+            "RMSE {:e}", total_validation.load(), best_result.fitness_,
+            best_result.inlier_rmse_);
     return best_result;
 }
 

From f48abcca147a656db164bdbf6a3578804a781577 Mon Sep 17 00:00:00 2001
From: Sameer Sheorey <sameer.sheorey@intel.com>
Date: Wed, 17 Apr 2024 13:00:29 -0700
Subject: [PATCH 12/22] style fix

---
 cpp/open3d/pipelines/registration/Registration.cpp | 3 ++-
 cpp/open3d/utility/Logging.h                       | 8 ++++----
 2 files changed, 6 insertions(+), 5 deletions(-)

diff --git a/cpp/open3d/pipelines/registration/Registration.cpp b/cpp/open3d/pipelines/registration/Registration.cpp
index d9ec1533f42..ed8fa6f3317 100644
--- a/cpp/open3d/pipelines/registration/Registration.cpp
+++ b/cpp/open3d/pipelines/registration/Registration.cpp
@@ -370,7 +370,8 @@ RegistrationResult RegistrationRANSACBasedOnCorrespondence(
     auto best_result = std::move(reducer.best_result);
     utility::LogDebug(
             "RANSAC exits after {:d} validations. Best inlier ratio {:e}, "
-            "RMSE {:e}", total_validation.load(), best_result.fitness_,
+            "RMSE {:e}",
+            total_validation.load(), best_result.fitness_,
             best_result.inlier_rmse_);
     return best_result;
 }
diff --git a/cpp/open3d/utility/Logging.h b/cpp/open3d/utility/Logging.h
index 5dd4d830024..fdb132f9b06 100644
--- a/cpp/open3d/utility/Logging.h
+++ b/cpp/open3d/utility/Logging.h
@@ -143,7 +143,7 @@ class Logger {
                                         int line,
                                         const char *function,
                                         const char *format,
-                                        Args &&...args) {
+                                        Args &&... args) {
         if (sizeof...(Args) > 0) {
             Logger::GetInstance().VError(
                     file, line, function,
@@ -158,7 +158,7 @@ class Logger {
                             int line,
                             const char *function,
                             const char *format,
-                            Args &&...args) {
+                            Args &&... args) {
         if (Logger::GetInstance().GetVerbosityLevel() >=
             VerbosityLevel::Warning) {
             if (sizeof...(Args) > 0) {
@@ -176,7 +176,7 @@ class Logger {
                          int line,
                          const char *function,
                          const char *format,
-                         Args &&...args) {
+                         Args &&... args) {
         if (Logger::GetInstance().GetVerbosityLevel() >= VerbosityLevel::Info) {
             if (sizeof...(Args) > 0) {
                 Logger::GetInstance().VInfo(
@@ -193,7 +193,7 @@ class Logger {
                           int line,
                           const char *function,
                           const char *format,
-                          Args &&...args) {
+                          Args &&... args) {
         if (Logger::GetInstance().GetVerbosityLevel() >=
             VerbosityLevel::Debug) {
             if (sizeof...(Args) > 0) {

From c04077f9fff6387bedd8b744c21f69ac1d84181c Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Tue, 10 Sep 2024 18:48:17 -0700
Subject: [PATCH 13/22] Fixed incorrect include in Parallel.h

---
 cpp/open3d/utility/Parallel.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/open3d/utility/Parallel.h b/cpp/open3d/utility/Parallel.h
index 61452ac85c3..33b59fc3cd0 100644
--- a/cpp/open3d/utility/Parallel.h
+++ b/cpp/open3d/utility/Parallel.h
@@ -7,7 +7,7 @@
 
 #pragma once
 
-#include <cstdint>
+#include <cstddef>
 
 namespace open3d {
 namespace utility {

From 7d6c7073fc41cb7b4b9da133a6c830e7647142f7 Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Tue, 10 Sep 2024 18:48:58 -0700
Subject: [PATCH 14/22] Updated ProgressBar.h to use size_t in std namespace
 for consistency

---
 cpp/open3d/utility/ProgressBar.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/open3d/utility/ProgressBar.h b/cpp/open3d/utility/ProgressBar.h
index 9aef731eef3..7d3999e0706 100644
--- a/cpp/open3d/utility/ProgressBar.h
+++ b/cpp/open3d/utility/ProgressBar.h
@@ -17,10 +17,10 @@ namespace utility {
 
 class ProgressBar {
 public:
-    ProgressBar(size_t expected_count,
+    ProgressBar(std::size_t expected_count,
                 std::string progress_info,
                 bool active = false);
-    void Reset(size_t expected_count, std::string progress_info, bool active);
+    void Reset(std::size_t expected_count, std::string progress_info, bool active);
     inline ProgressBar& operator++() { return *this += 1; };
     virtual ProgressBar& operator+=(std::size_t n);
     void SetCurrentCount(size_t n);

From bba01efd45a165c04dc487e6c6fc83c680ae2931 Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Tue, 10 Sep 2024 18:49:33 -0700
Subject: [PATCH 15/22] Switched std::lock_guard<std::mutex> for
 DiscreteGenerator to use tbb::spin_mutex::scoped_lock.

---
 cpp/open3d/utility/Random.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/open3d/utility/Random.h b/cpp/open3d/utility/Random.h
index c7a5393d7e5..9445c03666b 100644
--- a/cpp/open3d/utility/Random.h
+++ b/cpp/open3d/utility/Random.h
@@ -210,8 +210,8 @@ class DiscreteGenerator {
 
     /// Call this to generate a discretely distributed integer value.
     T operator()() {
-        std::lock_guard<std::mutex> lock(*GetMutex());
-        return distribution_(*GetEngine());
+        tbb::spin_mutex::scoped_lock lock(GetMutex());
+        return distribution_(GetEngine());
     }
 
 protected:

From ec98caebf7598011b51b012ff8bc8f94d9c16869 Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Tue, 10 Sep 2024 18:49:57 -0700
Subject: [PATCH 16/22] Switch to using tbb::parallel_reduce on all platforms.

---
 cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp b/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
index fe302d9cf35..7fee90c998f 100644
--- a/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
+++ b/cpp/open3d/t/pipelines/kernel/RegistrationCPU.cpp
@@ -262,18 +262,12 @@ static void ComputePoseDopplerICPKernelCPU(
     // and 28th as inlier_count.
     std::vector<scalar_t> A_1x29(29, 0.0);
 
-#ifdef _WIN32
     std::vector<scalar_t> zeros_29(29, 0.0);
     A_1x29 = tbb::parallel_reduce(
             tbb::blocked_range<int>(0, n), zeros_29,
             [&](tbb::blocked_range<int> r, std::vector<scalar_t> A_reduction) {
                 for (int workload_idx = r.begin(); workload_idx < r.end();
                      ++workload_idx) {
-#else
-    scalar_t *A_reduction = A_1x29.data();
-#pragma omp parallel for reduction(+ : A_reduction[:29]) schedule(static) num_threads(utility::EstimateMaxThreads())
-    for (int workload_idx = 0; workload_idx < n; ++workload_idx) {
-#endif
                     scalar_t J_G[6] = {0}, J_D[6] = {0};
                     scalar_t r_G = 0, r_D = 0;
 
@@ -306,7 +300,6 @@ static void ComputePoseDopplerICPKernelCPU(
                         A_reduction[28] += 1;
                     }
                 }
-#ifdef _WIN32
                 return A_reduction;
             },
             // TBB: Defining reduction operation.
@@ -317,7 +310,6 @@ static void ComputePoseDopplerICPKernelCPU(
                 }
                 return result;
             });
-#endif
 
     for (int i = 0; i < 29; ++i) {
         global_sum[i] = A_1x29[i];

From dbe4133f34b1d3eba90bf20150bbc202824608c8 Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Thu, 19 Sep 2024 19:39:21 -0700
Subject: [PATCH 17/22] Fixed progress bar bug that allow for multiple threads
 writing to the terminal.

---
 cpp/open3d/utility/ProgressBar.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/open3d/utility/ProgressBar.cpp b/cpp/open3d/utility/ProgressBar.cpp
index 2dea083b385..45c022fb841 100644
--- a/cpp/open3d/utility/ProgressBar.cpp
+++ b/cpp/open3d/utility/ProgressBar.cpp
@@ -143,7 +143,7 @@ void TBBProgressBar::UpdateBar() {
                 tmp, expected_count_ | flag_bit_mask)) {
         fmt::print("{}[{}] 100%\n", progress_info_,
                    std::string(resolution_, '='));
-    } else if (tbb::this_task_arena::current_thread_index() != 0) {
+    } else if (tbb::this_task_arena::current_thread_index() == 0) {
         std::size_t new_progress_pixel =
                 current_count_ * resolution_ / expected_count_;
         if (new_progress_pixel > progress_pixel_) {

From 25d8e7feed33616a72efa6b169025321c635b227 Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Thu, 19 Sep 2024 19:40:23 -0700
Subject: [PATCH 18/22] Fixed issue with CPU reduction. Need to exit early if
 no work is to be done to prevent assignment to the output pointer.

---
 cpp/open3d/core/kernel/ReductionCPU.cpp | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/cpp/open3d/core/kernel/ReductionCPU.cpp b/cpp/open3d/core/kernel/ReductionCPU.cpp
index bdde3acfec6..7eb256cd739 100644
--- a/cpp/open3d/core/kernel/ReductionCPU.cpp
+++ b/cpp/open3d/core/kernel/ReductionCPU.cpp
@@ -112,9 +112,13 @@ class CPUReductionEngine {
                     "Internal error: two-pass reduction only works for "
                     "single-output reduction ops.");
         }
+        const auto num_workloads = indexer.NumWorkloads();
+        if (num_workloads == 0) {
+            return;
+        }
         scalar_t& dst = *reinterpret_cast<scalar_t*>(indexer.GetOutputPtr(0));
         dst = tbb::parallel_reduce(
-                tbb::blocked_range<int64_t>(0, indexer.NumWorkloads(),
+                tbb::blocked_range<int64_t>(0, num_workloads,
                                             utility::DefaultGrainSizeTBB()),
                 identity,
                 [&](const tbb::blocked_range<int64_t>& range, scalar_t so_far) {

From 929f0104229682637d970ceab20f24e08c53db87 Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Thu, 19 Sep 2024 20:08:32 -0700
Subject: [PATCH 19/22] Updated MemoryManagerStatistic to use a tbb::spin_mutex
 instead of a std::mutex

---
 cpp/open3d/core/MemoryManagerStatistic.cpp | 6 +++---
 cpp/open3d/core/MemoryManagerStatistic.h   | 3 ++-
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/cpp/open3d/core/MemoryManagerStatistic.cpp b/cpp/open3d/core/MemoryManagerStatistic.cpp
index 82c15e2edf9..978172a7e0a 100644
--- a/cpp/open3d/core/MemoryManagerStatistic.cpp
+++ b/cpp/open3d/core/MemoryManagerStatistic.cpp
@@ -118,7 +118,7 @@ bool MemoryManagerStatistic::HasLeaks() const {
 void MemoryManagerStatistic::CountMalloc(void* ptr,
                                          size_t byte_size,
                                          const Device& device) {
-    std::lock_guard<std::mutex> lock(statistics_mutex_);
+    tbb::spin_mutex::scoped_lock lock(statistics_mutex_);
 
     // Filter nullptr. Empty allocations are not tracked.
     if (ptr == nullptr && byte_size == 0) {
@@ -141,7 +141,7 @@ void MemoryManagerStatistic::CountMalloc(void* ptr,
 }
 
 void MemoryManagerStatistic::CountFree(void* ptr, const Device& device) {
-    std::lock_guard<std::mutex> lock(statistics_mutex_);
+    tbb::spin_mutex::scoped_lock lock(statistics_mutex_);
 
     // Filter nullptr. Empty allocations are not tracked.
     if (ptr == nullptr) {
@@ -170,7 +170,7 @@ void MemoryManagerStatistic::CountFree(void* ptr, const Device& device) {
 }
 
 void MemoryManagerStatistic::Reset() {
-    std::lock_guard<std::mutex> lock(statistics_mutex_);
+    tbb::spin_mutex::scoped_lock lock(statistics_mutex_);
     statistics_.clear();
 }
 
diff --git a/cpp/open3d/core/MemoryManagerStatistic.h b/cpp/open3d/core/MemoryManagerStatistic.h
index 60de026ec5c..d27049e6e2c 100644
--- a/cpp/open3d/core/MemoryManagerStatistic.h
+++ b/cpp/open3d/core/MemoryManagerStatistic.h
@@ -10,6 +10,7 @@
 #include <cstddef>
 #include <map>
 #include <mutex>
+#include <tbb/spin_mutex.h>
 #include <unordered_map>
 
 #include "open3d/core/Device.h"
@@ -86,7 +87,7 @@ class MemoryManagerStatistic {
     /// Print at each malloc and free, disabled by default.
     bool print_at_malloc_free_ = false;
 
-    std::mutex statistics_mutex_;
+    tbb::spin_mutex statistics_mutex_;
     std::map<Device, MemoryStatistics> statistics_;
 };
 

From 31484e5088832f8fc396cc1d11ee9ab3d995b56f Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Thu, 19 Sep 2024 20:17:43 -0700
Subject: [PATCH 20/22] Updated point cloud segmentation to generate random
 samples using the global mutex from utilities::random.

---
 .../geometry/PointCloudSegmentation.cpp       | 33 +++++++++----------
 1 file changed, 15 insertions(+), 18 deletions(-)

diff --git a/cpp/open3d/geometry/PointCloudSegmentation.cpp b/cpp/open3d/geometry/PointCloudSegmentation.cpp
index c36cc4dafb4..b079dd57323 100644
--- a/cpp/open3d/geometry/PointCloudSegmentation.cpp
+++ b/cpp/open3d/geometry/PointCloudSegmentation.cpp
@@ -6,6 +6,7 @@
 // ----------------------------------------------------------------------------
 
 #include <tbb/parallel_for.h>
+#include <tbb/spin_mutex.h>
 #include <tbb/spin_rw_mutex.h>
 
 #include <Eigen/Dense>
@@ -29,30 +30,28 @@ namespace geometry {
 template <typename T>
 class RandomSampler {
 public:
-    explicit RandomSampler(const size_t total_size) : total_size_(total_size) {}
+    explicit RandomSampler(const std::size_t total_size)
+        : dist(0, total_size - 1) {}
 
-    std::vector<T> operator()(size_t sample_size) {
-        std::lock_guard<std::mutex> lock(mutex_);
-        std::vector<T> samples;
-        samples.reserve(sample_size);
+    std::vector<T> operator()(std::size_t sample_size) {
+        std::vector<T> indices;
+        indices.reserve(sample_size);
 
-        size_t valid_sample = 0;
-        while (valid_sample < sample_size) {
-            const size_t idx = utility::random::RandUint32() % total_size_;
+        tbb::spin_mutex::scoped_lock lock(utility::random::GetMutex());
+        while (indices.size() < sample_size) {
+            const T idx = dist(utility::random::GetEngine());
             // Well, this is slow. But typically the sample_size is small.
-            if (std::find(samples.begin(), samples.end(), idx) ==
-                samples.end()) {
-                samples.push_back(idx);
-                valid_sample++;
+            if (std::find(indices.begin(), indices.end(), idx) ==
+                indices.end()) {
+                indices.emplace_back(idx);
             }
         }
 
-        return samples;
+        return indices;
     }
 
 private:
-    size_t total_size_;
-    std::mutex mutex_;
+    std::uniform_int_distribution<T> dist;
 };
 
 /// \class RANSACResult
@@ -198,9 +197,7 @@ std::tuple<Eigen::Vector4d, std::vector<size_t>> PointCloud::SegmentPlane(
                         continue;
                     }
 
-                    const std::vector<size_t> sampled_indices =
-                            sampler(ransac_n);
-                    std::vector<size_t> inliers = sampled_indices;
+                    std::vector<std::size_t> inliers = sampler(ransac_n);
 
                     // Fit model to num_model_parameters randomly selected
                     // points among the inliers.

From 81b9bb3d9178d56edfaefd8d7dd59b6e45a82bfa Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Mon, 23 Sep 2024 11:36:09 -0700
Subject: [PATCH 21/22] Updated example usage of global mutex and engine access
 to reflect the TBB types.

---
 cpp/open3d/utility/Random.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/open3d/utility/Random.h b/cpp/open3d/utility/Random.h
index 9445c03666b..1ea135a7e50 100644
--- a/cpp/open3d/utility/Random.h
+++ b/cpp/open3d/utility/Random.h
@@ -29,8 +29,8 @@ void Seed(const int seed);
 ///
 /// {
 ///     // Put the lock and the call to the engine in the same scope.
-///     std::lock_guard<std::mutex> lock(*utility::random::GetMutex());
-///     std::shuffle(vals.begin(), vals.end(), *utility::random::GetEngine());
+///     tbb::spin_mutex::scoped_lock lock(utility::random::GetMutex());
+///     std::shuffle(vals.begin(), vals.end(), utility::random::GetEngine());
 /// }
 /// ```
 std::mt19937& GetEngine();

From 86a474d5c7c327c604e0b705979fa61545aa392c Mon Sep 17 00:00:00 2001
From: Daniel Simon <dansimon09@gmail.com>
Date: Wed, 2 Oct 2024 13:01:47 -0700
Subject: [PATCH 22/22] Turn TBB into publicly linked library

---
 3rdparty/find_dependencies.cmake | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/3rdparty/find_dependencies.cmake b/3rdparty/find_dependencies.cmake
index 77b7085df69..25bef81996e 100644
--- a/3rdparty/find_dependencies.cmake
+++ b/3rdparty/find_dependencies.cmake
@@ -1568,7 +1568,7 @@ if(OPEN3D_USE_ONEAPI_PACKAGES)
         PACKAGE TBB
         TARGETS TBB::tbb
     )
-    list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_tbb)
+    list(APPEND Open3D_3RDPARTY_PUBLIC_TARGETS_FROM_SYSTEM Open3D::3rdparty_tbb)
 
 else(OPEN3D_USE_ONEAPI_PACKAGES)
     # MKL/BLAS
@@ -1687,7 +1687,7 @@ else(OPEN3D_USE_ONEAPI_PACKAGES)
             INCLUDE_DIRS ${STATIC_MKL_INCLUDE_DIR}
             LIB_DIR      ${STATIC_MKL_LIB_DIR}
             LIBRARIES    ${STATIC_MKL_LIBRARIES}
-            DEPENDS      ext_tbb ext_mkl_include ext_mkl
+            DEPENDS      Open3D::3rdparty_tbb ext_mkl_include ext_mkl
         )
         if(UNIX)
             target_compile_options(3rdparty_blas INTERFACE "$<$<COMPILE_LANGUAGE:CXX>:-m64>")
@@ -1709,9 +1709,13 @@ else(OPEN3D_USE_ONEAPI_PACKAGES)
     endif()
     if(NOT USE_SYSTEM_TBB)
         include(${Open3D_3RDPARTY_DIR}/mkl/tbb.cmake)
-        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_CUSTOM TBB::tbb)
+        add_library(3rdparty_tbb INTERFACE)
+        add_library(Open3D::3rdparty_tbb ALIAS 3rdparty_tbb)
+        target_link_libraries(3rdparty_tbb INTERFACE TBB::tbb)
+        install(TARGETS 3rdparty_tbb EXPORT ${PROJECT_NAME}Targets)
+        list(APPEND Open3D_3RDPARTY_PUBLIC_TARGETS_FROM_CUSTOM Open3D::3rdparty_tbb)
     else()
-        list(APPEND Open3D_3RDPARTY_PRIVATE_TARGETS_FROM_SYSTEM Open3D::3rdparty_tbb)
+        list(APPEND Open3D_3RDPARTY_PUBLIC_TARGETS_FROM_SYSTEM Open3D::3rdparty_tbb)
     endif()
 
 endif(OPEN3D_USE_ONEAPI_PACKAGES)