diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 8a33debcf0a..402334b22e6 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -3341,6 +3341,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/tensorpool_allocator.h",
     "common_runtime/gpu_tensorpool_allocator.h",
     "common_runtime/threadpool_device.h",
+    "common_runtime/tensorpool_mkl_allocator.h",
     "common_runtime/process_state.h",
     "common_runtime/pool_allocator.h",
     "graph/gradients.h",
@@ -3419,6 +3420,7 @@ tf_cuda_library(
     "common_runtime/stats_publisher_interface.cc",
     "common_runtime/step_stats_collector.cc",
     "common_runtime/tensorpool_allocator.cc",
+    "common_runtime/tensorpool_mkl_allocator.cc",
     "common_runtime/gpu_tensorpool_allocator.cc",
     "common_runtime/threadpool_device.cc",
     "common_runtime/threadpool_device_factory.cc",
diff --git a/tensorflow/core/common_runtime/tensorpool_allocator.h b/tensorflow/core/common_runtime/tensorpool_allocator.h
index c40c5d9385e..251fca5d780 100644
--- a/tensorflow/core/common_runtime/tensorpool_allocator.h
+++ b/tensorflow/core/common_runtime/tensorpool_allocator.h
@@ -110,17 +110,17 @@ class TensorPoolAllocator : public Allocator {
     SubAllocator* sub_allocator_;
   };
 
- private:
+protected:
+  bool stats_;
+  std::unique_ptr<SubAllocator> sub_allocator_;
 
   void* BigAllocate(size_t alignment, size_t num_bytes);
   void* BigAllocateStatistic(size_t alignment, size_t num_bytes);
   void BigDeallocate(Header* header);
 
  private:
-  bool stats_;
   std::atomic_bool inited_;
   std::atomic_bool initing_;
-  std::unique_ptr<SubAllocator> sub_allocator_;
 
   MemoryPlannerBase* mem_planner_;
   size_t large_bin_index_;
diff --git a/tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc
new file mode 100644
index 00000000000..3f612f06027
--- /dev/null
+++ b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc
@@ -0,0 +1,114 @@
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/memory_planner.h"
+#include "tensorflow/core/common_runtime/tensorpool_mkl_allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/util/env_var.h"
+#include <cstring>
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+namespace tensorflow {
+
+namespace {
+  constexpr int64 DEFAULT_TENSORPOOL_MKL_LARGE_SIZE = (512 << 10);
+}
+
+namespace {
+void* SetLightHeader(void* p, size_t total_bytes, size_t header_size) {
+  // LightHeader *KB max(sizeof(LightHeader)=8B, alignment)
+  // { | .....| checksum (4B) | header_size (4B)}
+  auto user_ptr = (char*)p + header_size;
+  new((char*)user_ptr - sizeof(LightHeader)) LightHeader(header_size);
+
+  return user_ptr;
+}
+
+LightHeader* GetLightHeader(void* p) {
+  auto light_header = (LightHeader*)((char*)p - sizeof(LightHeader));
+
+  return (strcmp(light_header->checksum, CHECK_SUM.c_str()) == 0)
+      ? light_header
+      : nullptr;
+}
+
+Header* GetHeader(void* p) {
+  auto header = (Header*)((char*) p - sizeof(Header));
+
+  if (header->user_ptr != p) {
+    auto light_header = GetLightHeader(p);
+    LOG(FATAL) << "Memory corruption!"
+               << ", p:" << p
+               << ", p->header_size:" << light_header->header_size
+               << ", p->checksum:" << light_header->checksum;
+  }
+
+  return header;
+}
+}
+
+TensorPoolMklAllocator::TensorPoolMklAllocator()
+    : TensorPoolAllocator() {
+  Status s = ReadInt64FromEnvVar("TENSORPOOL_MKL_LARGE_SIZE",
+                                 DEFAULT_TENSORPOOL_MKL_LARGE_SIZE,
+                                 &kLargeAllocationsThreshold);
+  uint64 max_mem_bytes = kDefaultMaxLimit;
+  large_size_allocator_ =
+      new BFCAllocator(sub_allocator_.get(), max_mem_bytes, kAllowGrowth, kName);
+}
+
+void* TensorPoolMklAllocator::AllocateRaw(size_t alignment,
+                                          size_t num_bytes) {
+  void* ret;
+  if (SmallAlloc(num_bytes)) {
+    auto header_size = std::max(sizeof(LightHeader), alignment);
+    auto total = num_bytes + header_size;
+    auto ptr = sub_allocator_->Alloc(alignment, total);
+    ret = SetLightHeader(ptr, total, header_size);
+
+    return ret;
+  }
+
+  if (LargeAlloc(num_bytes)) {
+    VLOG(1) << "Large allocate " << num_bytes << " bytes.";
+    mutex_lock l(mutex_);
+    ret = large_size_allocator_->AllocateRaw(alignment, num_bytes);
+    AddLargeAllocMap(ret, num_bytes);
+
+    return ret;
+  }
+
+  if (unlikely(stats_)) {
+    ret = BigAllocateStatistic(alignment, num_bytes);
+  } else {
+    ret = BigAllocate(alignment, num_bytes);
+  }
+
+  return ret;
+}
+
+void TensorPoolMklAllocator::DeallocateRaw(void* ptr) {
+  VLOG(1) << "DeallocateRaw " << Name() << " "
+          << (ptr ? RequestedSize(ptr) : 0);
+
+  auto light_header = GetLightHeader(ptr);
+  if (light_header != nullptr) {
+    auto header_size = light_header->header_size;
+    auto raw_ptr = (char*)ptr - header_size;
+    // LightHeader not record allocation size
+    // Free interface ignore the freed num_bytes
+    sub_allocator_->Free(raw_ptr, 0);
+    return;
+  }
+
+  if (IsLargeSizeAllocation(ptr)) {
+    mutex_lock l(mutex_);
+    RemoveLargeAllocMap(ptr);
+    large_size_allocator_->DeallocateRaw(ptr);
+    return;
+  }
+
+  auto header = GetHeader(ptr);
+  BigDeallocate(header);
+}
+} // tensorflow
diff --git a/tensorflow/core/common_runtime/tensorpool_mkl_allocator.h b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.h
new file mode 100644
index 00000000000..9e772069663
--- /dev/null
+++ b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.h
@@ -0,0 +1,76 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
+
+#include <cstddef>
+#include "tensorflow/core/common_runtime/tensorpool_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/spin_lock.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace tensorflow {
+
+class TensorPoolMklAllocator : public TensorPoolAllocator {
+ public:
+  TensorPoolMklAllocator();
+  ~TensorPoolMklAllocator() override { delete large_size_allocator_; }
+
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+  void DeallocateRaw(void* ptr) override;
+
+  static constexpr size_t kDefaultMaxLimit = 64LL << 30;
+  static const bool kAllowGrowth = true;
+  static constexpr const char* kName = "tensor_pool_mkl_allocator";
+
+  inline bool LargeAlloc(size_t s) {
+    return s > kLargeAllocationsThreshold;
+  }
+
+  inline bool IsLargeSizeAllocation(const void* ptr) const
+      LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
+    return large_allocations_map_.find(ptr) != large_allocations_map_.end();
+  }
+
+  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
+  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    if (ptr != nullptr) {
+      std::pair<const void*, size_t> map_val(ptr, num_bytes);
+      large_allocations_map_.insert(map_val);
+    }
+  }
+
+  inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    auto map_iter = large_allocations_map_.find(ptr);
+    if (map_iter != large_allocations_map_.end()) {
+      large_allocations_map_.erase(map_iter);
+    } else {
+      LOG(ERROR) << "tried to deallocate invalid pointer";
+    }
+    return;
+  }
+
+ private:
+  Allocator* large_size_allocator_ = nullptr;  // owned by this class
+  // Hash map to keep track of "BFC" allocations
+  // We do not use BFC allocator for small allocations.
+  std::unordered_map<const void*, size_t> large_allocations_map_
+      GUARDED_BY(mutex_);
+
+  mutable mutex mutex_;
+
+  // Size in bytes that defines the upper-bound for "small" allocations.
+  // Any allocation above this threshold is "large" allocation.
+  int64 kLargeAllocationsThreshold;
+};
+
+}
+#endif // TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
\ No newline at end of file
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 4887c99f9b4..2e60b6483aa 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/scoped_allocator.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/common_runtime/tensorpool_mkl_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -123,17 +124,17 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
  public:
   bool NumaEnabled() override { return false; }
 
-  Allocator* CreateAllocator() override { return new TensorPoolAllocator; }
+  Allocator* CreateAllocator() override { return new TensorPoolMklAllocator; }
 
   // Note: Ignores numa_node, for now.
   virtual SubAllocator* CreateSubAllocator(int numa_node) {
-    return new TensorPoolSubAllocator(new TensorPoolAllocator);
+    return new TensorPoolSubAllocator(new TensorPoolMklAllocator);
   }
 
  private:
  class TensorPoolSubAllocator : public SubAllocator {
   public:
-    explicit TensorPoolSubAllocator(TensorPoolAllocator* allocator)
+    explicit TensorPoolSubAllocator(TensorPoolMklAllocator* allocator)
     : SubAllocator({}, {}), allocator_(allocator) {}
 
    void* Alloc(size_t alignment, size_t num_bytes) override {
@@ -145,7 +146,7 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
   }
 
   private:
-    TensorPoolAllocator* allocator_;
+    TensorPoolMklAllocator* allocator_;
  };
 