Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions tensorflow/core/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -3341,6 +3341,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
"common_runtime/tensorpool_allocator.h",
"common_runtime/gpu_tensorpool_allocator.h",
"common_runtime/threadpool_device.h",
"common_runtime/tensorpool_mkl_allocator.h",
"common_runtime/process_state.h",
"common_runtime/pool_allocator.h",
"graph/gradients.h",
Expand Down Expand Up @@ -3419,6 +3420,7 @@ tf_cuda_library(
"common_runtime/stats_publisher_interface.cc",
"common_runtime/step_stats_collector.cc",
"common_runtime/tensorpool_allocator.cc",
"common_runtime/tensorpool_mkl_allocator.cc",
"common_runtime/gpu_tensorpool_allocator.cc",
"common_runtime/threadpool_device.cc",
"common_runtime/threadpool_device_factory.cc",
Expand Down
6 changes: 3 additions & 3 deletions tensorflow/core/common_runtime/tensorpool_allocator.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,17 +110,17 @@ class TensorPoolAllocator : public Allocator {
SubAllocator* sub_allocator_;
};

private:
protected:
bool stats_;
std::unique_ptr<SubAllocator> sub_allocator_;
void* BigAllocate(size_t alignment, size_t num_bytes);
void* BigAllocateStatistic(size_t alignment, size_t num_bytes);
void BigDeallocate(Header* header);

private:
bool stats_;
std::atomic_bool inited_;
std::atomic_bool initing_;

std::unique_ptr<SubAllocator> sub_allocator_;
MemoryPlannerBase* mem_planner_;

size_t large_bin_index_;
Expand Down
114 changes: 114 additions & 0 deletions tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/memory_planner.h"
#include "tensorflow/core/common_runtime/tensorpool_mkl_allocator.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/platform/mem.h"
#include "tensorflow/core/util/env_var.h"
#include <sys/time.h>

#define unlikely(x) __builtin_expect(!!(x), 0)

namespace tensorflow {

namespace {
constexpr int64 DEFAULT_TENSORPOOL_MKL_LARGE_SIZE = (512 << 10);
}

namespace {
// Writes a LightHeader into the reserved header area at the front of the
// raw chunk `p` and returns the pointer handed to the user.
// Layout of the chunk: { padding ... | LightHeader { checksum(4B) |
// header_size(4B) } | user data }.  `header_size` is at least
// max(sizeof(LightHeader), alignment) so the user pointer stays aligned.
// `total_bytes` (the full chunk size) is currently unused because the
// LightHeader does not record the allocation size.
void* SetLightHeader(void* p, size_t total_bytes, size_t header_size) {
  char* user_ptr = static_cast<char*>(p) + header_size;
  // Place the header in the last sizeof(LightHeader) bytes directly in
  // front of the user pointer so DeallocateRaw can find it again.
  new (user_ptr - sizeof(LightHeader)) LightHeader(header_size);
  return user_ptr;
}

// Returns the LightHeader stored immediately before `p`, or nullptr when
// the bytes at that position do not carry the expected checksum (i.e. the
// pointer was not produced by the small-allocation path).
LightHeader* GetLightHeader(void* p) {
  auto* candidate = reinterpret_cast<LightHeader*>(
      static_cast<char*>(p) - sizeof(LightHeader));
  const bool checksum_ok =
      strcmp(candidate->checksum, CHECK_SUM.c_str()) == 0;
  return checksum_ok ? candidate : nullptr;
}

// Returns the full Header stored immediately before `p`.
// If the header's back-pointer does not round-trip to `p`, the memory in
// front of the allocation was corrupted (or `p` was not allocated here)
// and the process is aborted with diagnostics.
Header* GetHeader(void* p) {
  auto header = (Header*)((char*)p - sizeof(Header));

  if (header->user_ptr != p) {
    // Try to decode a LightHeader for extra diagnostics.  It may be
    // absent (GetLightHeader returns nullptr), so guard before
    // dereferencing it — otherwise this error path itself would crash
    // with a null dereference instead of logging.
    auto light_header = GetLightHeader(p);
    if (light_header != nullptr) {
      LOG(FATAL) << "Memory corruption!"
                 << ", p:" << p
                 << ", p->header_size:" << light_header->header_size
                 << ", p->checksum:" << light_header->checksum;
    } else {
      LOG(FATAL) << "Memory corruption!"
                 << ", p:" << p
                 << ", no valid LightHeader found";
    }
  }

  return header;
}
}

// Constructs the MKL tensor-pool allocator.  Reads the large-allocation
// threshold from the TENSORPOOL_MKL_LARGE_SIZE environment variable
// (default: DEFAULT_TENSORPOOL_MKL_LARGE_SIZE) and creates the BFC
// allocator that serves allocations above that threshold.
TensorPoolMklAllocator::TensorPoolMklAllocator()
    : TensorPoolAllocator() {
  Status s = ReadInt64FromEnvVar("TENSORPOOL_MKL_LARGE_SIZE",
                                 DEFAULT_TENSORPOOL_MKL_LARGE_SIZE,
                                 &kLargeAllocationsThreshold);
  if (!s.ok()) {
    // A malformed env value was previously ignored silently; surface it
    // so misconfiguration is visible.  The default is still used.
    LOG(WARNING) << "Invalid TENSORPOOL_MKL_LARGE_SIZE, falling back to "
                 << DEFAULT_TENSORPOOL_MKL_LARGE_SIZE << ": " << s;
  }
  uint64 max_mem_bytes = kDefaultMaxLimit;
  large_size_allocator_ =
      new BFCAllocator(sub_allocator_.get(), max_mem_bytes, kAllowGrowth,
                       kName);
}

// Allocates `num_bytes` with the given `alignment`, routed by size:
//  - small:  sub-allocator chunk prefixed with a LightHeader;
//  - large:  BFC allocator, pointer tracked in large_allocations_map_;
//  - other:  TensorPoolAllocator's "big" allocation machinery.
// Returns nullptr on allocation failure.
void* TensorPoolMklAllocator::AllocateRaw(size_t alignment,
                                          size_t num_bytes) {
  if (SmallAlloc(num_bytes)) {
    // Reserve at least `alignment` bytes for the header so the returned
    // user pointer keeps the requested alignment.
    auto header_size = std::max(sizeof(LightHeader), alignment);
    auto total = num_bytes + header_size;
    auto ptr = sub_allocator_->Alloc(alignment, total);
    if (ptr == nullptr) {
      // Propagate failure instead of placement-constructing a header at
      // an invalid address (previously undefined behavior).
      return nullptr;
    }
    return SetLightHeader(ptr, total, header_size);
  }

  if (LargeAlloc(num_bytes)) {
    VLOG(1) << "Large allocate " << num_bytes << " bytes.";
    // Record the pointer under the lock so DeallocateRaw can route the
    // matching free back to the BFC allocator.
    mutex_lock l(mutex_);
    void* ret = large_size_allocator_->AllocateRaw(alignment, num_bytes);
    AddLargeAllocMap(ret, num_bytes);
    return ret;
  }

  // Medium-sized allocations fall through to the base-class pool.
  if (unlikely(stats_)) {
    return BigAllocateStatistic(alignment, num_bytes);
  }
  return BigAllocate(alignment, num_bytes);
}

// Frees `ptr`, dispatching to whichever path allocated it:
// LightHeader-tagged small allocations, BFC-tracked large allocations,
// or the base-class "big" pool.  nullptr is a no-op.
void TensorPoolMklAllocator::DeallocateRaw(void* ptr) {
  VLOG(1) << "DeallocateRaw " << Name() << " "
          << (ptr ? RequestedSize(ptr) : 0);

  if (ptr == nullptr) {
    // Previously GetLightHeader would read sizeof(LightHeader) bytes
    // before nullptr; freeing nullptr must be a harmless no-op.
    return;
  }

  auto light_header = GetLightHeader(ptr);
  if (light_header != nullptr) {
    auto header_size = light_header->header_size;
    auto raw_ptr = (char*)ptr - header_size;
    // The LightHeader does not record the allocation size and the
    // sub-allocator's Free ignores the byte count, so pass 0.
    sub_allocator_->Free(raw_ptr, 0);
    return;
  }

  if (IsLargeSizeAllocation(ptr)) {
    mutex_lock l(mutex_);
    RemoveLargeAllocMap(ptr);
    large_size_allocator_->DeallocateRaw(ptr);
    return;
  }

  auto header = GetHeader(ptr);
  BigDeallocate(header);
}
} // tensorflow
76 changes: 76 additions & 0 deletions tensorflow/core/common_runtime/tensorpool_mkl_allocator.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
#ifndef TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
#define TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_

#include <cstdlib>
#include "tensorflow/core/common_runtime/tensorpool_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/core/spin_lock.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"


#include <atomic>
#include <stack>
#include <vector>
#include <unordered_map>

namespace tensorflow {

// Allocator used for MKL builds.  Extends TensorPoolAllocator with a
// dedicated BFC allocator for "large" allocations (anything above
// kLargeAllocationsThreshold, configurable via the
// TENSORPOOL_MKL_LARGE_SIZE environment variable).  Large pointers are
// tracked in a hash map so DeallocateRaw can route frees correctly.
class TensorPoolMklAllocator : public TensorPoolAllocator {
 public:
  TensorPoolMklAllocator();
  // large_size_allocator_ is owned by this class.
  ~TensorPoolMklAllocator() override { delete large_size_allocator_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
  void DeallocateRaw(void* ptr) override;

  // Upper bound for the BFC allocator's pool: 64 GiB.
  static constexpr size_t kDefaultMaxLimit = 64LL << 30;
  static const bool kAllowGrowth = true;
  static constexpr const char* kName = "tensor_pool_mkl_allocator";

  // Returns true when a request of `s` bytes should be served by the
  // large-size (BFC) allocator.
  inline bool LargeAlloc(size_t s) const {
    return s > kLargeAllocationsThreshold;
  }

  // Returns true when `ptr` was handed out by the BFC allocator.
  inline bool IsLargeSizeAllocation(const void* ptr) const
      LOCKS_EXCLUDED(mutex_) {
    mutex_lock l(mutex_);
    return large_allocations_map_.find(ptr) != large_allocations_map_.end();
  }

  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    if (ptr != nullptr) {
      std::pair<void*, size_t> map_val(ptr, num_bytes);
      large_allocations_map_.insert(map_val);
    }
  }

  inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    auto map_iter = large_allocations_map_.find(ptr);
    if (map_iter != large_allocations_map_.end()) {
      large_allocations_map_.erase(map_iter);
    } else {
      LOG(ERROR) << "tried to deallocate invalid pointer";
    }
  }

 private:
  Allocator* large_size_allocator_ = nullptr;  // owned by this class
  // Hash map to keep track of "BFC" allocations.
  // We do not use BFC allocator for small allocations.
  std::unordered_map<const void*, size_t> large_allocations_map_
      GUARDED_BY(mutex_);

  mutable mutex mutex_;

  // Size in bytes that defines the upper-bound for "small" allocations.
  // Any allocation above this threshold is "large" allocation.
  // NOTE: despite the k-prefix this is a runtime-configured member, set
  // once in the constructor from TENSORPOOL_MKL_LARGE_SIZE.
  int64 kLargeAllocationsThreshold;
};

}
#endif // TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
9 changes: 5 additions & 4 deletions tensorflow/core/common_runtime/threadpool_device.cc
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ limitations under the License.
#include "tensorflow/core/common_runtime/local_device.h"
#include "tensorflow/core/common_runtime/scoped_allocator.h"
#include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
#include "tensorflow/core/common_runtime/tensorpool_mkl_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/allocator_registry.h"
#include "tensorflow/core/framework/device_base.h"
Expand Down Expand Up @@ -123,17 +124,17 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
public:
bool NumaEnabled() override { return false; }

Allocator* CreateAllocator() override { return new TensorPoolAllocator; }
Allocator* CreateAllocator() override { return new TensorPoolMklAllocator; }

// Note: Ignores numa_node, for now.
virtual SubAllocator* CreateSubAllocator(int numa_node) {
return new TensorPoolSubAllocator(new TensorPoolAllocator);
return new TensorPoolSubAllocator(new TensorPoolMklAllocator);
}

private:
class TensorPoolSubAllocator : public SubAllocator {
public:
explicit TensorPoolSubAllocator(TensorPoolAllocator* allocator)
explicit TensorPoolSubAllocator(TensorPoolMklAllocator* allocator)
: SubAllocator({}, {}), allocator_(allocator) {}

void* Alloc(size_t alignment, size_t num_bytes) override {
Expand All @@ -145,7 +146,7 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
}

private:
TensorPoolAllocator* allocator_;
TensorPoolMklAllocator* allocator_;
};
};

Expand Down