diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD
index 8a33debcf0a..402334b22e6 100644
--- a/tensorflow/core/BUILD
+++ b/tensorflow/core/BUILD
@@ -3341,6 +3341,7 @@ CORE_CPU_LIB_HEADERS = CORE_CPU_BASE_HDRS + [
     "common_runtime/tensorpool_allocator.h",
     "common_runtime/gpu_tensorpool_allocator.h",
     "common_runtime/threadpool_device.h",
+    "common_runtime/tensorpool_mkl_allocator.h",
     "common_runtime/process_state.h",
     "common_runtime/pool_allocator.h",
     "graph/gradients.h",
@@ -3419,6 +3420,7 @@ tf_cuda_library(
     "common_runtime/stats_publisher_interface.cc",
     "common_runtime/step_stats_collector.cc",
     "common_runtime/tensorpool_allocator.cc",
+    "common_runtime/tensorpool_mkl_allocator.cc",
     "common_runtime/gpu_tensorpool_allocator.cc",
     "common_runtime/threadpool_device.cc",
     "common_runtime/threadpool_device_factory.cc",
diff --git a/tensorflow/core/common_runtime/tensorpool_allocator.h b/tensorflow/core/common_runtime/tensorpool_allocator.h
index c40c5d9385e..251fca5d780 100644
--- a/tensorflow/core/common_runtime/tensorpool_allocator.h
+++ b/tensorflow/core/common_runtime/tensorpool_allocator.h
@@ -110,17 +110,17 @@ class TensorPoolAllocator : public Allocator {
     SubAllocator* sub_allocator_;
   };
 
- private:
+protected:
+  bool stats_;
+  std::unique_ptr<SubAllocator> sub_allocator_;
 
   void* BigAllocate(size_t alignment, size_t num_bytes);
   void* BigAllocateStatistic(size_t alignment, size_t num_bytes);
   void BigDeallocate(Header* header);
 
  private:
-  bool stats_;
   std::atomic_bool inited_;
   std::atomic_bool initing_;
-  std::unique_ptr<SubAllocator> sub_allocator_;
 
   MemoryPlannerBase* mem_planner_;
   size_t large_bin_index_;
diff --git a/tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc
new file mode 100644
index 00000000000..3f612f06027
--- /dev/null
+++ b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.cc
@@ -0,0 +1,114 @@
+#include "tensorflow/core/common_runtime/bfc_allocator.h"
+#include "tensorflow/core/common_runtime/memory_planner.h"
+#include "tensorflow/core/common_runtime/tensorpool_mkl_allocator.h"
+#include "tensorflow/core/framework/allocator_registry.h"
+#include "tensorflow/core/platform/mem.h"
+#include "tensorflow/core/util/env_var.h"
+#include <cstring>
+
+#define unlikely(x) __builtin_expect(!!(x), 0)
+
+namespace tensorflow {
+
+namespace {
+  constexpr int64 DEFAULT_TENSORPOOL_MKL_LARGE_SIZE = (512 << 10);
+}
+
+namespace {
+void* SetLightHeader(void* p, size_t total_bytes, size_t header_size) {
+  // LightHeader *KB max(sizeof(LightHeader)=8B, alignment)
+  // { | .....| checksum (4B) | header_size (4B)}
+  auto user_ptr = (char*)p + header_size;
+  new((char*)user_ptr - sizeof(LightHeader)) LightHeader(header_size);
+
+  return user_ptr;
+}
+
+LightHeader* GetLightHeader(void* p) {
+  auto light_header = (LightHeader*)((char*)p - sizeof(LightHeader));
+
+  return (strcmp(light_header->checksum, CHECK_SUM.c_str()) == 0)
+      ? light_header
+      : nullptr;
+}
+
+Header* GetHeader(void* p) {
+  auto header = (Header*)((char*) p - sizeof(Header));
+
+  if (header->user_ptr != p) {
+    auto light_header = GetLightHeader(p);
+    LOG(FATAL) << "Memory corruption!"
+               << ", p:" << p
+               << ", p->header_size:" << light_header->header_size
+               << ", p->checksum:" << light_header->checksum;
+  }
+
+  return header;
+}
+}
+
+TensorPoolMklAllocator::TensorPoolMklAllocator()
+    : TensorPoolAllocator() {
+  Status s = ReadInt64FromEnvVar("TENSORPOOL_MKL_LARGE_SIZE",
+                                 DEFAULT_TENSORPOOL_MKL_LARGE_SIZE,
+                                 &kLargeAllocationsThreshold);
+  uint64 max_mem_bytes = kDefaultMaxLimit;
+  large_size_allocator_ =
+      new BFCAllocator(sub_allocator_.get(), max_mem_bytes, kAllowGrowth, kName);
+}
+
+void* TensorPoolMklAllocator::AllocateRaw(size_t alignment,
+                                          size_t num_bytes) {
+  void* ret;
+  if (SmallAlloc(num_bytes)) {
+    auto header_size = std::max(sizeof(LightHeader), alignment);
+    auto total = num_bytes + header_size;
+    auto ptr = sub_allocator_->Alloc(alignment, total);
+    ret = SetLightHeader(ptr, total, header_size);
+
+    return ret;
+  }
+
+  if (LargeAlloc(num_bytes)) {
+    VLOG(1) << "Large allocate " << num_bytes << " bytes.";
+    mutex_lock l(mutex_);
+    ret = large_size_allocator_->AllocateRaw(alignment, num_bytes);
+    AddLargeAllocMap(ret, num_bytes);
+
+    return ret;
+  }
+
+  if (unlikely(stats_)) {
+    ret = BigAllocateStatistic(alignment, num_bytes);
+  } else {
+    ret = BigAllocate(alignment, num_bytes);
+  }
+
+  return ret;
+}
+
+void TensorPoolMklAllocator::DeallocateRaw(void* ptr) {
+  VLOG(1) << "DeallocateRaw " << Name() << " "
+          << (ptr ? RequestedSize(ptr) : 0);
+
+  auto light_header = GetLightHeader(ptr);
+  if (light_header != nullptr) {
+    auto header_size = light_header->header_size;
+    auto raw_ptr = (char*)ptr - header_size;
+    // LightHeader not record allocation size
+    // Free interface ignore the freed num_bytes
+    sub_allocator_->Free(raw_ptr, 0);
+    return;
+  }
+
+  if (IsLargeSizeAllocation(ptr)) {
+    mutex_lock l(mutex_);
+    RemoveLargeAllocMap(ptr);
+    large_size_allocator_->DeallocateRaw(ptr);
+    return;
+  }
+
+  auto header = GetHeader(ptr);
+  BigDeallocate(header);
+}
+} // tensorflow
diff --git a/tensorflow/core/common_runtime/tensorpool_mkl_allocator.h b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.h
new file mode 100644
index 00000000000..9e772069663
--- /dev/null
+++ b/tensorflow/core/common_runtime/tensorpool_mkl_allocator.h
@@ -0,0 +1,76 @@
+#ifndef TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
+#define TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
+
+#include <cstddef>
+#include "tensorflow/core/common_runtime/tensorpool_allocator.h"
+#include "tensorflow/core/framework/allocator.h"
+#include "tensorflow/core/lib/core/spin_lock.h"
+#include "tensorflow/core/platform/macros.h"
+#include "tensorflow/core/platform/mutex.h"
+#include "tensorflow/core/platform/types.h"
+
+
+#include <map>
+#include <memory>
+#include <string>
+#include <unordered_map>
+
+namespace tensorflow {
+
+class TensorPoolMklAllocator : public TensorPoolAllocator {
+ public:
+  TensorPoolMklAllocator();
+  ~TensorPoolMklAllocator() override { delete large_size_allocator_; }
+
+  void* AllocateRaw(size_t alignment, size_t num_bytes) override;
+  void DeallocateRaw(void* ptr) override;
+
+  static constexpr size_t kDefaultMaxLimit = 64LL << 30;
+  static const bool kAllowGrowth = true;
+  static constexpr const char* kName = "tensor_pool_mkl_allocator";
+
+  inline bool LargeAlloc(size_t s) {
+    return s > kLargeAllocationsThreshold;
+  }
+
+  inline bool IsLargeSizeAllocation(const void* ptr) const
+      LOCKS_EXCLUDED(mutex_) {
+    mutex_lock l(mutex_);
+    return large_allocations_map_.find(ptr) != large_allocations_map_.end();
+  }
+
+  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
+  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
+      EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    if (ptr != nullptr) {
+      std::pair<const void*, size_t> map_val(ptr, num_bytes);
+      large_allocations_map_.insert(map_val);
+    }
+  }
+
+  inline void RemoveLargeAllocMap(void* ptr) EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
+    auto map_iter = large_allocations_map_.find(ptr);
+    if (map_iter != large_allocations_map_.end()) {
+      large_allocations_map_.erase(map_iter);
+    } else {
+      LOG(ERROR) << "tried to deallocate invalid pointer";
+    }
+    return;
+  }
+
+ private:
+  Allocator* large_size_allocator_ = nullptr;  // owned by this class
+  // Hash map to keep track of "BFC" allocations
+  // We do not use BFC allocator for small allocations.
+  std::unordered_map<const void*, size_t> large_allocations_map_
+      GUARDED_BY(mutex_);
+
+  mutable mutex mutex_;
+
+  // Size in bytes that defines the upper-bound for "small" allocations.
+  // Any allocation above this threshold is "large" allocation.
+  int64 kLargeAllocationsThreshold;
+};
+
+}
+#endif // TENSORFLOW_COMMON_RUNTIME_TENSORPOOL_MKL_ALLOCATOR_H_
\ No newline at end of file
diff --git a/tensorflow/core/common_runtime/threadpool_device.cc b/tensorflow/core/common_runtime/threadpool_device.cc
index 4887c99f9b4..2e60b6483aa 100644
--- a/tensorflow/core/common_runtime/threadpool_device.cc
+++ b/tensorflow/core/common_runtime/threadpool_device.cc
@@ -18,6 +18,7 @@ limitations under the License.
 #include "tensorflow/core/common_runtime/local_device.h"
 #include "tensorflow/core/common_runtime/scoped_allocator.h"
 #include "tensorflow/core/common_runtime/scoped_allocator_mgr.h"
+#include "tensorflow/core/common_runtime/tensorpool_mkl_allocator.h"
 #include "tensorflow/core/framework/allocator.h"
 #include "tensorflow/core/framework/allocator_registry.h"
 #include "tensorflow/core/framework/device_base.h"
@@ -123,17 +124,17 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
  public:
   bool NumaEnabled() override { return false; }
 
-  Allocator* CreateAllocator() override { return new TensorPoolAllocator; }
+  Allocator* CreateAllocator() override { return new TensorPoolMklAllocator; }
 
   // Note: Ignores numa_node, for now.
   virtual SubAllocator* CreateSubAllocator(int numa_node) {
-    return new TensorPoolSubAllocator(new TensorPoolAllocator);
+    return new TensorPoolSubAllocator(new TensorPoolMklAllocator);
   }
 
  private:
  class TensorPoolSubAllocator : public SubAllocator {
   public:
-    explicit TensorPoolSubAllocator(TensorPoolAllocator* allocator)
+    explicit TensorPoolSubAllocator(TensorPoolMklAllocator* allocator)
     : SubAllocator({}, {}), allocator_(allocator) {}
 
    void* Alloc(size_t alignment, size_t num_bytes) override {
@@ -145,7 +146,7 @@ class MklCPUAllocatorFactory : public AllocatorFactory {
   }
 
   private:
-    TensorPoolAllocator* allocator_;
+    TensorPoolMklAllocator* allocator_;
  };
 