1/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
// A simple CPU allocator that intercepts malloc/free calls from the MKL
// library and redirects them to the TensorFlow allocator
18
19#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
20#define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
21
22#ifdef INTEL_MKL
23
24#include <cstdlib>
25
26#include "tensorflow/core/common_runtime/bfc_allocator.h"
27#include "tensorflow/core/common_runtime/pool_allocator.h"
28#include "tensorflow/core/lib/strings/numbers.h"
29#include "tensorflow/core/lib/strings/str_util.h"
30#include "tensorflow/core/platform/mem.h"
31#include "tensorflow/core/platform/numa.h"
32#include "tensorflow/core/util/env_var.h"
33#include "tensorflow/core/util/onednn_env_vars.h"
34#ifdef _WIN32
35typedef unsigned int uint;
36#endif
37
38namespace tensorflow {
39
// Global toggle: when true, MklSmallSizeAllocator updates AllocatorStats on
// every AllocateRaw/DeallocateRaw; off by default so the small-allocation
// hot path skips the stats mutex.
// NOTE(review): this is a non-const file-level static in a header, so each
// translation unit gets its own independent copy — confirm that is intended.
static bool mkl_small_allocator_collect_stats = false;
41
42class MklSubAllocator : public BasicCPUAllocator {
43 public:
44 MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {}
45 ~MklSubAllocator() override {}
46};
47
48// CPU allocator that handles small-size allocations by calling
49// suballocator directly. Mostly, it is just a wrapper around a suballocator
50// (that calls malloc and free directly) with support for bookkeeping.
51class MklSmallSizeAllocator : public Allocator {
52 public:
53 MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory,
54 const string& name)
55 : sub_allocator_(sub_allocator), name_(name) {
56 stats_.bytes_limit = total_memory;
57 }
58 ~MklSmallSizeAllocator() override {}
59
60 TF_DISALLOW_COPY_AND_ASSIGN(MklSmallSizeAllocator);
61
62 inline string Name() override { return name_; }
63
64 void* AllocateRaw(size_t alignment, size_t num_bytes) override {
65 void* ptr = port::AlignedMalloc(num_bytes, alignment);
66 if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes);
67 return ptr;
68 }
69
70 void DeallocateRaw(void* ptr) override {
71 if (ptr == nullptr) {
72 LOG(ERROR) << "tried to deallocate nullptr";
73 return;
74 }
75
76 if (mkl_small_allocator_collect_stats) {
77 const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr);
78 DecrementStats(alloc_size);
79 }
80 port::AlignedFree(ptr);
81 }
82
83 absl::optional<AllocatorStats> GetStats() override {
84 mutex_lock l(mutex_);
85 return stats_;
86 }
87
88 bool ClearStats() override {
89 mutex_lock l(mutex_);
90 stats_.num_allocs = 0;
91 stats_.peak_bytes_in_use = 0;
92 stats_.largest_alloc_size = 0;
93 stats_.bytes_in_use = 0;
94 stats_.bytes_limit = 0;
95 return true;
96 }
97
98 private:
99 // Increment statistics for the allocator handling small allocations.
100 inline void IncrementStats(size_t alloc_size) TF_LOCKS_EXCLUDED(mutex_) {
101 mutex_lock l(mutex_);
102 ++stats_.num_allocs;
103 stats_.bytes_in_use += alloc_size;
104 stats_.peak_bytes_in_use =
105 std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
106 stats_.largest_alloc_size =
107 std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size));
108 }
109
110 // Decrement statistics for the allocator handling small allocations.
111 inline void DecrementStats(size_t dealloc_size) TF_LOCKS_EXCLUDED(mutex_) {
112 mutex_lock l(mutex_);
113 stats_.bytes_in_use -= dealloc_size;
114 }
115
116 SubAllocator* sub_allocator_; // Not owned by this class.
117
118 // Mutex for protecting updates to map of allocations.
119 mutable mutex mutex_;
120
121 // Allocator name
122 string name_;
123
124 // Allocator stats for small allocs
125 AllocatorStats stats_ TF_GUARDED_BY(mutex_);
126};
127
128/// CPU allocator for MKL that wraps BFC allocator and intercepts
129/// and redirects memory allocation calls from MKL.
130class MklCPUAllocator : public Allocator {
131 public:
132 // Constructor and other standard functions
133
134 /// Environment variable that user can set to upper bound on memory allocation
135 static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES";
136
137 /// Default upper limit on allocator size - 64GB
138 static constexpr size_t kDefaultMaxLimit = 64LL << 30;
139
140 MklCPUAllocator() { TF_CHECK_OK(Initialize()); }
141
142 ~MklCPUAllocator() override {
143 delete small_size_allocator_;
144 delete large_size_allocator_;
145 }
146
147 Status Initialize() {
148 VLOG(2) << "MklCPUAllocator: In MklCPUAllocator";
149
150 // Set upper bound on memory allocation to physical RAM available on the
151 // CPU unless explicitly specified by user
152 uint64 max_mem_bytes = kDefaultMaxLimit;
153#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
154 max_mem_bytes =
155 (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE);
156#endif
157 char* user_mem_bytes = getenv(kMaxLimitStr);
158
159 if (user_mem_bytes != NULL) {
160 uint64 user_val = 0;
161 if (!strings::safe_strtou64(user_mem_bytes, &user_val)) {
162 return errors::InvalidArgument("Invalid memory limit (", user_mem_bytes,
163 ") specified for MKL allocator through ",
164 kMaxLimitStr);
165 }
166#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
167 if (user_val > max_mem_bytes) {
168 LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
169 << "=" << user_val
170 << " greater than available physical memory: "
171 << max_mem_bytes
172 << ". This could significantly reduce performance!";
173 }
174#endif
175 max_mem_bytes = user_val;
176 }
177
178 VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes;
179
180 sub_allocator_ = new MklSubAllocator();
181
182 // SubAllocator is owned by BFCAllocator, so we do not need to deallocate
183 // it in MklSmallSizeAllocator.
184 small_size_allocator_ =
185 new MklSmallSizeAllocator(sub_allocator_, max_mem_bytes, kName);
186
187 BFCAllocator::Options large_allocator_opts;
188 large_allocator_opts.allow_growth = kAllowGrowth;
189 large_size_allocator_ =
190 new BFCAllocator(absl::WrapUnique(sub_allocator_), max_mem_bytes, kName,
191 large_allocator_opts);
192 return OkStatus();
193 }
194
195 inline string Name() override { return kName; }
196 inline bool IsSmallSizeAllocation(const void* ptr) const
197 TF_LOCKS_EXCLUDED(mutex_) {
198 mutex_lock l(mutex_);
199 return large_allocations_map_.find(ptr) == large_allocations_map_.end();
200 }
201 // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
202 inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
203 TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
204 if (ptr != nullptr) {
205 std::pair<void*, size_t> map_val(ptr, num_bytes);
206 large_allocations_map_.insert(map_val);
207 }
208 }
209 inline void RemoveLargeAllocMap(void* ptr)
210 TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
211 auto map_iter = large_allocations_map_.find(ptr);
212 if (map_iter != large_allocations_map_.end()) {
213 large_allocations_map_.erase(map_iter);
214 } else {
215 LOG(ERROR) << "tried to deallocate invalid pointer";
216 }
217 return;
218 }
219
220 inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
221 // If the allocation size is less than threshold, call small allocator,
222 // otherwise call large-size allocator (BFC). We found that BFC allocator
223 // does not deliver good performance for small allocations when
224 // inter_op_parallelism_threads is high.
225 if (UseSystemAlloc() || num_bytes < kSmallAllocationsThreshold) {
226 return small_size_allocator_->AllocateRaw(alignment, num_bytes);
227 } else {
228 mutex_lock l(mutex_);
229 void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes);
230 AddLargeAllocMap(ptr, num_bytes);
231 return ptr;
232 }
233 }
234 inline void DeallocateRaw(void* ptr) override {
235 // Check if ptr is for "small" allocation. If it is, then call Free
236 // directly. Otherwise, call BFC to handle free.
237 if (UseSystemAlloc() || IsSmallSizeAllocation(ptr)) {
238 small_size_allocator_->DeallocateRaw(ptr);
239 } else {
240 mutex_lock l(mutex_);
241 RemoveLargeAllocMap(ptr);
242 large_size_allocator_->DeallocateRaw(ptr);
243 }
244 }
245 absl::optional<AllocatorStats> GetStats() override {
246 auto s_stats = small_size_allocator_->GetStats();
247 auto l_stats = large_size_allocator_->GetStats();
248
249 // Combine statistics from small-size and large-size allocator.
250 mutex_lock l(mutex_);
251 stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs;
252 stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use;
253 stats_.peak_bytes_in_use =
254 l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use;
255
256 // Since small-size allocations go to MklSmallSizeAllocator,
257 // max_alloc_size from large_size_allocator would be the maximum
258 // size allocated by MklCPUAllocator.
259 stats_.largest_alloc_size = l_stats->largest_alloc_size;
260 stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit);
261 return stats_;
262 }
263
264 bool ClearStats() override {
265 bool stats_cleared = small_size_allocator_->ClearStats();
266 stats_cleared &= large_size_allocator_->ClearStats();
267 return stats_cleared;
268 }
269
270 private:
271 // Hooks provided by this allocator for memory allocation routines from MKL
272 static inline void* MallocHook(size_t size) {
273 VLOG(3) << "MklCPUAllocator: In MallocHook";
274 return cpu_allocator()->AllocateRaw(kAlignment, size);
275 }
276
277 static inline void FreeHook(void* ptr) {
278 VLOG(3) << "MklCPUAllocator: In FreeHook";
279 cpu_allocator()->DeallocateRaw(ptr);
280 }
281
282 static inline void* CallocHook(size_t num, size_t size) {
283 Status s = Status(error::Code::UNIMPLEMENTED,
284 "Unimplemented case for hooking MKL function.");
285 TF_CHECK_OK(s); // way to assert with an error message
286 return nullptr; // return a value and make static code analyzers happy
287 }
288
289 static inline void* ReallocHook(void* ptr, size_t size) {
290 Status s = Status(error::Code::UNIMPLEMENTED,
291 "Unimplemented case for hooking MKL function.");
292 TF_CHECK_OK(s); // way to assert with an error message
293 return nullptr; // return a value and make static code analyzers happy
294 }
295
296 // Do we allow growth in BFC Allocator
297 static const bool kAllowGrowth = true;
298
299 // Name
300 static constexpr const char* kName = "mklcpu";
301
302 // The alignment that we need for the allocations
303 static constexpr const size_t kAlignment = 64;
304
305 Allocator* large_size_allocator_; // owned by this class
306 MklSmallSizeAllocator* small_size_allocator_; // owned by this class.
307
308 SubAllocator* sub_allocator_; // not owned by this class
309 mutable mutex mutex_;
310 AllocatorStats stats_ TF_GUARDED_BY(mutex_);
311
312 // Hash map to keep track of "BFC" allocations
313 // We do not use BFC allocator for small allocations.
314 std::unordered_map<const void*, size_t> large_allocations_map_
315 TF_GUARDED_BY(mutex_);
316
317 // Size in bytes that defines the upper-bound for "small" allocations.
318 // Any allocation below this threshold is "small" allocation.
319 static constexpr const size_t kSmallAllocationsThreshold = 4096;
320
321 // Prevent copying and assignment
322 TF_DISALLOW_COPY_AND_ASSIGN(MklCPUAllocator);
323};
324
325} // namespace tensorflow
326
327#endif // INTEL_MKL
328
329#endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_
330