1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
// A simple CPU allocator that intercepts malloc/free calls from the MKL
// library and redirects them to the TensorFlow allocator.
18 | |
19 | #ifndef TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ |
20 | #define TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ |
21 | |
22 | #ifdef INTEL_MKL |
23 | |
24 | #include <cstdlib> |
25 | |
26 | #include "tensorflow/core/common_runtime/bfc_allocator.h" |
27 | #include "tensorflow/core/common_runtime/pool_allocator.h" |
28 | #include "tensorflow/core/lib/strings/numbers.h" |
29 | #include "tensorflow/core/lib/strings/str_util.h" |
30 | #include "tensorflow/core/platform/mem.h" |
31 | #include "tensorflow/core/platform/numa.h" |
32 | #include "tensorflow/core/util/env_var.h" |
33 | #include "tensorflow/core/util/onednn_env_vars.h" |
34 | #ifdef _WIN32 |
35 | typedef unsigned int uint; |
36 | #endif |
37 | |
38 | namespace tensorflow { |
39 | |
40 | static bool mkl_small_allocator_collect_stats = false; |
41 | |
42 | class MklSubAllocator : public BasicCPUAllocator { |
43 | public: |
44 | MklSubAllocator() : BasicCPUAllocator(port::kNUMANoAffinity, {}, {}) {} |
45 | ~MklSubAllocator() override {} |
46 | }; |
47 | |
48 | // CPU allocator that handles small-size allocations by calling |
49 | // suballocator directly. Mostly, it is just a wrapper around a suballocator |
50 | // (that calls malloc and free directly) with support for bookkeeping. |
51 | class MklSmallSizeAllocator : public Allocator { |
52 | public: |
53 | MklSmallSizeAllocator(SubAllocator* sub_allocator, size_t total_memory, |
54 | const string& name) |
55 | : sub_allocator_(sub_allocator), name_(name) { |
56 | stats_.bytes_limit = total_memory; |
57 | } |
58 | ~MklSmallSizeAllocator() override {} |
59 | |
60 | TF_DISALLOW_COPY_AND_ASSIGN(MklSmallSizeAllocator); |
61 | |
62 | inline string Name() override { return name_; } |
63 | |
64 | void* AllocateRaw(size_t alignment, size_t num_bytes) override { |
65 | void* ptr = port::AlignedMalloc(num_bytes, alignment); |
66 | if (mkl_small_allocator_collect_stats) IncrementStats(num_bytes); |
67 | return ptr; |
68 | } |
69 | |
70 | void DeallocateRaw(void* ptr) override { |
71 | if (ptr == nullptr) { |
72 | LOG(ERROR) << "tried to deallocate nullptr" ; |
73 | return; |
74 | } |
75 | |
76 | if (mkl_small_allocator_collect_stats) { |
77 | const size_t alloc_size = port::MallocExtension_GetAllocatedSize(ptr); |
78 | DecrementStats(alloc_size); |
79 | } |
80 | port::AlignedFree(ptr); |
81 | } |
82 | |
83 | absl::optional<AllocatorStats> GetStats() override { |
84 | mutex_lock l(mutex_); |
85 | return stats_; |
86 | } |
87 | |
88 | bool ClearStats() override { |
89 | mutex_lock l(mutex_); |
90 | stats_.num_allocs = 0; |
91 | stats_.peak_bytes_in_use = 0; |
92 | stats_.largest_alloc_size = 0; |
93 | stats_.bytes_in_use = 0; |
94 | stats_.bytes_limit = 0; |
95 | return true; |
96 | } |
97 | |
98 | private: |
99 | // Increment statistics for the allocator handling small allocations. |
100 | inline void IncrementStats(size_t alloc_size) TF_LOCKS_EXCLUDED(mutex_) { |
101 | mutex_lock l(mutex_); |
102 | ++stats_.num_allocs; |
103 | stats_.bytes_in_use += alloc_size; |
104 | stats_.peak_bytes_in_use = |
105 | std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); |
106 | stats_.largest_alloc_size = |
107 | std::max(alloc_size, static_cast<size_t>(stats_.largest_alloc_size)); |
108 | } |
109 | |
110 | // Decrement statistics for the allocator handling small allocations. |
111 | inline void DecrementStats(size_t dealloc_size) TF_LOCKS_EXCLUDED(mutex_) { |
112 | mutex_lock l(mutex_); |
113 | stats_.bytes_in_use -= dealloc_size; |
114 | } |
115 | |
116 | SubAllocator* sub_allocator_; // Not owned by this class. |
117 | |
118 | // Mutex for protecting updates to map of allocations. |
119 | mutable mutex mutex_; |
120 | |
121 | // Allocator name |
122 | string name_; |
123 | |
124 | // Allocator stats for small allocs |
125 | AllocatorStats stats_ TF_GUARDED_BY(mutex_); |
126 | }; |
127 | |
/// CPU allocator for MKL that wraps BFC allocator and intercepts
/// and redirects memory allocation calls from MKL.
///
/// Allocations below kSmallAllocationsThreshold are served by a pass-through
/// small-size allocator; larger ones go to a BFC allocator. A hash map of
/// the BFC ("large") pointers lets DeallocateRaw route each pointer back to
/// the allocator that produced it.
class MklCPUAllocator : public Allocator {
 public:
  // Constructor and other standard functions

  /// Environment variable that the user can set to place an upper bound
  /// (in bytes) on memory allocated by this allocator.
  static constexpr const char* kMaxLimitStr = "TF_MKL_ALLOC_MAX_BYTES" ;

  /// Default upper limit on allocator size - 64GB
  static constexpr size_t kDefaultMaxLimit = 64LL << 30;

  // Note: Initialize() can CHECK-fail the process (e.g. on a malformed
  // TF_MKL_ALLOC_MAX_BYTES value).
  MklCPUAllocator() { TF_CHECK_OK(Initialize()); }

  ~MklCPUAllocator() override {
    delete small_size_allocator_;
    delete large_size_allocator_;
    // sub_allocator_ is NOT deleted here: ownership was transferred to
    // large_size_allocator_ via absl::WrapUnique in Initialize().
  }

  // Creates the small- and large-size allocators. The memory limit defaults
  // to physical RAM when sysconf can report it (kDefaultMaxLimit otherwise)
  // and may be overridden via TF_MKL_ALLOC_MAX_BYTES.
  Status Initialize() {
    VLOG(2) << "MklCPUAllocator: In MklCPUAllocator" ;

    // Set upper bound on memory allocation to physical RAM available on the
    // CPU unless explicitly specified by user
    uint64 max_mem_bytes = kDefaultMaxLimit;
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
    max_mem_bytes =
        (uint64)sysconf(_SC_PHYS_PAGES) * (uint64)sysconf(_SC_PAGESIZE);
#endif
    char* user_mem_bytes = getenv(kMaxLimitStr);

    if (user_mem_bytes != NULL) {
      uint64 user_val = 0;
      if (!strings::safe_strtou64(user_mem_bytes, &user_val)) {
        return errors::InvalidArgument("Invalid memory limit (" , user_mem_bytes,
                                       ") specified for MKL allocator through " ,
                                       kMaxLimitStr);
      }
#if defined(_SC_PHYS_PAGES) && defined(_SC_PAGESIZE)
      // A user limit above physical RAM is still honored; only warn, since
      // it may cause swapping and reduce performance.
      if (user_val > max_mem_bytes) {
        LOG(WARNING) << "The user specified a memory limit " << kMaxLimitStr
                     << "=" << user_val
                     << " greater than available physical memory: "
                     << max_mem_bytes
                     << ". This could significantly reduce performance!" ;
      }
#endif
      max_mem_bytes = user_val;
    }

    VLOG(1) << "MklCPUAllocator: Setting max_mem_bytes: " << max_mem_bytes;

    sub_allocator_ = new MklSubAllocator();

    // The SubAllocator is shared with (but not owned by) the small-size
    // allocator; ownership goes to the BFCAllocator below, so neither this
    // class nor MklSmallSizeAllocator deallocates it.
    small_size_allocator_ =
        new MklSmallSizeAllocator(sub_allocator_, max_mem_bytes, kName);

    BFCAllocator::Options large_allocator_opts;
    large_allocator_opts.allow_growth = kAllowGrowth;
    large_size_allocator_ =
        new BFCAllocator(absl::WrapUnique(sub_allocator_), max_mem_bytes, kName,
                         large_allocator_opts);
    return OkStatus();
  }

  inline string Name() override { return kName; }
  // A pointer is "small" iff it is absent from large_allocations_map_, i.e.
  // it was not handed out by the BFC (large) allocator.
  inline bool IsSmallSizeAllocation(const void* ptr) const
      TF_LOCKS_EXCLUDED(mutex_) {
    mutex_lock l(mutex_);
    return large_allocations_map_.find(ptr) == large_allocations_map_.end();
  }
  // AddLargeAllocMap and RemoveLargeAllocMap are always called with a lock held
  inline void AddLargeAllocMap(void* ptr, size_t num_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    if (ptr != nullptr) {
      std::pair<void*, size_t> map_val(ptr, num_bytes);
      large_allocations_map_.insert(map_val);
    }
  }
  inline void RemoveLargeAllocMap(void* ptr)
      TF_EXCLUSIVE_LOCKS_REQUIRED(mutex_) {
    auto map_iter = large_allocations_map_.find(ptr);
    if (map_iter != large_allocations_map_.end()) {
      large_allocations_map_.erase(map_iter);
    } else {
      // Pointer was not produced by the large allocator (or was already
      // freed); log and continue rather than crash.
      LOG(ERROR) << "tried to deallocate invalid pointer" ;
    }
    return;
  }

  inline void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    // If the allocation size is less than threshold, call small allocator,
    // otherwise call large-size allocator (BFC). We found that BFC allocator
    // does not deliver good performance for small allocations when
    // inter_op_parallelism_threads is high.
    if (UseSystemAlloc() || num_bytes < kSmallAllocationsThreshold) {
      return small_size_allocator_->AllocateRaw(alignment, num_bytes);
    } else {
      // mutex_ is held across both the BFC allocation and the map insert so
      // no other thread can observe a large pointer missing from the map.
      mutex_lock l(mutex_);
      void* ptr = large_size_allocator_->AllocateRaw(alignment, num_bytes);
      AddLargeAllocMap(ptr, num_bytes);
      return ptr;
    }
  }
  inline void DeallocateRaw(void* ptr) override {
    // Check if ptr is for "small" allocation. If it is, then call Free
    // directly. Otherwise, call BFC to handle free.
    if (UseSystemAlloc() || IsSmallSizeAllocation(ptr)) {
      small_size_allocator_->DeallocateRaw(ptr);
    } else {
      mutex_lock l(mutex_);
      RemoveLargeAllocMap(ptr);
      large_size_allocator_->DeallocateRaw(ptr);
    }
  }
  // Aggregates statistics from the small- and large-size allocators.
  // NOTE(review): s_stats/l_stats are dereferenced without a has_value()
  // check; both sub-allocators currently always return engaged optionals —
  // confirm if either contract ever changes.
  absl::optional<AllocatorStats> GetStats() override {
    auto s_stats = small_size_allocator_->GetStats();
    auto l_stats = large_size_allocator_->GetStats();

    // Combine statistics from small-size and large-size allocator.
    mutex_lock l(mutex_);
    stats_.num_allocs = l_stats->num_allocs + s_stats->num_allocs;
    stats_.bytes_in_use = l_stats->bytes_in_use + s_stats->bytes_in_use;
    stats_.peak_bytes_in_use =
        l_stats->peak_bytes_in_use + s_stats->peak_bytes_in_use;

    // Since small-size allocations go to MklSmallSizeAllocator,
    // max_alloc_size from large_size_allocator would be the maximum
    // size allocated by MklCPUAllocator.
    stats_.largest_alloc_size = l_stats->largest_alloc_size;
    stats_.bytes_limit = std::max(s_stats->bytes_limit, l_stats->bytes_limit);
    return stats_;
  }

  // Clears stats on both sub-allocators; true only if both succeed.
  bool ClearStats() override {
    bool stats_cleared = small_size_allocator_->ClearStats();
    stats_cleared &= large_size_allocator_->ClearStats();
    return stats_cleared;
  }

 private:
  // Hooks provided by this allocator for memory allocation routines from MKL
  static inline void* MallocHook(size_t size) {
    VLOG(3) << "MklCPUAllocator: In MallocHook" ;
    return cpu_allocator()->AllocateRaw(kAlignment, size);
  }

  static inline void FreeHook(void* ptr) {
    VLOG(3) << "MklCPUAllocator: In FreeHook" ;
    cpu_allocator()->DeallocateRaw(ptr);
  }

  // calloc/realloc redirection is deliberately unsupported: reaching either
  // hook CHECK-fails the process with an UNIMPLEMENTED status.
  static inline void* CallocHook(size_t num, size_t size) {
    Status s = Status(error::Code::UNIMPLEMENTED,
                      "Unimplemented case for hooking MKL function." );
    TF_CHECK_OK(s);  // way to assert with an error message
    return nullptr;  // return a value and make static code analyzers happy
  }

  static inline void* ReallocHook(void* ptr, size_t size) {
    Status s = Status(error::Code::UNIMPLEMENTED,
                      "Unimplemented case for hooking MKL function." );
    TF_CHECK_OK(s);  // way to assert with an error message
    return nullptr;  // return a value and make static code analyzers happy
  }

  // Do we allow growth in BFC Allocator
  static const bool kAllowGrowth = true;

  // Name reported by Name() and passed to both sub-allocators.
  static constexpr const char* kName = "mklcpu" ;

  // The alignment that we need for the allocations
  static constexpr const size_t kAlignment = 64;

  Allocator* large_size_allocator_;              // owned by this class
  MklSmallSizeAllocator* small_size_allocator_;  // owned by this class.

  SubAllocator* sub_allocator_;  // not owned by this class
  mutable mutex mutex_;
  AllocatorStats stats_ TF_GUARDED_BY(mutex_);

  // Hash map to keep track of "BFC" allocations
  // We do not use BFC allocator for small allocations.
  std::unordered_map<const void*, size_t> large_allocations_map_
      TF_GUARDED_BY(mutex_);

  // Size in bytes that defines the upper-bound for "small" allocations.
  // Any allocation below this threshold is "small" allocation.
  static constexpr const size_t kSmallAllocationsThreshold = 4096;

  // Prevent copying and assignment
  TF_DISALLOW_COPY_AND_ASSIGN(MklCPUAllocator);
};
324 | |
325 | } // namespace tensorflow |
326 | |
327 | #endif // INTEL_MKL |
328 | |
329 | #endif // TENSORFLOW_CORE_COMMON_RUNTIME_MKL_CPU_ALLOCATOR_H_ |
330 | |