/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#define TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_
#include <algorithm>
#include <array>
#include <atomic>
#include <cstdint>
#include <deque>
#include <memory>
#include <set>
#include <string>
#include <unordered_map>
#include <vector>

#include "absl/container/flat_hash_set.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/common_runtime/shared_counter.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

class MemoryDump;

// A memory allocator that implements a 'best-fit with coalescing'
// algorithm. This is essentially a very simple version of Doug Lea's
// malloc (dlmalloc).
//
// The goal of this allocator is to support defragmentation via
// coalescing. One assumption we make is that the process using this
// allocator owns pretty much all of the memory, and that nearly
// all requests to allocate memory go through this interface.
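//
// A minimal usage sketch (hedged: the sizes, the "example_bfc" name, and the
// provenance of 'sub_allocator' are hypothetical; a real SubAllocator comes
// from device-specific code):
//
//   BFCAllocator::Options opts;
//   opts.allow_growth = true;
//   BFCAllocator allocator(std::move(sub_allocator), /*total_memory=*/1 << 30,
//                          /*name=*/"example_bfc", opts);
//   void* p = allocator.AllocateRaw(/*alignment=*/64, /*num_bytes=*/1024);
//   allocator.DeallocateRaw(p);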
class BFCAllocator : public Allocator {
 public:
  struct Options {
    bool allow_growth = true;

    // If true, the allocator may sleep for a period of time when it can't
    // fulfill an allocation request, in the hopes that another thread will
    // free up memory in the meantime.
    //
    // If false, the allocator will never sleep, even if
    // AllocationAttributes::attr_retry_on_failure is true.
    bool allow_retry_on_failure = true;

    // Whether the allocator will deallocate free regions to avoid OOM due to
    // memory fragmentation.
    bool garbage_collection = false;

    // Controls when a chunk should be split, if its size exceeds the
    // requested allocation size.
    double fragmentation_fraction = 0;
  };
  BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator,
               size_t total_memory, const string& name, const Options& opts);

  ~BFCAllocator() override;

  string Name() override { return name_; }

  void* AllocateRaw(size_t alignment, size_t num_bytes) override {
    return AllocateRaw(alignment, num_bytes, AllocationAttributes());
  }

  void* AllocateRaw(size_t alignment, size_t num_bytes,
                    const AllocationAttributes& allocation_attr) override;

  void DeallocateRaw(void* ptr) override;

  bool TracksAllocationSizes() const override;

  size_t RequestedSize(const void* ptr) const override;

  size_t AllocatedSize(const void* ptr) const override;

  int64_t AllocationId(const void* ptr) const override;

  absl::optional<AllocatorStats> GetStats() override;

  bool ClearStats() override;

  void SetTimingCounter(SharedCounter* sc) { timing_counter_ = sc; }

  void SetSafeFrontier(uint64 count) override;

  AllocatorMemoryType GetMemoryType() const override;

  bool ShouldRecordOpName() const { return true; }

  MemoryDump RecordMemoryMap();
 private:
  struct Bin;

  void* AllocateRawInternal(size_t alignment, size_t num_bytes,
                            bool dump_log_on_failure,
                            uint64 freed_before_count);

  void* AllocateRawInternalWithRetry(
      size_t alignment, size_t num_bytes,
      const AllocationAttributes& allocation_attr);

  void DeallocateRawInternal(void* ptr);

  // Chunks whose freed_at_count is later than the safe frontier value are kept
  // on a special list and not subject to merging immediately upon being freed.
  //
  // This function sweeps that list looking for Chunks whose timestamp is now
  // safe. When found their freed_at_count is set to 0 and we attempt to merge
  // them with their neighbors.
  //
  // If required_bytes > 0 then this function is being called in the context of
  // a need for this many bytes that could not be satisfied without merging
  // unsafe chunks, so we go ahead and merge the unsafe chunks too, just up to
  // the point that a free chunk of required_bytes is produced. Note that
  // unsafe merged chunks adopt the most conservative timestamp from their
  // constituents so they're only useful for allocations not requiring a
  // particular timestamp.
  bool MergeTimestampedChunks(size_t required_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  // Returns the size in bytes of the largest free chunk, found in the largest
  // bin, in constant time. Free chunks within a bin are sorted by size (and
  // then by address).
  int64_t LargestFreeChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds a TraceMe event (on memory allocation and deallocation) for memory
  // stats profiling. The chunk pointer is used to look up information such as
  // the address, chunk size, and requested_size.
  void AddTraceMe(absl::string_view traceme_name, const void* ptr)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Overloaded AddTraceMe function with chunk information.
  void AddTraceMe(absl::string_view traceme_name, const void* chunk_ptr,
                  int64_t req_bytes, int64_t alloc_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // A ChunkHandle is an index into the chunks_ vector in BFCAllocator;
  // kInvalidChunkHandle means an invalid chunk handle.
  typedef size_t ChunkHandle;
  static constexpr ChunkHandle kInvalidChunkHandle = SIZE_MAX;

  typedef int BinNum;
  static constexpr int kInvalidBinNum = -1;
  // Bin i holds free chunks of at least 256 << i bytes, so with kNumBins = 21
  // the largest bin's minimum chunk size is 256 << 20 = 256MB, and that bin
  // holds every free chunk at least that large.
  static constexpr int kNumBins = 21;
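  // For example, bin 0 holds free chunks of size [256B, 512B) and bin 12
  // holds [1MB, 2MB); BinNumForSize() below computes this mapping.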

  // A Chunk points to a piece of memory that's either entirely free or
  // entirely in use by one user memory allocation.
  //
  // An AllocationRegion's memory is split up into one or more disjoint Chunks,
  // which together cover the whole region without gaps. Chunks participate in
  // a doubly-linked list, and the prev/next pointers point to the physically
  // adjacent chunks.
  //
  // Since a chunk cannot be partially in use, we may need to split a free
  // chunk in order to service a user allocation. We always merge adjacent
  // free chunks.
  //
  // Chunks contain information about whether they are in use or whether they
  // are free, and contain a pointer to the bin they are in.
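  //
  // For example (sizes illustrative): a 300-byte request rounds up to 512
  // bytes, and servicing it from a free 1024-byte chunk may split that chunk
  // into an in-use 512-byte chunk and a free 512-byte remainder. When the
  // in-use chunk is later freed while its neighbor is still free, the two are
  // merged back into a single 1024-byte free chunk.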
  struct Chunk {
    size_t size = 0;  // Full size of buffer.

    // We sometimes give chunks that are larger than needed to reduce
    // fragmentation. requested_size keeps track of what the client
    // actually wanted so we can understand whether our splitting
    // strategy is efficient.
    size_t requested_size = 0;

    // allocation_id is set to -1 when the chunk is not in use. It is assigned
    // a value greater than zero before the chunk is returned from
    // AllocateRaw, and this value is unique among values assigned by
    // the parent allocator.
    int64_t allocation_id = -1;
    void* ptr = nullptr;  // pointer to granted subbuffer.

    // If not kInvalidChunkHandle, the memory referred to by 'prev' directly
    // precedes the memory used by this chunk, i.e., it starts at
    // 'ptr - prev->size'.
    ChunkHandle prev = kInvalidChunkHandle;

    // If not kInvalidChunkHandle, the memory referred to by 'next' directly
    // follows the memory used by this chunk, i.e., it starts at 'ptr + size'.
    ChunkHandle next = kInvalidChunkHandle;

    // What bin are we in?
    BinNum bin_num = kInvalidBinNum;

    // Optional count when this chunk was most recently made free.
    uint64 freed_at_count = 0;

    bool in_use() const { return allocation_id != -1; }

#ifdef TENSORFLOW_MEM_DEBUG
    // Optional debugging info.
    const char* op_name = nullptr;
    uint64 step_id = 0;
    int64 action_count = 0;
#endif

    string DebugString(BFCAllocator* a,
                       bool recurse) TF_NO_THREAD_SAFETY_ANALYSIS {
      string dbg;
      strings::StrAppend(
          &dbg, " Size: ", strings::HumanReadableNumBytes(size),
          " | Requested Size: ",
          strings::HumanReadableNumBytes(requested_size),
          " | in_use: ", in_use(), " | bin_num: ", bin_num);
      if (recurse && prev != BFCAllocator::kInvalidChunkHandle) {
        Chunk* p = a->ChunkFromHandle(prev);
        strings::StrAppend(&dbg, ", prev: ", p->DebugString(a, false));
      }
      if (recurse && next != BFCAllocator::kInvalidChunkHandle) {
        Chunk* n = a->ChunkFromHandle(next);
        strings::StrAppend(&dbg, ", next: ", n->DebugString(a, false));
      }
#ifdef TENSORFLOW_MEM_DEBUG
      strings::StrAppend(&dbg, ", for: ", op_name ? op_name : "UNKNOWN",
                         ", stepid: ", step_id,
                         ", last_action: ", action_count);
#endif
      return dbg;
    }
  };

  // A Bin is a collection of similar-sized free chunks.
  // Allocated chunks are never in a Bin.
  struct Bin {
    // All chunks in this bin have >= bin_size memory.
    size_t bin_size = 0;

    class ChunkComparator {
     public:
      explicit ChunkComparator(BFCAllocator* allocator)
          : allocator_(allocator) {}
      // Sort first by size and then use pointer address as a tie breaker.
      bool operator()(const ChunkHandle ha, const ChunkHandle hb) const
          TF_NO_THREAD_SAFETY_ANALYSIS {
        const Chunk* a = allocator_->ChunkFromHandle(ha);
        const Chunk* b = allocator_->ChunkFromHandle(hb);
        if (a->size != b->size) {
          return a->size < b->size;
        }
        return a->ptr < b->ptr;
      }

     private:
      BFCAllocator* allocator_;  // The parent allocator.
    };

    typedef std::set<ChunkHandle, ChunkComparator> FreeChunkSet;
    // List of free chunks within the bin, sorted by chunk size.
    // Chunk * not owned.
    FreeChunkSet free_chunks;
    Bin(BFCAllocator* allocator, size_t bs)
        : bin_size(bs), free_chunks(ChunkComparator(allocator)) {}
  };

  static constexpr size_t kMinAllocationBits = 8;
  static constexpr size_t kMinAllocationSize = 1 << kMinAllocationBits;
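  // Because every AllocationRegion's size is a multiple of kMinAllocationSize
  // and chunk sizes are rounded up to multiples of it as well (see
  // RoundedBytes() below), chunk boundaries always fall on 256-byte offsets,
  // which is what makes the handles_ index arithmetic in AllocationRegion
  // below work.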

  // BFCAllocator allocates memory into a collection of disjoint
  // AllocationRegions. Each AllocationRegion corresponds to one call to
  // SubAllocator::Alloc(). (Actually, if a subsequent call to
  // SubAllocator::Alloc() returns another region immediately adjacent to the
  // last, it will be used to extend the first AllocationRegion, not create a
  // separate one.)
  //
  // An AllocationRegion contains one or more Chunks, covering all of its
  // memory. Its primary job is to map pointers to ChunkHandles.
  //
  // This class is thread-compatible.
  class AllocationRegion {
   public:
    AllocationRegion(void* ptr, size_t memory_size)
        : ptr_(ptr),
          memory_size_(memory_size),
          end_ptr_(
              static_cast<void*>(static_cast<char*>(ptr_) + memory_size_)) {
      DCHECK_EQ(0, memory_size % kMinAllocationSize);
      const size_t n_handles =
          (memory_size + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.resize(n_handles, kInvalidChunkHandle);
    }

    AllocationRegion() = default;
    AllocationRegion(AllocationRegion&& other) { Swap(&other); }
    AllocationRegion& operator=(AllocationRegion&& other) {
      Swap(&other);
      return *this;
    }

    void* ptr() const { return ptr_; }
    void* end_ptr() const { return end_ptr_; }
    size_t memory_size() const { return memory_size_; }
    void extend(size_t size) {
      memory_size_ += size;
      DCHECK_EQ(0, memory_size_ % kMinAllocationSize);

      end_ptr_ = static_cast<void*>(static_cast<char*>(end_ptr_) + size);
      const size_t n_handles =
          (memory_size_ + kMinAllocationSize - 1) / kMinAllocationSize;
      handles_.resize(n_handles, kInvalidChunkHandle);
    }
    ChunkHandle get_handle(const void* p) const {
      return handles_[IndexFor(p)];
    }
    void set_handle(const void* p, ChunkHandle h) { handles_[IndexFor(p)] = h; }
    void erase(const void* p) { set_handle(p, kInvalidChunkHandle); }

   private:
    void Swap(AllocationRegion* other) {
      std::swap(ptr_, other->ptr_);
      std::swap(memory_size_, other->memory_size_);
      std::swap(end_ptr_, other->end_ptr_);
      std::swap(handles_, other->handles_);
    }

    size_t IndexFor(const void* p) const {
      std::uintptr_t p_int = reinterpret_cast<std::uintptr_t>(p);
      std::uintptr_t base_int = reinterpret_cast<std::uintptr_t>(ptr_);
      DCHECK_GE(p_int, base_int);
      DCHECK_LT(p_int, base_int + memory_size_);
      return static_cast<size_t>(((p_int - base_int) >> kMinAllocationBits));
    }

    // Metadata about the allocation region.
    void* ptr_ = nullptr;
    size_t memory_size_ = 0;
    void* end_ptr_ = nullptr;

    // Array of size "memory_size / kMinAllocationSize". It is indexed by
    // (p - base) / kMinAllocationSize and contains the ChunkHandle for the
    // memory allocation represented by "p".
    std::vector<ChunkHandle> handles_;
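    // For example, with kMinAllocationSize = 256, a pointer 1024 bytes past
    // ptr_ maps to handles_[1024 >> 8] == handles_[4].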

    TF_DISALLOW_COPY_AND_ASSIGN(AllocationRegion);
  };

  // RegionManager aggregates one or more "AllocationRegions" and provides
  // a layer of indirection from pointers to the underlying ChunkHandle,
  // allowing allocation across multiple discontiguous memory regions.
  //
  // This class is thread-compatible.
  class RegionManager {
   public:
    RegionManager() {}
    ~RegionManager() {}

    void AddAllocationRegion(void* ptr, size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
    }

    // Adds an allocation region for the given ptr and size, potentially
    // extending a region if ptr matches the end_ptr of an existing region.
    // If a region is extended, returns a pointer to the extended region so
    // that the BFC allocator can reason about chunkification.
    AllocationRegion* AddOrExtendAllocationRegion(void* ptr,
                                                  size_t memory_size) {
      // Insert sorted by end_ptr.
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), ptr, &Comparator);
      // Check if can be coalesced with preceding region.
      if (entry != regions_.begin()) {
        auto preceding_region = entry - 1;
        if (preceding_region->end_ptr() == ptr) {
          if (VLOG_IS_ON(1)) {
            LOG(INFO) << "Extending region " << preceding_region->ptr()
                      << " of "
                      << strings::HumanReadableNumBytes(
                             preceding_region->memory_size())
                      << " by " << strings::HumanReadableNumBytes(memory_size)
                      << " bytes";
          }
          preceding_region->extend(memory_size);
          return &*preceding_region;
        }
      }
      VLOG(1) << "Inserting new region " << ptr << " of "
              << strings::HumanReadableNumBytes(memory_size);
      regions_.insert(entry, AllocationRegion(ptr, memory_size));
      return nullptr;
    }

    std::vector<AllocationRegion>::iterator RemoveAllocationRegion(
        std::vector<AllocationRegion>::iterator it) {
      return regions_.erase(it);
    }

    ChunkHandle get_handle(const void* p) const {
      return RegionFor(p)->get_handle(p);
    }

    void set_handle(const void* p, ChunkHandle h) {
      return MutableRegionFor(p)->set_handle(p, h);
    }
    void erase(const void* p) { return MutableRegionFor(p)->erase(p); }

    const std::vector<AllocationRegion>& regions() const { return regions_; }

   private:
    static bool Comparator(const void* ptr, const AllocationRegion& other) {
      return ptr < other.end_ptr();
    }
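    // Note: std::upper_bound with this comparator finds the first region
    // whose end_ptr lies strictly beyond the query pointer; since regions are
    // disjoint and sorted by end_ptr, that is exactly the region containing
    // the pointer when one exists (see RegionFor() below).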

    AllocationRegion* MutableRegionFor(const void* p) {
      return const_cast<AllocationRegion*>(RegionFor(p));
    }

    const AllocationRegion* RegionFor(const void* p) const {
      auto entry =
          std::upper_bound(regions_.begin(), regions_.end(), p, &Comparator);

      if (entry != regions_.end()) {
        return &(*entry);
      }

      LOG(FATAL) << "Could not find Region for " << p;
      return nullptr;
    }

    std::vector<AllocationRegion> regions_;
  };

  // Returns 'bytes' rounded up to the next multiple of kMinAllocationSize.
  static size_t RoundedBytes(size_t bytes);
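  // For example, RoundedBytes(1) == 256 and RoundedBytes(300) == 512.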

  // Try to add a new memory region that can satisfy an allocation of
  // 'rounded_bytes' bytes. Returns true on success and false on
  // failure.
  bool Extend(size_t alignment, size_t rounded_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Deallocates free regions to give memory back to the suballocator, so that
  // we can re-allocate a larger region. This is mainly used when an OOM
  // happens but there are free regions, and the sum of the free-region sizes
  // and unallocated bytes is larger than the requested size, which implies
  // (external) memory fragmentation. Returns true if any free regions are
  // found and freed; false otherwise.
  bool DeallocateFreeRegions(size_t rounded_bytes);

  // Helper function to deallocate regions.
  void DeallocateRegions(const absl::flat_hash_set<void*>& region_ptrs)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Returns a pointer to an underlying allocated chunk of size
  // 'rounded_bytes'.
  void* FindChunkPtr(BinNum bin_num, size_t rounded_bytes, size_t num_bytes,
                     uint64 freed_before) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Splits the chunk specified by 'h' into two chunks, one at least
  // of size 'num_bytes'.
  void SplitChunk(ChunkHandle h, size_t num_bytes)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Merges the two chunk handles. Requires that the chunks are
  // contiguous in their allocation.
  void Merge(ChunkHandle h, ChunkHandle h2) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Adds the chunk 'h' to the proper free bin.
  void InsertFreeChunkIntoBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the free chunk pointed to by 'c' from the set free_chunks.
  void RemoveFreeChunkIterFromBin(Bin::FreeChunkSet* free_chunks,
                                  const Bin::FreeChunkSet::iterator& c)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes a free chunk from the bin.
  void RemoveFreeChunkFromBin(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void MaybeRemoveFreeChunkFromBin(ChunkHandle h)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Removes the chunk metadata represented by 'h'.
  void DeleteChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  string RenderOccupancy() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DumpMemoryLog(size_t num_bytes) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  MemoryDump RecordMemoryMapInternal() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void MaybeWriteMemoryMap() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle AllocateChunk() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  void DeallocateChunk(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  Chunk* ChunkFromHandle(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
  const Chunk* ChunkFromHandle(ChunkHandle h) const
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  void MarkFree(ChunkHandle h) TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  ChunkHandle TryToCoalesce(ChunkHandle h, bool ignore_freed_at)
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  // Fragmentation is calculated as one minus the ratio of the largest free
  // chunk size to total free memory; the returned value is within [0, 1].
  double GetFragmentation() TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);
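  // For example, if total free memory is 100MB and the largest free chunk is
  // 80MB, fragmentation is 1 - 80/100 = 0.2.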

  // Information about a Bin that is useful for debugging.
  struct BinDebugInfo {
    size_t total_bytes_in_use = 0;
    size_t total_bytes_in_bin = 0;
    size_t total_requested_bytes_in_use = 0;
    size_t total_chunks_in_use = 0;
    size_t total_chunks_in_bin = 0;
  };

  // Computes and returns a BinDebugInfo for each Bin.
  std::array<BinDebugInfo, kNumBins> get_bin_debug_info()
      TF_EXCLUSIVE_LOCKS_REQUIRED(lock_);

  AllocatorRetry retry_helper_;

  // Structures immutable after construction.
  size_t memory_limit_ = 0;

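  // Returns floor(log2(n)) for n > 0; a portable fallback for when no
  // compiler intrinsic is available (see Log2FloorNonZero below).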
  inline int Log2FloorNonZeroSlow(uint64 n) {
    int r = 0;
    while (n > 0) {
      r++;
      n >>= 1;
    }
    return r - 1;
  }

  // Returns floor(log2(n)).
  inline int Log2FloorNonZero(uint64 n) {
#if defined(__GNUC__)
    return 63 ^ __builtin_clzll(n);
#elif defined(PLATFORM_WINDOWS) && (_WIN64)
    unsigned long index;
    _BitScanReverse64(&index, n);
    return index;
#else
    return Log2FloorNonZeroSlow(n);
#endif
  }

  // Map from bin size to Bin.
  Bin* BinFromIndex(BinNum index) {
    return reinterpret_cast<Bin*>(&(bins_space_[index * sizeof(Bin)]));
  }
  size_t BinNumToSize(BinNum index) {
    return static_cast<size_t>(256) << index;
  }
  BinNum BinNumForSize(size_t bytes) {
    uint64 v = std::max<size_t>(bytes, 256) >> kMinAllocationBits;
    int b = std::min(kNumBins - 1, Log2FloorNonZero(v));
    return b;
  }
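  // For example, a 1MB request gives v = (1 << 20) >> 8 = 4096, and
  // Log2FloorNonZero(4096) == 12, so it maps to bin 12 (bin size 1MB).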
  Bin* BinForSize(size_t bytes) { return BinFromIndex(BinNumForSize(bytes)); }

  char bins_space_[sizeof(Bin) * kNumBins];

  const Options opts_;

  // The size of the current region allocation.
  size_t curr_region_allocation_bytes_;

  // The total number of allocated bytes by the allocator.
  size_t total_region_allocated_bytes_ = 0;

  // An indicator that expansion of a region has hit the limits
  // of the available memory.
  bool started_backpedal_ = false;

  // Whether the allocator will coalesce adjacent sub-allocator-provided
  // AllocationRegions. This may be disabled if discrete sub-allocator
  // regions can't be treated as contiguous (e.g. if the allocation refers to
  // device-visible memory which is not adjacent to the other region in the
  // device's address space).
  const bool coalesce_regions_;

  std::unique_ptr<SubAllocator> sub_allocator_;
  string name_;
  SharedCounter* timing_counter_ = nullptr;
  std::deque<ChunkHandle> timestamped_chunks_;

  std::atomic<uint64> safe_frontier_ = {0};

  // Structures mutable after construction.
  mutable mutex lock_;
  RegionManager region_manager_ TF_GUARDED_BY(lock_);

  std::vector<Chunk> chunks_ TF_GUARDED_BY(lock_);

  // Pointer to head of linked list of free Chunks.
  ChunkHandle free_chunks_list_ TF_GUARDED_BY(lock_);

  // Counter containing the next unique identifier to assign to a
  // newly-created chunk.
  int64_t next_allocation_id_ TF_GUARDED_BY(lock_);

  // Stats.
  AllocatorStats stats_ TF_GUARDED_BY(lock_);
#ifdef TENSORFLOW_MEM_DEBUG
  int64 action_counter_ TF_GUARDED_BY(lock_) = 0;
#define MEM_DEBUG_SIZE_HISTORY_SIZE 4096
  int64 size_history_[MEM_DEBUG_SIZE_HISTORY_SIZE];
#endif

  friend class GPUBFCAllocatorPrivateMethodsTest;
  friend class GPUBFCAllocatorPrivateMethodsTest_SubAllocatorSpecific;
  TF_DISALLOW_COPY_AND_ASSIGN(BFCAllocator);
};

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_COMMON_RUNTIME_BFC_ALLOCATOR_H_