/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#include "tensorflow/core/common_runtime/bfc_allocator.h"

#include <algorithm>
#include <atomic>
#include <utility>

#include "absl/strings/string_view.h"
#include "tensorflow/core/common_runtime/allocator_retry.h"
#include "tensorflow/core/lib/core/bits.h"
#include "tensorflow/core/lib/strings/numbers.h"
#include "tensorflow/core/lib/strings/str_util.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/file_system.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#ifdef TENSORFLOW_MEM_DEBUG
#include "tensorflow/core/platform/stacktrace.h"
#endif
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h"
#include "tensorflow/core/profiler/lib/traceme.h"
#include "tensorflow/core/protobuf/bfc_memory_map.pb.h"

namespace tensorflow {

constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle;

BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator,
                           size_t total_memory, const string& name,
                           const Options& opts)
    : opts_(opts),
      coalesce_regions_(sub_allocator->SupportsCoalescing()),
      sub_allocator_(std::move(sub_allocator)),
      name_(name),
      free_chunks_list_(kInvalidChunkHandle),
      next_allocation_id_(1) {
  if (opts.allow_growth) {
    // 2MiB smallest initial allocation, unless total memory available
    // is less.
    curr_region_allocation_bytes_ =
        RoundedBytes(std::min(total_memory, size_t{2 << 20}));
  } else {
    curr_region_allocation_bytes_ = RoundedBytes(total_memory);
  }

  // Allocate the requested amount of memory.
  memory_limit_ = total_memory;
  stats_.bytes_limit = static_cast<int64_t>(total_memory);

  // Create a bunch of bins of various good sizes.

  // We create bins to fit all possible ranges that cover the
  // memory_limit_ starting from allocations up to 256 bytes to
  // allocations up to (and including) the memory limit.
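  // Each bin b holds free chunks with sizes in
  // [BinNumToSize(b), 2 * BinNumToSize(b)), so bin sizes double from one bin
  // to the next (e.g. 256, 512, 1024, ... assuming a 256-byte smallest bin);
  // the CHECKs below verify these boundaries.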
  VLOG(1) << "Creating new BFCAllocator named: " << name;
  for (BinNum b = 0; b < kNumBins; b++) {
    size_t bin_size = BinNumToSize(b);
    VLOG(1) << "Creating bin of max chunk size "
            << strings::HumanReadableNumBytes(bin_size);
    new (BinFromIndex(b)) Bin(this, bin_size);
    CHECK_EQ(BinForSize(bin_size), BinFromIndex(b));
    CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b));
    CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b));
    if (b + 1 < kNumBins) {
      CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b));
    }
  }
}

BFCAllocator::~BFCAllocator() {
  // Return memory back.
  VLOG(2) << "Number of regions allocated: "
          << region_manager_.regions().size();
  for (const auto& region : region_manager_.regions()) {
    sub_allocator_->Free(region.ptr(), region.memory_size());
  }

  for (BinNum b = 0; b < kNumBins; b++) {
    BinFromIndex(b)->~Bin();
  }
}

BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) {
  DCHECK_GE(h, 0);
  DCHECK_LT(h, static_cast<int>(chunks_.size()));
  return &(chunks_[h]);
}

const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const {
  DCHECK_GE(h, 0);
  DCHECK_LT(h, static_cast<int>(chunks_.size()));
  return &(chunks_[h]);
}

bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) {
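  // Note: every successful extension doubles curr_region_allocation_bytes_,
  // so under allow_growth the regions requested from the sub-allocator grow
  // geometrically (roughly 2MiB, 4MiB, 8MiB, ... with the default 2MiB
  // starting size) rather than reserving memory_limit_ up front.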
  size_t available_bytes = memory_limit_ - total_region_allocated_bytes_;
  // Rounds available_bytes down to the nearest multiple of kMinAllocationSize.
  available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize;

  // Do we have enough space to handle the client's request?
  // If not, fail immediately.
  if (rounded_bytes > available_bytes) {
    return false;
  }

  // If curr_region_allocation_bytes_ is not enough to satisfy the
  // allocation, keep multiplying by a power of two until that is
  // sufficient.
  bool increased_allocation = false;
  while (rounded_bytes > curr_region_allocation_bytes_) {
    curr_region_allocation_bytes_ *= 2;
    increased_allocation = true;
  }

  // Try allocating.
  size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes);
  size_t bytes_received;
  void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
  if (mem_addr == nullptr && !started_backpedal_) {
    // Only backpedal once.
    started_backpedal_ = true;

    static constexpr float kBackpedalFactor = 0.9;

    // Try allocating less memory.
    while (mem_addr == nullptr) {
      bytes = RoundedBytes(bytes * kBackpedalFactor);
      if (bytes < rounded_bytes) break;
      mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received);
    }
  }

  if (mem_addr == nullptr) {
    return false;
  }

  if (!increased_allocation) {
    // Increase the region size of the next required allocation.
    curr_region_allocation_bytes_ *= 2;
  }

  VLOG(1) << "Extending allocation by "
          << strings::HumanReadableNumBytes(bytes_received) << " bytes for "
          << Name() << ".";

  total_region_allocated_bytes_ += bytes_received;
  VLOG(1) << "Total allocated bytes: "
          << strings::HumanReadableNumBytes(total_region_allocated_bytes_);

  VLOG(1) << "Allocated memory at " << mem_addr << " to "
          << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received);

  AllocationRegion* maybe_extended_region = nullptr;
  if (coalesce_regions_) {
    maybe_extended_region =
        region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received);
  } else {
    region_manager_.AddAllocationRegion(mem_addr, bytes_received);
  }

  // Create one large chunk for the whole memory space that will
  // be chunked later.
  ChunkHandle h = AllocateChunk();
  BFCAllocator::Chunk* c = ChunkFromHandle(h);
  c->ptr = mem_addr;
  c->size = bytes_received;
  c->allocation_id = -1;
  c->prev = kInvalidChunkHandle;
  c->next = kInvalidChunkHandle;
  c->freed_at_count = 0;

  region_manager_.set_handle(c->ptr, h);

  // If the region was extended, then there exists a previous chunk that should
  // be linked to the new chunk.
  if (maybe_extended_region != nullptr) {
    ChunkHandle prev =
        maybe_extended_region->get_handle(maybe_extended_region->ptr());
    BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev);
    // Find the last recorded chunk in the extended region.
    while (prev_chunk->next != kInvalidChunkHandle) {
      prev = prev_chunk->next;
      prev_chunk = ChunkFromHandle(prev);
    }
    c->prev = prev;
    prev_chunk->next = h;
  }

  // Maybe merge adjacent chunks and insert the chunk into the right bin.
  InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false));

  return true;
}

BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() {
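  // Chunk descriptors are recycled through a singly linked free list threaded
  // through the Chunk::next field; chunks_ only grows when that list is empty.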
  if (free_chunks_list_ != kInvalidChunkHandle) {
    ChunkHandle h = free_chunks_list_;
    Chunk* c = ChunkFromHandle(h);
    free_chunks_list_ = c->next;
    return h;
  } else {
    ChunkHandle h = chunks_.size();
    chunks_.resize(h + 1);
    return h;
  }
}

void BFCAllocator::DeallocateChunk(ChunkHandle h) {
  Chunk* c = ChunkFromHandle(h);
  c->allocation_id = -1;
  c->bin_num = kInvalidBinNum;
  c->next = free_chunks_list_;
  free_chunks_list_ = h;
}

void* BFCAllocator::AllocateRawInternalWithRetry(
    size_t unused_alignment, size_t num_bytes,
    const AllocationAttributes& allocation_attr) {
  // Fast path: Try once to allocate without getting the retry_helper_ involved
  uint64 freed_by_count = 0;
  if (allocation_attr.freed_by_func != nullptr) {
    freed_by_count = (*allocation_attr.freed_by_func)();
  }
  void* r =
      AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count);
  if (r != nullptr) {
    return r;
  } else {
    static const int64_t kMaxMillisToWait = 10000;  // 10 seconds
    r = retry_helper_.AllocateRaw(
        [this, &allocation_attr](size_t a, size_t nb, bool v) {
          uint64 freed_by_count = 0;
          if (allocation_attr.freed_by_func != nullptr) {
            freed_by_count = (*allocation_attr.freed_by_func)();
          }
          return AllocateRawInternal(a, nb, v, freed_by_count);
        },
        kMaxMillisToWait, unused_alignment, num_bytes);
    return r;
  }
}

void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes,
                                const AllocationAttributes& allocation_attr) {
  VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes;
  void* result = [&] {
    if (!opts_.allow_retry_on_failure || !allocation_attr.retry_on_failure) {
      // If we have globally disabled retry-on-failure and fail to allocate an
      // "important" alloc, we want to print a log, because the program may be
      // about to fail due to OOM.
      //
      // Bit of a hack: We deem "important" allocs as those which are retryable.
      // In TF, *non*-retryable allocations are usually those which we can
      // tolerate failing.  For example, we allocate convolution scratch memory
      // as non-retryable; if it fails, we'll just use a fallback algorithm that
      // uses no scratch.
      static std::atomic<int32> log_counter{0};
      constexpr int kMaxFailureLogs = 10;
      bool dump_log_on_failure =
          (/*retry is globally disabled*/ !opts_.allow_retry_on_failure &&
           /*alloc is "important"*/ allocation_attr.retry_on_failure &&
           log_counter.load(std::memory_order_relaxed) < kMaxFailureLogs) ||
          VLOG_IS_ON(2);

      uint64 freed_by_count = 0;
      if (allocation_attr.freed_by_func != nullptr) {
        freed_by_count = (*allocation_attr.freed_by_func)();
      }
      void* res = AllocateRawInternal(unused_alignment, num_bytes,
                                      dump_log_on_failure, freed_by_count);
      if (res == nullptr) {
        int32 counter_value = log_counter.load(std::memory_order_relaxed);
        if (counter_value < kMaxFailureLogs) {
          log_counter.store(counter_value + 1, std::memory_order_relaxed);
          LOG(WARNING)
              << "Allocator (" << Name() << ") ran out of memory trying "
              << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
              << " with freed_by_count=" << freed_by_count << "."
              << (!allocation_attr.retry_on_failure
                      ? " The caller indicates that this is not a failure, but"
                        " this may mean that there could be performance gains "
                        "if more memory were available."
                      : "");
        }
      }
      return res;
    } else {
      return AllocateRawInternalWithRetry(unused_alignment, num_bytes,
                                          allocation_attr);
    }
  }();
  VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes << " " << result;
  return result;
}

// static
size_t BFCAllocator::RoundedBytes(size_t bytes) {
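  // Rounds up to the next multiple of kMinAllocationSize; e.g. assuming the
  // usual 256-byte minimum allocation, RoundedBytes(1) == 256 and
  // RoundedBytes(257) == 512.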
  size_t rounded_bytes =
      (kMinAllocationSize *
       ((bytes + kMinAllocationSize - 1) / kMinAllocationSize));
  DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize);
  return rounded_bytes;
}

bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes)
    TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
  // Do nothing if garbage collection is off.
  if (!opts_.garbage_collection) {
    return false;
  }

  // Searching for free regions.
  absl::flat_hash_set<void*> free_region_ptrs;
  size_t total_free_bytes = 0;
  for (const AllocationRegion& region : region_manager_.regions()) {
    ChunkHandle h = region_manager_.get_handle(region.ptr());
    bool any_use = false;
    while (h != kInvalidChunkHandle) {
      const Chunk* c = ChunkFromHandle(h);
      if (c->in_use()) {
        any_use = true;
        break;
      }
      h = c->next;
    }

    if (!any_use) {
      VLOG(2) << "Found free region with ptr = " << region.ptr();
      free_region_ptrs.insert(region.ptr());
      total_free_bytes += region.memory_size();
    }
  }

  if (total_free_bytes == 0) {
    return false;
  }

  // Rough estimation to check whether deallocation can help.
  size_t available_bytes =
      memory_limit_ - total_region_allocated_bytes_ + total_free_bytes;
  if (rounded_bytes > available_bytes) {
    return false;
  }

  LOG(WARNING) << "Garbage collection: deallocate free memory regions"
               << " (i.e., allocations) so that we can re-allocate a larger"
               << " region to avoid OOM due to memory fragmentation. If you"
               << " see this message frequently, you are running near the"
               << " threshold of the available device memory and re-allocation"
               << " may incur great performance overhead. You may try smaller"
               << " batch sizes to observe the performance impact."
               << " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
               << " disable this feature.";

  // Deallocate free regions.
  DeallocateRegions(free_region_ptrs);

  return true;
}

void BFCAllocator::DeallocateRegions(
    const absl::flat_hash_set<void*>& region_ptrs)
    TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) {
  // Explicitly remove the const qualifier as some compilers disallow passing
  // const_iterator to std::vector::erase(), which is used in
  // RemoveAllocationRegion().
  auto regions =
      const_cast<std::vector<AllocationRegion>*>(&region_manager_.regions());
  auto it = regions->begin();
  while (it != regions->end()) {
    if (!region_ptrs.contains(it->ptr())) {
      ++it;
      continue;
    }

    VLOG(2) << "Deallocate region with ptr = " << it->ptr();
    // Remove all chunk registrations from Bins.
    ChunkHandle h = region_manager_.get_handle(it->ptr());
    while (h != kInvalidChunkHandle) {
      const Chunk* c = ChunkFromHandle(h);
      if (c->bin_num != kInvalidBinNum) {
        RemoveFreeChunkFromBin(h);
      }
      auto h_to_delete = h;
      h = c->next;
      DeleteChunk(h_to_delete);
    }

    // Deallocate the memory.
    sub_allocator_->Free(it->ptr(), it->memory_size());
    total_region_allocated_bytes_ -= it->memory_size();
    it = region_manager_.RemoveAllocationRegion(it);
  }
}

void* BFCAllocator::AllocateRawInternal(size_t unused_alignment,
                                        size_t num_bytes,
                                        bool dump_log_on_failure,
                                        uint64 freed_before) {
  if (num_bytes == 0) {
    VLOG(2) << "tried to allocate 0 bytes";
    return nullptr;
  }
  // First, always allocate memory of at least kMinAllocationSize
  // bytes, and always allocate multiples of kMinAllocationSize bytes
  // so all memory addresses are nicely byte aligned.
  size_t rounded_bytes = RoundedBytes(num_bytes);

  // The BFC allocator tries to find the best fit first.
  BinNum bin_num = BinNumForSize(rounded_bytes);

  mutex_lock l(lock_);
  if (!timestamped_chunks_.empty()) {
    // Merge timestamped chunks whose counts have become safe for general use.
    MergeTimestampedChunks(0);
  }
  void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
  if (ptr != nullptr) {
    AddTraceMe("MemoryAllocation", ptr);
    return ptr;
  }

  // Try to extend
  if (Extend(unused_alignment, rounded_bytes)) {
    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
    if (ptr != nullptr) {
      AddTraceMe("MemoryAllocation", ptr);
      return ptr;
    }
  }

  if ((freed_before == 0) && (!timestamped_chunks_.empty())) {
    // We're unable to satisfy an allocation request without a specific
    // timestamp requirement. Rather than fail, try merging any held-out
    // timestamped chunks more aggressively until a free chunk of the necessary
    // size is formed.
    if (MergeTimestampedChunks(rounded_bytes)) {
      ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
      if (ptr != nullptr) {
        AddTraceMe("MemoryAllocation", ptr);
        return ptr;
      }
    }
  }

  // Reaching this point means that no chunks can satisfy the request. Also,
  // the unallocated bytes cannot satisfy the request. Before giving up, let's
  // try deallocating free regions so that suballocator can combine them with
  // the unallocated bytes and form a larger region.
  if (DeallocateFreeRegions(rounded_bytes) &&
      Extend(unused_alignment, rounded_bytes)) {
    ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before);
    if (ptr != nullptr) {
      AddTraceMe("MemoryAllocation", ptr);
      return ptr;
    }
  }

  // We searched all bins for an existing free chunk to use and
  // couldn't find one. This means we must have run out of memory.
  // Dump the memory log for analysis.
  MaybeWriteMemoryMap();
  if (dump_log_on_failure) {
    LOG(WARNING)
        << "Allocator (" << Name() << ") ran out of memory trying "
        << "to allocate " << strings::HumanReadableNumBytes(num_bytes)
        << " (rounded to " << rounded_bytes << ")"
        << " requested by op "
        << profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation()
               .pending_op_name
        << "\nIf the cause is memory fragmentation maybe the environment "
        << "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
        << "improve the situation. \nCurrent allocation summary follows.";
    DumpMemoryLog(rounded_bytes);
    LOG(WARNING) << RenderOccupancy();
  }
  return nullptr;
}

int64_t BFCAllocator::LargestFreeChunk() {
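  // Bins are scanned from largest to smallest; free_chunks is kept ordered by
  // chunk size, so the last element of the first non-empty bin is the largest
  // free chunk overall.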
  for (int i = kNumBins - 1; i >= 0; i--) {
    if (!BinFromIndex(i)->free_chunks.empty()) {
      return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size;
    }
  }
  return 0;
}

double BFCAllocator::GetFragmentation() {
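  // Reported as 1 - (largest free chunk / total free bytes): 0 when all free
  // memory is contiguous, approaching 1 as free space splinters into many
  // small chunks.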
  int64_t bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use;
  DCHECK_GT(bytes_available, 0);
  return static_cast<double>(bytes_available - LargestFreeChunk()) /
         bytes_available;
}

void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) {
  BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr));
  AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size);
}

void BFCAllocator::AddTraceMe(absl::string_view traceme_name,
                              const void* chunk_ptr, int64_t req_bytes,
                              int64_t alloc_bytes) {
  tensorflow::profiler::TraceMe::InstantActivity(
      [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]()
          TF_NO_THREAD_SAFETY_ANALYSIS {
            int64_t bytes_available =
                memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use;
            const auto& annotation =
                profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation();
            const auto op_name = annotation.pending_op_name
                                     ? annotation.pending_op_name
                                     : "(null)";
            const auto region_type = annotation.pending_region_type
                                         ? annotation.pending_region_type
                                         : "(null)";
            return tensorflow::profiler::TraceMeEncode(
                traceme_name, {{"allocator_name", name_},
                               {"bytes_reserved", stats_.bytes_reserved},
                               {"bytes_allocated", stats_.bytes_in_use},
                               {"bytes_available", bytes_available},
                               {"fragmentation", GetFragmentation()},
                               {"peak_bytes_in_use", stats_.peak_bytes_in_use},
                               {"requested_bytes", req_bytes},
                               {"allocation_bytes", alloc_bytes},
                               {"addr", reinterpret_cast<uint64>(chunk_ptr)},
                               {"tf_op", op_name},
                               {"id", annotation.pending_step_id},
                               {"region_type", region_type},
                               {"data_type", annotation.pending_data_type},
                               {"shape", annotation.pending_shape_func()}});
          },
      /*level=*/profiler::TraceMeLevel::kInfo);
}

void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes,
                                 size_t num_bytes, uint64 freed_before) {
  // First identify the first bin that could satisfy rounded_bytes.
  for (; bin_num < kNumBins; bin_num++) {
    // Start searching from the first bin for the smallest chunk that fits
    // rounded_bytes.
    Bin* b = BinFromIndex(bin_num);
    for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end();
         ++citer) {
      const BFCAllocator::ChunkHandle h = (*citer);
      BFCAllocator::Chunk* chunk = ChunkFromHandle(h);
      DCHECK(!chunk->in_use());
      if (freed_before > 0 && freed_before < chunk->freed_at_count) {
        continue;
      }
      if (chunk->size >= rounded_bytes) {
        // We found an existing chunk that fits us that wasn't in use, so remove
        // it from the free bin structure prior to using.
        RemoveFreeChunkIterFromBin(&b->free_chunks, citer);

        // If we can break the size of the chunk into two reasonably large
        // pieces, do so.  In any case don't waste more than
        // max_internal_fragmentation_bytes on padding.  If this threshold is
        // not set by the user, then use 128MB as the default.
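        // For example, with the 128MB default a 512-byte request taken from a
        // 2KB chunk is split (2KB >= 2 * 512 bytes), while the same request
        // taken from a 768-byte chunk is not, leaving 256 bytes of padding.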
        const int64_t max_internal_fragmentation_bytes =
            (opts_.fragmentation_fraction > 0.0)
                ? opts_.fragmentation_fraction * memory_limit_
                : 128 << 20;

        if (chunk->size >= rounded_bytes * 2 ||
            static_cast<int64_t>(chunk->size) - rounded_bytes >=
                max_internal_fragmentation_bytes) {
          SplitChunk(h, rounded_bytes);
          chunk = ChunkFromHandle(h);  // Update chunk pointer in case it moved
        }

        // The requested size of the returned chunk is what the user
        // has allocated.
        chunk->requested_size = num_bytes;
        // Assign a unique id and increment the id counter, marking the
        // chunk as being in use.
        chunk->allocation_id = next_allocation_id_++;

        // Update stats.
        ++stats_.num_allocs;
        stats_.bytes_in_use += chunk->size;
        if (stats_.bytes_in_use > stats_.peak_bytes_in_use) {
          VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use
                  << " bytes for " << Name();
        }
        stats_.peak_bytes_in_use =
            std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use);
        stats_.largest_alloc_size =
            std::max<std::size_t>(stats_.largest_alloc_size, chunk->size);

#ifdef TENSORFLOW_MEM_DEBUG
        if (ShouldRecordOpName()) {
          const auto& annotation =
              profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation();
          if (annotation.pending_op_name != nullptr) {
            chunk->op_name = annotation.pending_op_name;
          } else {
            LOG(INFO) << "missing pending_op_name for " << Name()
                      << " reading addr "
                      << static_cast<const void*>(&annotation.pending_op_name)
                      << "\n"
                      << CurrentStackTrace();
            chunk->op_name = nullptr;
          }
          chunk->action_count = ++action_counter_;
          chunk->step_id = annotation.pending_step_id;
          int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
          size_history_[slot] = stats_.bytes_in_use;
        }
#endif

        VLOG(4) << "Returning: " << chunk->ptr;
        if (VLOG_IS_ON(4)) {
          LOG(INFO) << "A: " << RenderOccupancy();
        }
        return chunk->ptr;
      }
    }
  }

  return nullptr;
}

void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) {
  // Allocate the new chunk before we do any ChunkFromHandle
  ChunkHandle h_new_chunk = AllocateChunk();

  Chunk* c = ChunkFromHandle(h);
  CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));

  // Create a new chunk starting num_bytes after c
  BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk);
  new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes);
  region_manager_.set_handle(new_chunk->ptr, h_new_chunk);

  // Set the new sizes of the chunks.
  new_chunk->size = c->size - num_bytes;
  c->size = num_bytes;

  // The new chunk is not in use.
  new_chunk->allocation_id = -1;

  // It inherits the freed time.
  new_chunk->freed_at_count = c->freed_at_count;

  // Maintain the pointers.
  // c <-> c_neighbor becomes
  // c <-> new_chunk <-> c_neighbor
  BFCAllocator::ChunkHandle h_neighbor = c->next;
  new_chunk->prev = h;
  new_chunk->next = h_neighbor;
  c->next = h_new_chunk;
  if (h_neighbor != kInvalidChunkHandle) {
    Chunk* c_neighbor = ChunkFromHandle(h_neighbor);
    c_neighbor->prev = h_new_chunk;
  }

  // Add the newly free chunk to the free bin.
  InsertFreeChunkIntoBin(h_new_chunk);
}

void BFCAllocator::DeallocateRaw(void* ptr) {
  VLOG(3) << "DeallocateRaw " << Name() << " "
          << (ptr ? RequestedSize(ptr) : 0);
  DeallocateRawInternal(ptr);
  retry_helper_.NotifyDealloc();
}

void BFCAllocator::DeallocateRawInternal(void* ptr) {
  if (ptr == nullptr) {
    VLOG(2) << "tried to deallocate nullptr";
    return;
  }
  mutex_lock l(lock_);

  // Find the chunk from the ptr.
  BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
  CHECK(h != kInvalidChunkHandle);
  // Record chunk information before it's freed.
  Chunk* chunk = ChunkFromHandle(h);
  void* chunk_ptr = chunk->ptr;
  int64_t req_bytes = chunk->requested_size;
  int64_t alloc_bytes = chunk->size;

  MarkFree(h);

  // Consider coalescing it.
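  // If a timing counter is in use the chunk keeps a nonzero freed_at_count,
  // so coalescing is deferred until MergeTimestampedChunks() observes that the
  // safe frontier has passed that count; otherwise coalesce eagerly.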
  if (timing_counter_) {
    InsertFreeChunkIntoBin(h);
    timestamped_chunks_.push_back(h);
  } else {
    InsertFreeChunkIntoBin(TryToCoalesce(h, false));
  }

  // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for
  // correct aggregation stats (bytes_in_use, fragmentation).
  AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);

  if (VLOG_IS_ON(4)) {
    LOG(INFO) << "F: " << RenderOccupancy();
  }
}

// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
// We merge Chunk(h2) into Chunk(h1).
void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1,
                         BFCAllocator::ChunkHandle h2) {
  Chunk* c1 = ChunkFromHandle(h1);
  Chunk* c2 = ChunkFromHandle(h2);
  // We can only merge chunks that are not in use.
  CHECK(!c1->in_use() && !c2->in_use());

  // c1's prev doesn't change, still points to the same ptr, and is
  // still not in use.

  // Fix up neighbor pointers
  //
  // c1 <-> c2 <-> c3 should become
  // c1 <-> c3

  BFCAllocator::ChunkHandle h3 = c2->next;
  c1->next = h3;
  CHECK(c2->prev == h1);
  if (h3 != kInvalidChunkHandle) {
    BFCAllocator::Chunk* c3 = ChunkFromHandle(h3);
    c3->prev = h1;
  }

  // Set the new size
  c1->size += c2->size;

  // Pick latest free time.
  c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count);

  DeleteChunk(h2);
}

void BFCAllocator::DeleteChunk(ChunkHandle h) {
  // Delete h and cleanup all state
  Chunk* c = ChunkFromHandle(h);
  // VLOG(4) << "Removing: " << c->ptr;
  region_manager_.erase(c->ptr);
  DeallocateChunk(h);
}

void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) {
  Chunk* c = ChunkFromHandle(h);
  CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum));
  BinNum bin_num = BinNumForSize(c->size);
  Bin* new_bin = BinFromIndex(bin_num);
  c->bin_num = bin_num;
  new_bin->free_chunks.insert(h);
}

void BFCAllocator::RemoveFreeChunkIterFromBin(
    BFCAllocator::Bin::FreeChunkSet* free_chunks,
    const BFCAllocator::Bin::FreeChunkSet::iterator& citer) {
  ChunkHandle h = *citer;
  Chunk* c = ChunkFromHandle(h);
  CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
  free_chunks->erase(citer);
  c->bin_num = kInvalidBinNum;
}

void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) {
  Chunk* c = ChunkFromHandle(h);
  CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum));
  CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0)
      << "Could not find chunk in bin";
  c->bin_num = kInvalidBinNum;
}

void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) {
  Chunk* c = ChunkFromHandle(h);
  CHECK(c->in_use() && (c->bin_num == kInvalidBinNum));

  // Mark the chunk as no longer in use.
  c->allocation_id = -1;

  // Optionally record the free time.
  if (timing_counter_) {
    c->freed_at_count = timing_counter_->next();
  }

  // Updates the stats.
  stats_.bytes_in_use -= c->size;

#ifdef TENSORFLOW_MEM_DEBUG
  if (ShouldRecordOpName()) {
    c->action_count = ++action_counter_;
    int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE;
    size_history_[slot] = stats_.bytes_in_use;
  }
#endif
}

BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h,
                                                      bool ignore_freed_at) {
  Chunk* c = ChunkFromHandle(h);
  if ((!ignore_freed_at) && c->freed_at_count > 0) return h;
  ChunkHandle coalesced_chunk = h;

  // If the next chunk is free, merge it into c and delete it.
  if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) {
    Chunk* n = ChunkFromHandle(c->next);
    if ((n->freed_at_count == 0) || ignore_freed_at) {
      VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr;
      RemoveFreeChunkFromBin(c->next);
      Merge(h, c->next);
    }
  }

  // If the previous chunk is free, merge c into it and delete c.
  if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) {
    Chunk* n = ChunkFromHandle(c->prev);
    if ((n->freed_at_count == 0) || ignore_freed_at) {
      VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr;
      coalesced_chunk = c->prev;
      RemoveFreeChunkFromBin(c->prev);
      Merge(c->prev, h);
    }
  }

  return coalesced_chunk;
}

void BFCAllocator::SetSafeFrontier(uint64 count) {
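  // Monotonically raise safe_frontier_ with a CAS loop: if another thread
  // advanced it concurrently, reload and retry only while `count` is still
  // ahead of the stored value.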
  uint64 current = safe_frontier_.load(std::memory_order_relaxed);
  while (count > current) {
    if (safe_frontier_.compare_exchange_strong(current, count)) {
      retry_helper_.NotifyDealloc();
      return;
    } else {
      current = safe_frontier_.load(std::memory_order_relaxed);
    }
  }
}

bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) {
  VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size()
          << " required_bytes=" << required_bytes;
  bool satisfied = (required_bytes == 0);
  std::vector<void*> to_merge;
  std::deque<ChunkHandle> new_ts_queue;
  while (!timestamped_chunks_.empty()) {
    ChunkHandle h = timestamped_chunks_.front();
    timestamped_chunks_.pop_front();
    DCHECK_NE(h, kInvalidChunkHandle);
    Chunk* c = ChunkFromHandle(h);
    // It's possible this chunk has already been merged so refetch and retest
    // the handle.
    h = region_manager_.get_handle(c->ptr);
    if (h == kInvalidChunkHandle) {
      continue;
    }
    if (c->in_use() || (c->bin_num == kInvalidBinNum)) {
      // This chunk has already been reallocated.
      continue;
    }
    if (c->freed_at_count == 0) {
      to_merge.push_back(c->ptr);
      continue;
    }
    // Chunk should be free and assigned to a bin.
    DCHECK_NE(c->bin_num, kInvalidBinNum);
    if (c->freed_at_count < safe_frontier_) {
      c->freed_at_count = 0;
      to_merge.push_back(c->ptr);
    } else if (required_bytes > 0) {
      to_merge.push_back(c->ptr);
    } else {
      new_ts_queue.push_back(h);
    }
  }
  DCHECK(timestamped_chunks_.empty());
  std::swap(timestamped_chunks_, new_ts_queue);

  // At this point all candidate chunks have been moved from timestamped_chunks_
  // to to_merge.  If this is a standard merge (required_bytes == 0) then
  // merge them all, otherwise merge just until a Chunk of the required size
  // is produced.
  for (int ci = 0, end = to_merge.size(); ci < end; ++ci) {
    void* ptr = to_merge[ci];
    // It's possible that the Chunk associated with this memory location got
    // merged and deallocated in a prior iteration so refetch the handle and
    // retest.
    ChunkHandle h = region_manager_.get_handle(ptr);
    if (h == kInvalidChunkHandle) continue;
    if (required_bytes == 0 || !satisfied) {
      Chunk* c = ChunkFromHandle(h);
      DCHECK_NE(c->bin_num, kInvalidBinNum);
      DCHECK(!c->in_use());
      RemoveFreeChunkFromBin(h);
      ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0));
      InsertFreeChunkIntoBin(new_h);
      if (required_bytes > 0) {
        c = ChunkFromHandle(new_h);
        if (new_h != h && c->freed_at_count > 0) {
          timestamped_chunks_.push_back(new_h);
        }
        if (c->size >= required_bytes) {
          satisfied = true;
        }
      }
    } else {
      // We were force merging Chunks with unsafe timestamps, but managed
      // to create a satisfying Chunk so just requeue the rest.
      timestamped_chunks_.push_back(h);
    }
  }
  return satisfied;
}

bool BFCAllocator::TracksAllocationSizes() const { return true; }

size_t BFCAllocator::RequestedSize(const void* ptr) const {
  CHECK(ptr);
  mutex_lock l(lock_);
  BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
  CHECK(h != kInvalidChunkHandle)
      << "Asked for requested size of pointer we never allocated: " << ptr;
  const BFCAllocator::Chunk* c = ChunkFromHandle(h);
  return c->requested_size;
}

size_t BFCAllocator::AllocatedSize(const void* ptr) const {
  mutex_lock l(lock_);
  BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
  CHECK(h != kInvalidChunkHandle)
      << "Asked for allocated size of pointer we never allocated: " << ptr;
  const BFCAllocator::Chunk* c = ChunkFromHandle(h);
  return c->size;
}

int64_t BFCAllocator::AllocationId(const void* ptr) const {
  mutex_lock l(lock_);
  BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr);
  CHECK(h != kInvalidChunkHandle)
      << "Asked for allocation id of pointer we never allocated: " << ptr;
  const BFCAllocator::Chunk* c = ChunkFromHandle(h);
  return c->allocation_id;
}

namespace {

void RenderRegion(char* rendered, const size_t resolution,
                  const size_t total_render_size, const size_t offset,
                  const void* base_ptr, const void* ptr, const size_t size,
                  const char c) {
  const char* base_ptr_c = static_cast<const char*>(base_ptr);
  const char* ptr_c = static_cast<const char*>(ptr);

  size_t start_location =
      ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size;
  CHECK_GE(start_location, 0);
  CHECK_LT(start_location, resolution);
  size_t end_location =
      ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) /
      total_render_size;
  CHECK_GE(end_location, 0);
  CHECK_LT(end_location, resolution);

  for (size_t i = start_location; i <= end_location; ++i) {
    rendered[i] = c;
  }
}

}  // namespace

string BFCAllocator::RenderOccupancy() {
  // Make a buffer for the ASCII-art representation.
  const size_t resolution = 100;
  char rendered[resolution];
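  // Legend: '_' marks free space, '*' marks bytes requested by clients, and
  // 'x' marks allocator padding (allocated but not requested).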

  // Compute the total region size to render over
  size_t total_region_size = 0;
  for (const auto& region : region_manager_.regions()) {
    total_region_size += region.memory_size();
  }

  if (total_region_size == 0) {
    return "<allocator contains no memory>";
  }

  // Start out with everything empty
  RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr,
               total_region_size, '_');

  size_t region_offset = 0;
  for (const auto& region : region_manager_.regions()) {
    ChunkHandle h = region_manager_.get_handle(region.ptr());
    // Then render each chunk left to right.
    while (h != kInvalidChunkHandle) {
      Chunk* c = ChunkFromHandle(h);
      if (c->in_use()) {
        // Render the wasted space
        size_t wasted = c->size - c->requested_size;
        if (wasted > 0) {
          RenderRegion(rendered, resolution, total_region_size,
                       region_offset + c->requested_size, region.ptr(), c->ptr,
                       wasted, 'x');
        }
        // Then the occupied space
        RenderRegion(rendered, resolution, total_region_size, region_offset,
                     region.ptr(), c->ptr, c->requested_size, '*');
      }
      h = c->next;
    }
    region_offset += region.memory_size();
  }

  return string(rendered, resolution);
}

void BFCAllocator::DumpMemoryLog(size_t num_bytes) {
  const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
  LOG(INFO) << "BFCAllocator dump for " << Name();
  for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
    Bin* b = BinFromIndex(bin_num);
    const BinDebugInfo& bin_info = bin_infos[bin_num];
    CHECK_EQ(b->free_chunks.size(),
             bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);

    LOG(INFO) << "Bin (" << b->bin_size
              << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin
              << ", Chunks in use: " << bin_info.total_chunks_in_use << ". "
              << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin)
              << " allocated for chunks. "
              << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use)
              << " in use in bin. "
              << strings::HumanReadableNumBytes(
                     bin_info.total_requested_bytes_in_use)
              << " client-requested in use in bin.";
  }

  // Find the bin that we would have liked to allocate in, so we
  // can get some further analysis about fragmentation.
  Bin* b = BinForSize(num_bytes);

  LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes)
            << " was " << strings::HumanReadableNumBytes(b->bin_size)
            << ", Chunk State: ";

  for (ChunkHandle h : b->free_chunks) {
    Chunk* c = ChunkFromHandle(h);
    LOG(INFO) << c->DebugString(this, true);
  }

  // Next show the chunks that are in use, and also summarize their
  // number by size.
  std::map<size_t, int> in_use_by_size;
  for (const auto& region : region_manager_.regions()) {
    LOG(INFO) << "Next region of size " << region.memory_size();
    ChunkHandle h = region_manager_.get_handle(region.ptr());
    while (h != kInvalidChunkHandle) {
      const Chunk* c = ChunkFromHandle(h);
      if (c->in_use()) {
        in_use_by_size[c->size]++;
      }
      string buf = strings::StrCat(
          (c->in_use() ? "InUse" : "Free "), " at ",
          strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
#ifdef TENSORFLOW_MEM_DEBUG
      if (ShouldRecordOpName()) {
        strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
                           c->action_count, " step ", c->step_id);
      }
#endif
      strings::StrAppend(&buf, " next ", c->next);
      if (timing_counter_) {
        strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
      }
      LOG(INFO) << buf;
      h = c->next;
    }
  }

  LOG(INFO) << " Summary of in-use Chunks by size: ";
  size_t total_bytes = 0;
  for (auto& it : in_use_by_size) {
    LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling "
              << strings::HumanReadableNumBytes(it.first * it.second);
    total_bytes += (it.first * it.second);
  }
  LOG(INFO) << "Sum Total of in-use chunks: "
            << strings::HumanReadableNumBytes(total_bytes);
  LOG(INFO) << "total_region_allocated_bytes_: "
            << total_region_allocated_bytes_
            << " memory_limit_: " << memory_limit_ << " available bytes: "
            << (memory_limit_ - total_region_allocated_bytes_)
            << " curr_region_allocation_bytes_: "
            << curr_region_allocation_bytes_;
  LOG(INFO) << "Stats: \n" << stats_.DebugString();
}

void BFCAllocator::MaybeWriteMemoryMap() {
  const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
  if (gpu_memory_map_file != nullptr) {
    std::unique_ptr<WritableFile> dump_file;
    string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
                                       Env::Default()->NowMicros());
    Status status = Env::Default()->NewWritableFile(file_name, &dump_file);
    if (!status.ok()) {
      LOG(ERROR) << "Failed to open file " << file_name;
      return;
    }
    MemoryDump md = RecordMemoryMapInternal();
    status = dump_file->Append(md.SerializeAsString());
    if (!status.ok()) {
      LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": "
                 << status;
    }
  }
}

MemoryDump BFCAllocator::RecordMemoryMap() {
  mutex_lock l(lock_);
  return RecordMemoryMapInternal();
}

MemoryDump BFCAllocator::RecordMemoryMapInternal() {
  MemoryDump md;
  md.set_allocator_name(Name());

  // Record the general stats
  MemAllocatorStats* mas = md.mutable_stats();
  mas->set_num_allocs(stats_.num_allocs);
  mas->set_bytes_in_use(stats_.bytes_in_use);
  mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use);
  mas->set_largest_alloc_size(stats_.largest_alloc_size);

  // Record summary data for every bin.
  const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info();
  for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) {
    Bin* b = BinFromIndex(bin_num);
    const BinDebugInfo& bin_info = bin_infos[bin_num];
    DCHECK_EQ(b->free_chunks.size(),
              bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use);
    BinSummary* bs = md.add_bin_summary();
    bs->set_bin(bin_num);
    bs->set_total_bytes_in_use(bin_info.total_bytes_in_use);
    bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin);
    bs->set_total_chunks_in_use(bin_info.total_chunks_in_use);
    bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin);
  }

  // Record state of every defined Chunk.
  for (const auto& region : region_manager_.regions()) {
    ChunkHandle h = region_manager_.get_handle(region.ptr());
    while (h != kInvalidChunkHandle) {
      const Chunk* c = ChunkFromHandle(h);
      MemChunk* mc = md.add_chunk();
      mc->set_in_use(c->in_use());
      mc->set_address(reinterpret_cast<uint64>(c->ptr));
      mc->set_size(c->size);
      mc->set_requested_size(c->requested_size);
      mc->set_bin(c->bin_num);
#ifdef TENSORFLOW_MEM_DEBUG
      mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
      mc->set_step_id(c->step_id);
      mc->set_action_count(c->action_count);
#endif
      if (timing_counter_) {
        mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count);
      }
      h = c->next;
    }
  }

  mas->set_fragmentation_metric(GetFragmentation());

#ifdef TENSORFLOW_MEM_DEBUG
  // Record the recent size history
  int history_len = static_cast<int>(std::min(
      action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE)));
  for (int i = action_counter_ - history_len; i < action_counter_; ++i) {
    SnapShot* ss = md.add_snap_shot();
    ss->set_action_count(i);
    int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE;
    ss->set_size(size_history_[slot]);
  }
#endif

  return md;
}

absl::optional<AllocatorStats> BFCAllocator::GetStats() {
  mutex_lock l(lock_);
  return stats_;
}

bool BFCAllocator::ClearStats() {
  mutex_lock l(lock_);
  stats_.num_allocs = 0;
  stats_.peak_bytes_in_use = stats_.bytes_in_use;
  stats_.largest_alloc_size = 0;
  return true;
}

std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins>
BFCAllocator::get_bin_debug_info() {
  std::array<BinDebugInfo, kNumBins> bin_infos;
  for (const auto& region : region_manager_.regions()) {
    ChunkHandle h = region_manager_.get_handle(region.ptr());
    while (h != kInvalidChunkHandle) {
      const Chunk* c = ChunkFromHandle(h);
      BinNum bin_num = BinNumForSize(c->size);
      BinDebugInfo& bin_info = bin_infos[bin_num];
      bin_info.total_bytes_in_bin += c->size;
      bin_info.total_chunks_in_bin++;
      if (c->in_use()) {
        bin_info.total_bytes_in_use += c->size;
        bin_info.total_requested_bytes_in_use += c->requested_size;
        bin_info.total_chunks_in_use++;
      } else {
        Bin* bin = BinFromIndex(bin_num);
        CHECK_EQ(bin->free_chunks.count(h), 1);
        CHECK_EQ(c->bin_num, bin_num);
      }
      h = c->next;
    }
  }
  return bin_infos;
}

AllocatorMemoryType BFCAllocator::GetMemoryType() const {
  return sub_allocator_->GetMemoryType();
}

}  // namespace tensorflow