1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/core/common_runtime/bfc_allocator.h" |
17 | |
18 | #include <algorithm> |
19 | #include <atomic> |
20 | #include <utility> |
21 | |
22 | #include "absl/strings/string_view.h" |
23 | #include "tensorflow/core/common_runtime/allocator_retry.h" |
24 | #include "tensorflow/core/lib/core/bits.h" |
25 | #include "tensorflow/core/lib/strings/numbers.h" |
26 | #include "tensorflow/core/lib/strings/str_util.h" |
27 | #include "tensorflow/core/lib/strings/strcat.h" |
28 | #include "tensorflow/core/platform/file_system.h" |
29 | #include "tensorflow/core/platform/logging.h" |
30 | #include "tensorflow/core/platform/mutex.h" |
31 | #ifdef TENSORFLOW_MEM_DEBUG |
32 | #include "tensorflow/core/platform/stacktrace.h" |
33 | #endif |
34 | #include "tensorflow/core/platform/types.h" |
35 | #include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h" |
36 | #include "tensorflow/core/profiler/lib/traceme.h" |
37 | #include "tensorflow/core/protobuf/bfc_memory_map.pb.h" |
38 | |
39 | namespace tensorflow { |
40 | |
41 | constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle; |
42 | |
43 | BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator, |
44 | size_t total_memory, const string& name, |
45 | const Options& opts) |
46 | : opts_(opts), |
47 | coalesce_regions_(sub_allocator->SupportsCoalescing()), |
48 | sub_allocator_(std::move(sub_allocator)), |
49 | name_(name), |
50 | free_chunks_list_(kInvalidChunkHandle), |
51 | next_allocation_id_(1) { |
52 | if (opts.allow_growth) { |
53 | // 2MiB smallest initial allocation, unless total memory available |
54 | // is less. |
55 | curr_region_allocation_bytes_ = |
56 | RoundedBytes(std::min(total_memory, size_t{2 << 20})); |
57 | } else { |
58 | curr_region_allocation_bytes_ = RoundedBytes(total_memory); |
59 | } |
60 | |
61 | // Allocate the requested amount of memory. |
62 | memory_limit_ = total_memory; |
63 | stats_.bytes_limit = static_cast<int64_t>(total_memory); |
64 | |
65 | // Create a bunch of bins of various good sizes. |
66 | |
// We create bins to cover all possible allocation sizes, from
// allocations of up to 256 bytes all the way up to (and including)
// the memory limit.
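// For example, with a 256-byte base bin size (implied by the +255 check
// below), bin b holds free chunks of sizes in [256 << b, 256 << (b + 1)):
// bin 0 covers [256, 512), bin 1 covers [512, 1024), and so on.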
70 | VLOG(1) << "Creating new BFCAllocator named: " << name; |
71 | for (BinNum b = 0; b < kNumBins; b++) { |
72 | size_t bin_size = BinNumToSize(b); |
73 | VLOG(1) << "Creating bin of max chunk size " |
74 | << strings::HumanReadableNumBytes(bin_size); |
75 | new (BinFromIndex(b)) Bin(this, bin_size); |
76 | CHECK_EQ(BinForSize(bin_size), BinFromIndex(b)); |
77 | CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b)); |
78 | CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b)); |
79 | if (b + 1 < kNumBins) { |
80 | CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b)); |
81 | } |
82 | } |
83 | } |
84 | |
85 | BFCAllocator::~BFCAllocator() { |
// Return all memory to the sub-allocator.
87 | VLOG(2) << "Number of regions allocated: " |
88 | << region_manager_.regions().size(); |
89 | for (const auto& region : region_manager_.regions()) { |
90 | sub_allocator_->Free(region.ptr(), region.memory_size()); |
91 | } |
92 | |
93 | for (BinNum b = 0; b < kNumBins; b++) { |
94 | BinFromIndex(b)->~Bin(); |
95 | } |
96 | } |
97 | |
98 | BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { |
99 | DCHECK_GE(h, 0); |
100 | DCHECK_LT(h, static_cast<int>(chunks_.size())); |
101 | return &(chunks_[h]); |
102 | } |
103 | |
104 | const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const { |
105 | DCHECK_GE(h, 0); |
106 | DCHECK_LT(h, static_cast<int>(chunks_.size())); |
107 | return &(chunks_[h]); |
108 | } |
109 | |
110 | bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { |
111 | size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; |
112 | // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. |
113 | available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; |
114 | |
115 | // Do we have enough space to handle the client's request? |
116 | // If not, fail immediately. |
117 | if (rounded_bytes > available_bytes) { |
118 | return false; |
119 | } |
120 | |
121 | // If curr_region_allocation_bytes_ is not enough to satisfy the |
122 | // allocation, keep multiplying by a power of two until that is |
123 | // sufficient. |
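// (e.g., a 2MiB region grows to 4MiB, then 8MiB, and so on until it is
// large enough to hold rounded_bytes.)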
124 | bool increased_allocation = false; |
125 | while (rounded_bytes > curr_region_allocation_bytes_) { |
126 | curr_region_allocation_bytes_ *= 2; |
127 | increased_allocation = true; |
128 | } |
129 | |
130 | // Try allocating. |
131 | size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); |
132 | size_t bytes_received; |
133 | void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received); |
134 | if (mem_addr == nullptr && !started_backpedal_) { |
135 | // Only backpedal once. |
136 | started_backpedal_ = true; |
137 | |
138 | static constexpr float kBackpedalFactor = 0.9; |
139 | |
140 | // Try allocating less memory. |
141 | while (mem_addr == nullptr) { |
142 | bytes = RoundedBytes(bytes * kBackpedalFactor); |
143 | if (bytes < rounded_bytes) break; |
144 | mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received); |
145 | } |
146 | } |
147 | |
148 | if (mem_addr == nullptr) { |
149 | return false; |
150 | } |
151 | |
152 | if (!increased_allocation) { |
153 | // Increase the region size of the next required allocation. |
154 | curr_region_allocation_bytes_ *= 2; |
155 | } |
156 | |
157 | VLOG(1) << "Extending allocation by " |
158 | << strings::HumanReadableNumBytes(bytes_received) << " bytes for " |
<< Name() << ".";
160 | |
161 | total_region_allocated_bytes_ += bytes_received; |
162 | VLOG(1) << "Total allocated bytes: " |
163 | << strings::HumanReadableNumBytes(total_region_allocated_bytes_); |
164 | |
165 | VLOG(1) << "Allocated memory at " << mem_addr << " to " |
166 | << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received); |
167 | |
168 | AllocationRegion* maybe_extended_region = nullptr; |
169 | if (coalesce_regions_) { |
170 | maybe_extended_region = |
171 | region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received); |
172 | } else { |
173 | region_manager_.AddAllocationRegion(mem_addr, bytes_received); |
174 | } |
175 | |
176 | // Create one large chunk for the whole memory space that will |
177 | // be chunked later. |
178 | ChunkHandle h = AllocateChunk(); |
179 | BFCAllocator::Chunk* c = ChunkFromHandle(h); |
180 | c->ptr = mem_addr; |
181 | c->size = bytes_received; |
182 | c->allocation_id = -1; |
183 | c->prev = kInvalidChunkHandle; |
184 | c->next = kInvalidChunkHandle; |
185 | c->freed_at_count = 0; |
186 | |
187 | region_manager_.set_handle(c->ptr, h); |
188 | |
189 | // If the region was extended, then there exists a previous chunk that should |
190 | // be linked to the new chunk. |
191 | if (maybe_extended_region != nullptr) { |
192 | ChunkHandle prev = |
193 | maybe_extended_region->get_handle(maybe_extended_region->ptr()); |
194 | BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev); |
195 | // Find the last recorded chunk in the extended region. |
196 | while (prev_chunk->next != kInvalidChunkHandle) { |
197 | prev = prev_chunk->next; |
198 | prev_chunk = ChunkFromHandle(prev); |
199 | } |
200 | c->prev = prev; |
201 | prev_chunk->next = h; |
202 | } |
203 | |
204 | // Maybe merge adjacent chunks and insert the chunk into the right bin. |
205 | InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false)); |
206 | |
207 | return true; |
208 | } |
209 | |
210 | BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() { |
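// Chunk metadata lives in the chunks_ vector and handles are indices into
// it. Reuse a handle from the free list when one is available; otherwise
// grow the vector by one slot.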
211 | if (free_chunks_list_ != kInvalidChunkHandle) { |
212 | ChunkHandle h = free_chunks_list_; |
213 | Chunk* c = ChunkFromHandle(h); |
214 | free_chunks_list_ = c->next; |
215 | return h; |
216 | } else { |
217 | ChunkHandle h = chunks_.size(); |
218 | chunks_.resize(h + 1); |
219 | return h; |
220 | } |
221 | } |
222 | |
223 | void BFCAllocator::DeallocateChunk(ChunkHandle h) { |
224 | Chunk* c = ChunkFromHandle(h); |
225 | c->allocation_id = -1; |
226 | c->bin_num = kInvalidBinNum; |
227 | c->next = free_chunks_list_; |
228 | free_chunks_list_ = h; |
229 | } |
230 | |
231 | void* BFCAllocator::AllocateRawInternalWithRetry( |
232 | size_t unused_alignment, size_t num_bytes, |
233 | const AllocationAttributes& allocation_attr) { |
// Fast path: Try once to allocate without getting the retry_helper_
// involved.
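// freed_by_func, when provided, yields a deallocation timestamp; only
// chunks freed at or before that count may satisfy this request (see the
// freed_before check in FindChunkPtr).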
235 | uint64 freed_by_count = 0; |
236 | if (allocation_attr.freed_by_func != nullptr) { |
237 | freed_by_count = (*allocation_attr.freed_by_func)(); |
238 | } |
239 | void* r = |
240 | AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count); |
241 | if (r != nullptr) { |
242 | return r; |
243 | } else { |
244 | static const int64_t kMaxMillisToWait = 10000; // 10 seconds |
245 | r = retry_helper_.AllocateRaw( |
246 | [this, &allocation_attr](size_t a, size_t nb, bool v) { |
247 | uint64 freed_by_count = 0; |
248 | if (allocation_attr.freed_by_func != nullptr) { |
249 | freed_by_count = (*allocation_attr.freed_by_func)(); |
250 | } |
251 | return AllocateRawInternal(a, nb, v, freed_by_count); |
252 | }, |
253 | kMaxMillisToWait, unused_alignment, num_bytes); |
254 | return r; |
255 | } |
256 | } |
257 | |
258 | void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes, |
259 | const AllocationAttributes& allocation_attr) { |
260 | VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes; |
261 | void* result = [&] { |
262 | if (!opts_.allow_retry_on_failure || !allocation_attr.retry_on_failure) { |
263 | // If we have globally disabled retry-on-failure and fail to allocate an |
264 | // "important" alloc, we want to print a log, because the program may be |
265 | // about to fail due to OOM. |
266 | // |
267 | // Bit of a hack: We deem "important" allocs as those which are retryable. |
268 | // In TF, *non*-retryable allocations are usually those which we can |
269 | // tolerate failing. For example, we allocate convolution scratch memory |
270 | // as non-retryable; if it fails, we'll just use a fallback algorithm that |
271 | // uses no scratch. |
272 | static std::atomic<int32> log_counter{0}; |
273 | constexpr int kMaxFailureLogs = 10; |
274 | bool dump_log_on_failure = |
275 | (/*retry is globally disabled*/ !opts_.allow_retry_on_failure && |
276 | /*alloc is "important"*/ allocation_attr.retry_on_failure && |
277 | log_counter.load(std::memory_order_relaxed) < kMaxFailureLogs) || |
278 | VLOG_IS_ON(2); |
279 | |
280 | uint64 freed_by_count = 0; |
281 | if (allocation_attr.freed_by_func != nullptr) { |
282 | freed_by_count = (*allocation_attr.freed_by_func)(); |
283 | } |
284 | void* res = AllocateRawInternal(unused_alignment, num_bytes, |
285 | dump_log_on_failure, freed_by_count); |
286 | if (res == nullptr) { |
287 | int32 counter_value = log_counter.load(std::memory_order_relaxed); |
288 | if (counter_value < kMaxFailureLogs) { |
289 | log_counter.store(counter_value + 1, std::memory_order_relaxed); |
290 | LOG(WARNING) |
291 | << "Allocator (" << Name() << ") ran out of memory trying " |
292 | << "to allocate " << strings::HumanReadableNumBytes(num_bytes) |
293 | << " with freed_by_count=" << freed_by_count << "." |
294 | << (!allocation_attr.retry_on_failure |
295 | ? " The caller indicates that this is not a failure, but" |
296 | " this may mean that there could be performance gains " |
297 | "if more memory were available." |
: "");
299 | } |
300 | } |
301 | return res; |
302 | } else { |
303 | return AllocateRawInternalWithRetry(unused_alignment, num_bytes, |
304 | allocation_attr); |
305 | } |
306 | }(); |
307 | VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes << " " << result; |
308 | return result; |
309 | } |
310 | |
311 | // static |
312 | size_t BFCAllocator::RoundedBytes(size_t bytes) { |
313 | size_t rounded_bytes = |
314 | (kMinAllocationSize * |
315 | ((bytes + kMinAllocationSize - 1) / kMinAllocationSize)); |
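// e.g., assuming kMinAllocationSize == 256 (consistent with the +255 bin
// checks in the constructor), a request for 1000 bytes is rounded up to
// ((1000 + 255) / 256) * 256 == 1024 bytes.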
316 | DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize); |
317 | return rounded_bytes; |
318 | } |
319 | |
320 | bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) |
321 | TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) { |
322 | // Do nothing if garbage collection is off. |
323 | if (!opts_.garbage_collection) { |
324 | return false; |
325 | } |
326 | |
// Search for free regions.
328 | absl::flat_hash_set<void*> free_region_ptrs; |
329 | size_t total_free_bytes = 0; |
330 | for (const AllocationRegion& region : region_manager_.regions()) { |
331 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
332 | bool any_use = false; |
333 | while (h != kInvalidChunkHandle) { |
334 | const Chunk* c = ChunkFromHandle(h); |
335 | if (c->in_use()) { |
336 | any_use = true; |
337 | break; |
338 | } |
339 | h = c->next; |
340 | } |
341 | |
342 | if (!any_use) { |
343 | VLOG(2) << "Found free region with ptr = " << region.ptr(); |
344 | free_region_ptrs.insert(region.ptr()); |
345 | total_free_bytes += region.memory_size(); |
346 | } |
347 | } |
348 | |
349 | if (total_free_bytes == 0) { |
350 | return false; |
351 | } |
352 | |
353 | // Rough estimation to check whether deallocation can help. |
354 | size_t available_bytes = |
355 | memory_limit_ - total_region_allocated_bytes_ + total_free_bytes; |
356 | if (rounded_bytes > available_bytes) { |
357 | return false; |
358 | } |
359 | |
LOG(WARNING) << "Garbage collection: deallocate free memory regions"
<< " (i.e., allocations) so that we can re-allocate a larger"
<< " region to avoid OOM due to memory fragmentation. If you"
<< " see this message frequently, you are running near the"
<< " threshold of the available device memory and re-allocation"
<< " may incur significant performance overhead. You may try"
<< " smaller batch sizes to observe the performance impact."
<< " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
<< " disable this feature.";
369 | |
370 | // Deallocate free regions. |
371 | DeallocateRegions(free_region_ptrs); |
372 | |
373 | return true; |
374 | } |
375 | |
376 | void BFCAllocator::DeallocateRegions( |
377 | const absl::flat_hash_set<void*>& region_ptrs) |
378 | TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) { |
379 | // Explicitly remove the const qualifier as some compilers disallow passing |
380 | // const_iterator to std::vector::erase(), which is used in |
381 | // RemoveAllocationRegion(). |
382 | auto regions = |
383 | const_cast<std::vector<AllocationRegion>*>(®ion_manager_.regions()); |
384 | auto it = regions->begin(); |
385 | while (it != regions->end()) { |
386 | if (!region_ptrs.contains(it->ptr())) { |
387 | ++it; |
388 | continue; |
389 | } |
390 | |
391 | VLOG(2) << "Deallocate region with ptr = " << it->ptr(); |
392 | // Remove all chunk registrations from Bins. |
393 | ChunkHandle h = region_manager_.get_handle(it->ptr()); |
394 | while (h != kInvalidChunkHandle) { |
395 | const Chunk* c = ChunkFromHandle(h); |
396 | if (c->bin_num != kInvalidBinNum) { |
397 | RemoveFreeChunkFromBin(h); |
398 | } |
399 | auto h_to_delete = h; |
400 | h = c->next; |
401 | DeleteChunk(h_to_delete); |
402 | } |
403 | |
404 | // Deallocate the memory. |
405 | sub_allocator_->Free(it->ptr(), it->memory_size()); |
406 | total_region_allocated_bytes_ -= it->memory_size(); |
407 | it = region_manager_.RemoveAllocationRegion(it); |
408 | } |
409 | } |
410 | |
411 | void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, |
412 | size_t num_bytes, |
413 | bool dump_log_on_failure, |
414 | uint64 freed_before) { |
415 | if (num_bytes == 0) { |
VLOG(2) << "tried to allocate 0 bytes";
417 | return nullptr; |
418 | } |
419 | // First, always allocate memory of at least kMinAllocationSize |
420 | // bytes, and always allocate multiples of kMinAllocationSize bytes |
421 | // so all memory addresses are nicely byte aligned. |
422 | size_t rounded_bytes = RoundedBytes(num_bytes); |
423 | |
424 | // The BFC allocator tries to find the best fit first. |
425 | BinNum bin_num = BinNumForSize(rounded_bytes); |
426 | |
427 | mutex_lock l(lock_); |
428 | if (!timestamped_chunks_.empty()) { |
429 | // Merge timestamped chunks whose counts have become safe for general use. |
430 | MergeTimestampedChunks(0); |
431 | } |
432 | void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
433 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
435 | return ptr; |
436 | } |
437 | |
438 | // Try to extend |
439 | if (Extend(unused_alignment, rounded_bytes)) { |
440 | ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
441 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
443 | return ptr; |
444 | } |
445 | } |
446 | |
447 | if ((freed_before == 0) && (!timestamped_chunks_.empty())) { |
448 | // We're unable to satisfy an allocation request without a specific |
449 | // timestamp requirement. Rather than fail, try merging any held-out |
450 | // timestamped chunks more aggressively until a free chunk of the necessary |
451 | // size is formed. |
452 | if (MergeTimestampedChunks(rounded_bytes)) { |
453 | ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
454 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
456 | return ptr; |
457 | } |
458 | } |
459 | } |
460 | |
// Reaching this point means that no chunks can satisfy the request. Also,
// the unallocated bytes cannot satisfy the request. Before giving up, let's
// try deallocating free regions so that the sub-allocator can combine them
// with the unallocated bytes and form a larger region.
465 | if (DeallocateFreeRegions(rounded_bytes) && |
466 | Extend(unused_alignment, rounded_bytes)) { |
467 | ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
468 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
470 | return ptr; |
471 | } |
472 | } |
473 | |
// We searched all bins for an existing free chunk to use and
// couldn't find one. This means we must have run out of memory.
// Dump the memory log for analysis.
477 | MaybeWriteMemoryMap(); |
478 | if (dump_log_on_failure) { |
LOG(WARNING)
<< "Allocator (" << Name() << ") ran out of memory trying "
<< "to allocate " << strings::HumanReadableNumBytes(num_bytes)
<< " (rounded to " << rounded_bytes << ")"
<< " requested by op "
<< profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation()
.pending_op_name
<< "\nIf the cause is memory fragmentation maybe the environment "
<< "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
<< "improve the situation.\nCurrent allocation summary follows.";
490 | DumpMemoryLog(rounded_bytes); |
491 | LOG(WARNING) << RenderOccupancy(); |
492 | } |
493 | return nullptr; |
494 | } |
495 | |
496 | int64_t BFCAllocator::LargestFreeChunk() { |
497 | for (int i = kNumBins - 1; i >= 0; i--) { |
498 | if (!BinFromIndex(i)->free_chunks.empty()) { |
499 | return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size; |
500 | } |
501 | } |
502 | return 0; |
503 | } |
504 | |
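// Fragmentation is measured here as the fraction of free memory that is
// not in the single largest free chunk, i.e.
// 1.0 - LargestFreeChunk() / bytes_available; 0 means all free memory is
// contiguous.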
505 | double BFCAllocator::GetFragmentation() { |
506 | int64_t bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use; |
507 | DCHECK_GT(bytes_available, 0); |
508 | return static_cast<double>(bytes_available - LargestFreeChunk()) / |
509 | bytes_available; |
510 | } |
511 | |
512 | void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) { |
513 | BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr)); |
514 | AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size); |
515 | } |
516 | |
517 | void BFCAllocator::AddTraceMe(absl::string_view traceme_name, |
518 | const void* chunk_ptr, int64_t req_bytes, |
519 | int64_t alloc_bytes) { |
520 | tensorflow::profiler::TraceMe::InstantActivity( |
521 | [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]() |
522 | TF_NO_THREAD_SAFETY_ANALYSIS { |
523 | int64_t bytes_available = |
524 | memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use; |
525 | const auto& annotation = |
526 | profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation(); |
const auto op_name = annotation.pending_op_name
? annotation.pending_op_name
: "(null)";
const auto region_type = annotation.pending_region_type
? annotation.pending_region_type
: "(null)";
533 | return tensorflow::profiler::TraceMeEncode( |
traceme_name, {{"allocator_name", name_},
{"bytes_reserved", stats_.bytes_reserved},
{"bytes_allocated", stats_.bytes_in_use},
{"bytes_available", bytes_available},
{"fragmentation", GetFragmentation()},
{"peak_bytes_in_use", stats_.peak_bytes_in_use},
{"requested_bytes", req_bytes},
{"allocation_bytes", alloc_bytes},
{"addr", reinterpret_cast<uint64>(chunk_ptr)},
{"tf_op", op_name},
{"id", annotation.pending_step_id},
{"region_type", region_type},
{"data_type", annotation.pending_data_type},
{"shape", annotation.pending_shape_func()}});
548 | }, |
549 | /*level=*/profiler::TraceMeLevel::kInfo); |
550 | } |
551 | |
552 | void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, |
553 | size_t num_bytes, uint64 freed_before) { |
554 | // First identify the first bin that could satisfy rounded_bytes. |
555 | for (; bin_num < kNumBins; bin_num++) { |
556 | // Start searching from the first bin for the smallest chunk that fits |
557 | // rounded_bytes. |
558 | Bin* b = BinFromIndex(bin_num); |
559 | for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); |
560 | ++citer) { |
561 | const BFCAllocator::ChunkHandle h = (*citer); |
562 | BFCAllocator::Chunk* chunk = ChunkFromHandle(h); |
563 | DCHECK(!chunk->in_use()); |
564 | if (freed_before > 0 && freed_before < chunk->freed_at_count) { |
565 | continue; |
566 | } |
567 | if (chunk->size >= rounded_bytes) { |
// We found an existing chunk that fits and wasn't in use, so remove
// it from the free bin structure prior to using it.
570 | RemoveFreeChunkIterFromBin(&b->free_chunks, citer); |
571 | |
// If we can break the size of the chunk into two reasonably large
// pieces, do so. In any case don't waste more than
// max_internal_fragmentation_bytes on padding. If this threshold is
// not set by the user, then use 128MB as the default.
576 | const int64_t max_internal_fragmentation_bytes = |
577 | (opts_.fragmentation_fraction > 0.0) |
578 | ? opts_.fragmentation_fraction * memory_limit_ |
579 | : 128 << 20; |
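// e.g., with fragmentation_fraction == 0.01 and a 16GiB memory limit this
// threshold is roughly 164MiB; with the option unset it is 128MiB
// (128 << 20 bytes).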
580 | |
581 | if (chunk->size >= rounded_bytes * 2 || |
582 | static_cast<int64_t>(chunk->size) - rounded_bytes >= |
583 | max_internal_fragmentation_bytes) { |
584 | SplitChunk(h, rounded_bytes); |
585 | chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved |
586 | } |
587 | |
588 | // The requested size of the returned chunk is what the user |
589 | // has allocated. |
590 | chunk->requested_size = num_bytes; |
591 | // Assign a unique id and increment the id counter, marking the |
592 | // chunk as being in use. |
593 | chunk->allocation_id = next_allocation_id_++; |
594 | |
595 | // Update stats. |
596 | ++stats_.num_allocs; |
597 | stats_.bytes_in_use += chunk->size; |
598 | if (stats_.bytes_in_use > stats_.peak_bytes_in_use) { |
599 | VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use |
600 | << " bytes for " << Name(); |
601 | } |
602 | stats_.peak_bytes_in_use = |
603 | std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); |
604 | stats_.largest_alloc_size = |
605 | std::max<std::size_t>(stats_.largest_alloc_size, chunk->size); |
606 | |
607 | #ifdef TENSORFLOW_MEM_DEBUG |
608 | if (ShouldRecordOpName()) { |
609 | const auto& annotation = |
610 | profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation(); |
611 | if (annotation.pending_op_name != nullptr) { |
612 | chunk->op_name = annotation.pending_op_name; |
613 | } else { |
614 | LOG(INFO) << "missing pending_op_name for " << Name() |
615 | << " reading addr " |
616 | << static_cast<const void*>(&annotation.pending_op_name) |
617 | << "\n" |
618 | << CurrentStackTrace(); |
619 | chunk->op_name = nullptr; |
620 | } |
621 | chunk->action_count = ++action_counter_; |
622 | chunk->step_id = annotation.pending_step_id; |
623 | int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; |
624 | size_history_[slot] = stats_.bytes_in_use; |
625 | } |
626 | #endif |
627 | |
628 | VLOG(4) << "Returning: " << chunk->ptr; |
629 | if (VLOG_IS_ON(4)) { |
630 | LOG(INFO) << "A: " << RenderOccupancy(); |
631 | } |
632 | return chunk->ptr; |
633 | } |
634 | } |
635 | } |
636 | |
637 | return nullptr; |
638 | } |
639 | |
640 | void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { |
641 | // Allocate the new chunk before we do any ChunkFromHandle |
642 | ChunkHandle h_new_chunk = AllocateChunk(); |
643 | |
644 | Chunk* c = ChunkFromHandle(h); |
645 | CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); |
646 | |
647 | // Create a new chunk starting num_bytes after c |
648 | BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk); |
649 | new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes); |
650 | region_manager_.set_handle(new_chunk->ptr, h_new_chunk); |
651 | |
652 | // Set the new sizes of the chunks. |
653 | new_chunk->size = c->size - num_bytes; |
654 | c->size = num_bytes; |
655 | |
656 | // The new chunk is not in use. |
657 | new_chunk->allocation_id = -1; |
658 | |
659 | // It inherits the freed time. |
660 | new_chunk->freed_at_count = c->freed_at_count; |
661 | |
662 | // Maintain the pointers. |
663 | // c <-> c_neighbor becomes |
664 | // c <-> new_chunk <-> c_neighbor |
665 | BFCAllocator::ChunkHandle h_neighbor = c->next; |
666 | new_chunk->prev = h; |
667 | new_chunk->next = h_neighbor; |
668 | c->next = h_new_chunk; |
669 | if (h_neighbor != kInvalidChunkHandle) { |
670 | Chunk* c_neighbor = ChunkFromHandle(h_neighbor); |
671 | c_neighbor->prev = h_new_chunk; |
672 | } |
673 | |
674 | // Add the newly free chunk to the free bin. |
675 | InsertFreeChunkIntoBin(h_new_chunk); |
676 | } |
677 | |
678 | void BFCAllocator::DeallocateRaw(void* ptr) { |
679 | VLOG(3) << "DeallocateRaw " << Name() << " " |
680 | << (ptr ? RequestedSize(ptr) : 0); |
681 | DeallocateRawInternal(ptr); |
682 | retry_helper_.NotifyDealloc(); |
683 | } |
684 | |
685 | void BFCAllocator::DeallocateRawInternal(void* ptr) { |
686 | if (ptr == nullptr) { |
VLOG(2) << "tried to deallocate nullptr";
688 | return; |
689 | } |
690 | mutex_lock l(lock_); |
691 | |
692 | // Find the chunk from the ptr. |
693 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
694 | CHECK(h != kInvalidChunkHandle); |
695 | // Record chunk information before it's freed. |
696 | Chunk* chunk = ChunkFromHandle(h); |
697 | void* chunk_ptr = chunk->ptr; |
698 | int64_t req_bytes = chunk->requested_size; |
699 | int64_t alloc_bytes = chunk->size; |
700 | |
701 | MarkFree(h); |
702 | |
703 | // Consider coalescing it. |
704 | if (timing_counter_) { |
705 | InsertFreeChunkIntoBin(h); |
706 | timestamped_chunks_.push_back(h); |
707 | } else { |
708 | InsertFreeChunkIntoBin(TryToCoalesce(h, false)); |
709 | } |
710 | |
711 | // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for |
712 | // correct aggregation stats (bytes_in_use, fragmentation). |
AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
714 | |
715 | if (VLOG_IS_ON(4)) { |
716 | LOG(INFO) << "F: " << RenderOccupancy(); |
717 | } |
718 | } |
719 | |
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
// We merge Chunk(h2) into Chunk(h1).
722 | void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1, |
723 | BFCAllocator::ChunkHandle h2) { |
724 | Chunk* c1 = ChunkFromHandle(h1); |
725 | Chunk* c2 = ChunkFromHandle(h2); |
726 | // We can only merge chunks that are not in use. |
727 | CHECK(!c1->in_use() && !c2->in_use()); |
728 | |
729 | // c1's prev doesn't change, still points to the same ptr, and is |
730 | // still not in use. |
731 | |
732 | // Fix up neighbor pointers |
733 | // |
734 | // c1 <-> c2 <-> c3 should become |
735 | // c1 <-> c3 |
736 | |
737 | BFCAllocator::ChunkHandle h3 = c2->next; |
738 | c1->next = h3; |
739 | CHECK(c2->prev == h1); |
740 | if (h3 != kInvalidChunkHandle) { |
741 | BFCAllocator::Chunk* c3 = ChunkFromHandle(h3); |
742 | c3->prev = h1; |
743 | } |
744 | |
745 | // Set the new size |
746 | c1->size += c2->size; |
747 | |
748 | // Pick latest free time. |
749 | c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count); |
750 | |
751 | DeleteChunk(h2); |
752 | } |
753 | |
754 | void BFCAllocator::DeleteChunk(ChunkHandle h) { |
755 | // Delete h and cleanup all state |
756 | Chunk* c = ChunkFromHandle(h); |
757 | // VLOG(4) << "Removing: " << c->ptr; |
758 | region_manager_.erase(c->ptr); |
759 | DeallocateChunk(h); |
760 | } |
761 | |
762 | void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) { |
763 | Chunk* c = ChunkFromHandle(h); |
764 | CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); |
765 | BinNum bin_num = BinNumForSize(c->size); |
766 | Bin* new_bin = BinFromIndex(bin_num); |
767 | c->bin_num = bin_num; |
768 | new_bin->free_chunks.insert(h); |
769 | } |
770 | |
771 | void BFCAllocator::RemoveFreeChunkIterFromBin( |
772 | BFCAllocator::Bin::FreeChunkSet* free_chunks, |
773 | const BFCAllocator::Bin::FreeChunkSet::iterator& citer) { |
774 | ChunkHandle h = *citer; |
775 | Chunk* c = ChunkFromHandle(h); |
776 | CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); |
777 | free_chunks->erase(citer); |
778 | c->bin_num = kInvalidBinNum; |
779 | } |
780 | |
781 | void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) { |
782 | Chunk* c = ChunkFromHandle(h); |
783 | CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); |
784 | CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0) |
<< "Could not find chunk in bin";
786 | c->bin_num = kInvalidBinNum; |
787 | } |
788 | |
789 | void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) { |
790 | Chunk* c = ChunkFromHandle(h); |
791 | CHECK(c->in_use() && (c->bin_num == kInvalidBinNum)); |
792 | |
793 | // Mark the chunk as no longer in use. |
794 | c->allocation_id = -1; |
795 | |
796 | // Optionally record the free time. |
797 | if (timing_counter_) { |
798 | c->freed_at_count = timing_counter_->next(); |
799 | } |
800 | |
// Update the stats.
802 | stats_.bytes_in_use -= c->size; |
803 | |
804 | #ifdef TENSORFLOW_MEM_DEBUG |
805 | if (ShouldRecordOpName()) { |
806 | c->action_count = ++action_counter_; |
807 | int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; |
808 | size_history_[slot] = stats_.bytes_in_use; |
809 | } |
810 | #endif |
811 | } |
812 | |
813 | BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h, |
814 | bool ignore_freed_at) { |
815 | Chunk* c = ChunkFromHandle(h); |
816 | if ((!ignore_freed_at) && c->freed_at_count > 0) return h; |
817 | ChunkHandle coalesced_chunk = h; |
818 | |
819 | // If the next chunk is free, merge it into c and delete it. |
820 | if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) { |
821 | Chunk* n = ChunkFromHandle(c->next); |
822 | if ((n->freed_at_count == 0) || ignore_freed_at) { |
823 | VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr; |
824 | RemoveFreeChunkFromBin(c->next); |
825 | Merge(h, c->next); |
826 | } |
827 | } |
828 | |
829 | // If the previous chunk is free, merge c into it and delete c. |
830 | if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) { |
831 | Chunk* n = ChunkFromHandle(c->prev); |
832 | if ((n->freed_at_count == 0) || ignore_freed_at) { |
833 | VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr; |
834 | coalesced_chunk = c->prev; |
835 | RemoveFreeChunkFromBin(c->prev); |
836 | Merge(c->prev, h); |
837 | } |
838 | } |
839 | |
840 | return coalesced_chunk; |
841 | } |
842 | |
843 | void BFCAllocator::SetSafeFrontier(uint64 count) { |
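// Monotonically advance safe_frontier_ with a CAS loop: a lost race simply
// reloads the current value and retries, and a successful advance may make
// timestamped chunks usable, so waiting allocations are notified.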
844 | uint64 current = safe_frontier_.load(std::memory_order_relaxed); |
845 | while (count > current) { |
846 | if (safe_frontier_.compare_exchange_strong(current, count)) { |
847 | retry_helper_.NotifyDealloc(); |
848 | return; |
849 | } else { |
850 | current = safe_frontier_.load(std::memory_order_relaxed); |
851 | } |
852 | } |
853 | } |
854 | |
855 | bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) { |
856 | VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size() |
857 | << " required_bytes=" << required_bytes; |
858 | bool satisfied = (required_bytes == 0); |
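// First pass: partition timestamped_chunks_ into chunks that may be merged
// now (timestamp behind safe_frontier_, or any free chunk when
// required_bytes forces the issue) and chunks that must stay queued.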
859 | std::vector<void*> to_merge; |
860 | std::deque<ChunkHandle> new_ts_queue; |
861 | while (!timestamped_chunks_.empty()) { |
862 | ChunkHandle h = timestamped_chunks_.front(); |
863 | timestamped_chunks_.pop_front(); |
864 | DCHECK_NE(h, kInvalidChunkHandle); |
865 | Chunk* c = ChunkFromHandle(h); |
866 | // It's possible this chunk has already been merged so refetch and retest |
867 | // the handle. |
868 | h = region_manager_.get_handle(c->ptr); |
869 | if (h == kInvalidChunkHandle) { |
870 | continue; |
871 | } |
872 | if (c->in_use() || (c->bin_num == kInvalidBinNum)) { |
873 | // This chunk has already been reallocated. |
874 | continue; |
875 | } |
876 | if (c->freed_at_count == 0) { |
877 | to_merge.push_back(c->ptr); |
878 | continue; |
879 | } |
880 | // Chunk should be free and assigned to a bin. |
881 | DCHECK_NE(c->bin_num, kInvalidBinNum); |
882 | if (c->freed_at_count < safe_frontier_) { |
883 | c->freed_at_count = 0; |
884 | to_merge.push_back(c->ptr); |
885 | } else if (required_bytes > 0) { |
886 | to_merge.push_back(c->ptr); |
887 | } else { |
888 | new_ts_queue.push_back(h); |
889 | } |
890 | } |
891 | DCHECK(timestamped_chunks_.empty()); |
892 | std::swap(timestamped_chunks_, new_ts_queue); |
893 | |
894 | // At this point all candidate chunks have been moved from timestamped_chunks_ |
895 | // to to_merge. If this is a standard merge (required_bytes == 0) then |
896 | // merge them all, otherwise merge just until a Chunk of the required size |
897 | // is produced. |
898 | for (int ci = 0, end = to_merge.size(); ci < end; ++ci) { |
899 | void* ptr = to_merge[ci]; |
900 | // It's possible that the Chunk associated with this memory location got |
901 | // merged and deallocated in a prior iteration so refetch the handle and |
902 | // retest. |
903 | ChunkHandle h = region_manager_.get_handle(ptr); |
904 | if (h == kInvalidChunkHandle) continue; |
905 | if (required_bytes == 0 || !satisfied) { |
906 | Chunk* c = ChunkFromHandle(h); |
907 | DCHECK_NE(c->bin_num, kInvalidBinNum); |
908 | DCHECK(!c->in_use()); |
909 | RemoveFreeChunkFromBin(h); |
910 | ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0)); |
911 | InsertFreeChunkIntoBin(new_h); |
912 | if (required_bytes > 0) { |
913 | c = ChunkFromHandle(new_h); |
914 | if (new_h != h && c->freed_at_count > 0) { |
915 | timestamped_chunks_.push_back(new_h); |
916 | } |
917 | if (c->size >= required_bytes) { |
918 | satisfied = true; |
919 | } |
920 | } |
921 | } else { |
// We were force-merging Chunks with unsafe timestamps, but managed
// to create a satisfying Chunk so just requeue the rest.
924 | timestamped_chunks_.push_back(h); |
925 | } |
926 | } |
927 | return satisfied; |
928 | } |
929 | |
930 | bool BFCAllocator::TracksAllocationSizes() const { return true; } |
931 | |
932 | size_t BFCAllocator::RequestedSize(const void* ptr) const { |
933 | CHECK(ptr); |
934 | mutex_lock l(lock_); |
935 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
936 | CHECK(h != kInvalidChunkHandle) |
937 | << "Asked for requested size of pointer we never allocated: " << ptr; |
938 | const BFCAllocator::Chunk* c = ChunkFromHandle(h); |
939 | return c->requested_size; |
940 | } |
941 | |
942 | size_t BFCAllocator::AllocatedSize(const void* ptr) const { |
943 | mutex_lock l(lock_); |
944 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
945 | CHECK(h != kInvalidChunkHandle) |
946 | << "Asked for allocated size of pointer we never allocated: " << ptr; |
947 | const BFCAllocator::Chunk* c = ChunkFromHandle(h); |
948 | return c->size; |
949 | } |
950 | |
951 | int64_t BFCAllocator::AllocationId(const void* ptr) const { |
952 | mutex_lock l(lock_); |
953 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
954 | CHECK(h != kInvalidChunkHandle) |
955 | << "Asked for allocation id of pointer we never allocated: " << ptr; |
956 | const BFCAllocator::Chunk* c = ChunkFromHandle(h); |
957 | return c->allocation_id; |
958 | } |
959 | |
960 | namespace { |
961 | |
962 | void RenderRegion(char* rendered, const size_t resolution, |
963 | const size_t total_render_size, const size_t offset, |
964 | const void* base_ptr, const void* ptr, const size_t size, |
965 | const char c) { |
966 | const char* base_ptr_c = static_cast<const char*>(base_ptr); |
967 | const char* ptr_c = static_cast<const char*>(ptr); |
968 | |
969 | size_t start_location = |
970 | ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size; |
971 | CHECK_GE(start_location, 0); |
972 | CHECK_LT(start_location, resolution); |
973 | size_t end_location = |
974 | ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) / |
975 | total_render_size; |
976 | CHECK_GE(end_location, 0); |
977 | CHECK_LT(end_location, resolution); |
978 | |
979 | for (size_t i = start_location; i <= end_location; ++i) { |
980 | rendered[i] = c; |
981 | } |
982 | } |
983 | |
984 | } // namespace |
985 | |
986 | string BFCAllocator::RenderOccupancy() { |
987 | // Make a buffer for the ASCII-art representation. |
988 | const size_t resolution = 100; |
989 | char rendered[resolution]; |
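// Legend: '_' marks free space, '*' marks client-requested bytes in use,
// and 'x' marks allocated-but-unrequested padding inside in-use chunks.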
990 | |
991 | // Compute the total region size to render over |
992 | size_t total_region_size = 0; |
993 | for (const auto& region : region_manager_.regions()) { |
994 | total_region_size += region.memory_size(); |
995 | } |
996 | |
997 | if (total_region_size == 0) { |
return "<allocator contains no memory>";
999 | } |
1000 | |
1001 | // Start out with everything empty |
1002 | RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr, |
1003 | total_region_size, '_'); |
1004 | |
1005 | size_t region_offset = 0; |
1006 | for (const auto& region : region_manager_.regions()) { |
1007 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1008 | // Then render each chunk left to right. |
1009 | while (h != kInvalidChunkHandle) { |
1010 | Chunk* c = ChunkFromHandle(h); |
1011 | if (c->in_use()) { |
1012 | // Render the wasted space |
1013 | size_t wasted = c->size - c->requested_size; |
1014 | if (wasted > 0) { |
1015 | RenderRegion(rendered, resolution, total_region_size, |
1016 | region_offset + c->requested_size, region.ptr(), c->ptr, |
1017 | wasted, 'x'); |
1018 | } |
1019 | // Then the occupied space |
1020 | RenderRegion(rendered, resolution, total_region_size, region_offset, |
1021 | region.ptr(), c->ptr, c->requested_size, '*'); |
1022 | } |
1023 | h = c->next; |
1024 | } |
1025 | region_offset += region.memory_size(); |
1026 | } |
1027 | |
1028 | return string(rendered, resolution); |
1029 | } |
1030 | |
1031 | void BFCAllocator::DumpMemoryLog(size_t num_bytes) { |
1032 | const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info(); |
1033 | LOG(INFO) << "BFCAllocator dump for " << Name(); |
1034 | for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { |
1035 | Bin* b = BinFromIndex(bin_num); |
1036 | const BinDebugInfo& bin_info = bin_infos[bin_num]; |
1037 | CHECK_EQ(b->free_chunks.size(), |
1038 | bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use); |
1039 | |
1040 | LOG(INFO) << "Bin (" << b->bin_size |
1041 | << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin |
1042 | << ", Chunks in use: " << bin_info.total_chunks_in_use << ". " |
1043 | << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin) |
1044 | << " allocated for chunks. " |
1045 | << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use) |
1046 | << " in use in bin. " |
1047 | << strings::HumanReadableNumBytes( |
1048 | bin_info.total_requested_bytes_in_use) |
<< " client-requested in use in bin.";
1050 | } |
1051 | |
1052 | // Find the bin that we would have liked to allocate in, so we |
1053 | // can get some further analysis about fragmentation. |
1054 | Bin* b = BinForSize(num_bytes); |
1055 | |
1056 | LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes) |
1057 | << " was " << strings::HumanReadableNumBytes(b->bin_size) |
<< ", Chunk State: ";
1059 | |
1060 | for (ChunkHandle h : b->free_chunks) { |
1061 | Chunk* c = ChunkFromHandle(h); |
1062 | LOG(INFO) << c->DebugString(this, true); |
1063 | } |
1064 | |
1065 | // Next show the chunks that are in use, and also summarize their |
1066 | // number by size. |
1067 | std::map<size_t, int> in_use_by_size; |
1068 | for (const auto& region : region_manager_.regions()) { |
1069 | LOG(INFO) << "Next region of size " << region.memory_size(); |
1070 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1071 | while (h != kInvalidChunkHandle) { |
1072 | const Chunk* c = ChunkFromHandle(h); |
1073 | if (c->in_use()) { |
1074 | in_use_by_size[c->size]++; |
1075 | } |
string buf = strings::StrCat(
(c->in_use() ? "InUse" : "Free "), " at ",
strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
1079 | #ifdef TENSORFLOW_MEM_DEBUG |
1080 | if (ShouldRecordOpName()) { |
strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
c->action_count, " step ", c->step_id);
1083 | } |
1084 | #endif |
strings::StrAppend(&buf, " next ", c->next);
1086 | if (timing_counter_) { |
strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
1088 | } |
1089 | LOG(INFO) << buf; |
1090 | h = c->next; |
1091 | } |
1092 | } |
1093 | |
LOG(INFO) << " Summary of in-use Chunks by size: ";
1095 | size_t total_bytes = 0; |
1096 | for (auto& it : in_use_by_size) { |
1097 | LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling " |
1098 | << strings::HumanReadableNumBytes(it.first * it.second); |
1099 | total_bytes += (it.first * it.second); |
1100 | } |
1101 | LOG(INFO) << "Sum Total of in-use chunks: " |
1102 | << strings::HumanReadableNumBytes(total_bytes); |
1103 | LOG(INFO) << "total_region_allocated_bytes_: " |
1104 | << total_region_allocated_bytes_ |
1105 | << " memory_limit_: " << memory_limit_ << " available bytes: " |
1106 | << (memory_limit_ - total_region_allocated_bytes_) |
1107 | << " curr_region_allocation_bytes_: " |
1108 | << curr_region_allocation_bytes_; |
1109 | LOG(INFO) << "Stats: \n" << stats_.DebugString(); |
1110 | } |
1111 | |
1112 | void BFCAllocator::MaybeWriteMemoryMap() { |
const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
1114 | if (gpu_memory_map_file != nullptr) { |
1115 | std::unique_ptr<WritableFile> dump_file; |
string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
Env::Default()->NowMicros());
1118 | Status status = Env::Default()->NewWritableFile(file_name, &dump_file); |
1119 | if (!status.ok()) { |
1120 | LOG(ERROR) << "Failed to open file " << file_name; |
1121 | return; |
1122 | } |
1123 | MemoryDump md = RecordMemoryMapInternal(); |
1124 | status = dump_file->Append(md.SerializeAsString()); |
1125 | if (!status.ok()) { |
1126 | LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": " |
1127 | << status; |
1128 | } |
1129 | } |
1130 | } |
1131 | |
1132 | MemoryDump BFCAllocator::RecordMemoryMap() { |
1133 | mutex_lock l(lock_); |
1134 | return RecordMemoryMapInternal(); |
1135 | } |
1136 | |
1137 | MemoryDump BFCAllocator::RecordMemoryMapInternal() { |
1138 | MemoryDump md; |
1139 | md.set_allocator_name(Name()); |
1140 | |
1141 | // Record the general stats |
1142 | MemAllocatorStats* mas = md.mutable_stats(); |
1143 | mas->set_num_allocs(stats_.num_allocs); |
1144 | mas->set_bytes_in_use(stats_.bytes_in_use); |
1145 | mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use); |
1146 | mas->set_largest_alloc_size(stats_.largest_alloc_size); |
1147 | |
1148 | // Record summary data for every bin. |
1149 | const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info(); |
1150 | for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { |
1151 | Bin* b = BinFromIndex(bin_num); |
1152 | const BinDebugInfo& bin_info = bin_infos[bin_num]; |
1153 | DCHECK_EQ(b->free_chunks.size(), |
1154 | bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use); |
1155 | BinSummary* bs = md.add_bin_summary(); |
1156 | bs->set_bin(bin_num); |
1157 | bs->set_total_bytes_in_use(bin_info.total_bytes_in_use); |
1158 | bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin); |
1159 | bs->set_total_chunks_in_use(bin_info.total_chunks_in_use); |
1160 | bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin); |
1161 | } |
1162 | |
1163 | // Record state of every defined Chunk. |
1164 | for (const auto& region : region_manager_.regions()) { |
1165 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1166 | while (h != kInvalidChunkHandle) { |
1167 | const Chunk* c = ChunkFromHandle(h); |
1168 | MemChunk* mc = md.add_chunk(); |
1169 | mc->set_in_use(c->in_use()); |
1170 | mc->set_address(reinterpret_cast<uint64>(c->ptr)); |
1171 | mc->set_size(c->size); |
1172 | mc->set_requested_size(c->requested_size); |
1173 | mc->set_bin(c->bin_num); |
1174 | #ifdef TENSORFLOW_MEM_DEBUG |
mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
1176 | mc->set_step_id(c->step_id); |
1177 | mc->set_action_count(c->action_count); |
1178 | #endif |
1179 | if (timing_counter_) { |
1180 | mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count); |
1181 | } |
1182 | h = c->next; |
1183 | } |
1184 | } |
1185 | |
1186 | mas->set_fragmentation_metric(GetFragmentation()); |
1187 | |
1188 | #ifdef TENSORFLOW_MEM_DEBUG |
1189 | // Record the recent size history |
1190 | int history_len = static_cast<int>(std::min( |
1191 | action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE))); |
1192 | for (int i = action_counter_ - history_len; i < action_counter_; ++i) { |
1193 | SnapShot* ss = md.add_snap_shot(); |
1194 | ss->set_action_count(i); |
1195 | int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE; |
1196 | ss->set_size(size_history_[slot]); |
1197 | } |
1198 | #endif |
1199 | |
1200 | return md; |
1201 | } |
1202 | |
1203 | absl::optional<AllocatorStats> BFCAllocator::GetStats() { |
1204 | mutex_lock l(lock_); |
1205 | return stats_; |
1206 | } |
1207 | |
1208 | bool BFCAllocator::ClearStats() { |
1209 | mutex_lock l(lock_); |
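// bytes_in_use is deliberately left unchanged: live allocations persist
// across a stats reset, so the peak is reset to the current usage rather
// than to zero.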
1210 | stats_.num_allocs = 0; |
1211 | stats_.peak_bytes_in_use = stats_.bytes_in_use; |
1212 | stats_.largest_alloc_size = 0; |
1213 | return true; |
1214 | } |
1215 | |
1216 | std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins> |
1217 | BFCAllocator::get_bin_debug_info() { |
1218 | std::array<BinDebugInfo, kNumBins> bin_infos; |
1219 | for (const auto& region : region_manager_.regions()) { |
1220 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1221 | while (h != kInvalidChunkHandle) { |
1222 | const Chunk* c = ChunkFromHandle(h); |
1223 | BinNum bin_num = BinNumForSize(c->size); |
1224 | BinDebugInfo& bin_info = bin_infos[bin_num]; |
1225 | bin_info.total_bytes_in_bin += c->size; |
1226 | bin_info.total_chunks_in_bin++; |
1227 | if (c->in_use()) { |
1228 | bin_info.total_bytes_in_use += c->size; |
1229 | bin_info.total_requested_bytes_in_use += c->requested_size; |
1230 | bin_info.total_chunks_in_use++; |
1231 | } else { |
1232 | Bin* bin = BinFromIndex(bin_num); |
1233 | CHECK_EQ(bin->free_chunks.count(h), 1); |
1234 | CHECK_EQ(c->bin_num, bin_num); |
1235 | } |
1236 | h = c->next; |
1237 | } |
1238 | } |
1239 | return bin_infos; |
1240 | } |
1241 | |
1242 | AllocatorMemoryType BFCAllocator::GetMemoryType() const { |
1243 | return sub_allocator_->GetMemoryType(); |
1244 | } |
1245 | |
1246 | } // namespace tensorflow |
1247 | |