1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/core/common_runtime/bfc_allocator.h" |
17 | |
18 | #include <algorithm> |
19 | #include <atomic> |
20 | #include <utility> |
21 | |
22 | #include "absl/strings/string_view.h" |
23 | #include "tensorflow/core/common_runtime/allocator_retry.h" |
24 | #include "tensorflow/core/lib/core/bits.h" |
25 | #include "tensorflow/core/lib/strings/numbers.h" |
26 | #include "tensorflow/core/lib/strings/str_util.h" |
27 | #include "tensorflow/core/lib/strings/strcat.h" |
28 | #include "tensorflow/core/platform/file_system.h" |
29 | #include "tensorflow/core/platform/logging.h" |
30 | #include "tensorflow/core/platform/mutex.h" |
31 | #ifdef TENSORFLOW_MEM_DEBUG |
32 | #include "tensorflow/core/platform/stacktrace.h" |
33 | #endif |
34 | #include "tensorflow/core/platform/types.h" |
35 | #include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h" |
36 | #include "tensorflow/core/profiler/lib/traceme.h" |
37 | #include "tensorflow/core/protobuf/bfc_memory_map.pb.h" |
38 | |
39 | namespace tensorflow { |
40 | |
41 | constexpr BFCAllocator::ChunkHandle BFCAllocator::kInvalidChunkHandle; |
42 | |
43 | BFCAllocator::BFCAllocator(std::unique_ptr<SubAllocator> sub_allocator, |
44 | size_t total_memory, const string& name, |
45 | const Options& opts) |
46 | : opts_(opts), |
47 | coalesce_regions_(sub_allocator->SupportsCoalescing()), |
48 | sub_allocator_(std::move(sub_allocator)), |
49 | name_(name), |
50 | free_chunks_list_(kInvalidChunkHandle), |
51 | next_allocation_id_(1) { |
52 | if (opts.allow_growth) { |
53 | // 2MiB smallest initial allocation, unless total memory available |
54 | // is less. |
55 | curr_region_allocation_bytes_ = |
56 | RoundedBytes(std::min(total_memory, size_t{2 << 20})); |
57 | } else { |
58 | curr_region_allocation_bytes_ = RoundedBytes(total_memory); |
59 | } |
60 | |
61 | // Allocate the requested amount of memory. |
62 | memory_limit_ = total_memory; |
63 | stats_.bytes_limit = static_cast<int64_t>(total_memory); |
64 | |
65 | // Create a bunch of bins of various good sizes. |
66 | |
// We create bins to cover all possible allocation sizes, from
// allocations of up to 256 bytes all the way up to (and including)
// the memory limit.
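// For example, with a 256-byte base bin size (implied by the +255 check
// below), bin b holds free chunks of sizes in [256 << b, 256 << (b + 1)):
// bin 0 covers [256, 512), bin 1 covers [512, 1024), and so on.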
70 | VLOG(1) << "Creating new BFCAllocator named: " << name; |
71 | for (BinNum b = 0; b < kNumBins; b++) { |
72 | size_t bin_size = BinNumToSize(b); |
73 | VLOG(1) << "Creating bin of max chunk size " |
74 | << strings::HumanReadableNumBytes(bin_size); |
75 | new (BinFromIndex(b)) Bin(this, bin_size); |
76 | CHECK_EQ(BinForSize(bin_size), BinFromIndex(b)); |
77 | CHECK_EQ(BinForSize(bin_size + 255), BinFromIndex(b)); |
78 | CHECK_EQ(BinForSize(bin_size * 2 - 1), BinFromIndex(b)); |
79 | if (b + 1 < kNumBins) { |
80 | CHECK_NE(BinForSize(bin_size * 2), BinFromIndex(b)); |
81 | } |
82 | } |
83 | } |
84 | |
85 | BFCAllocator::~BFCAllocator() { |
// Return all memory to the sub-allocator.
87 | VLOG(2) << "Number of regions allocated: " |
88 | << region_manager_.regions().size(); |
89 | for (const auto& region : region_manager_.regions()) { |
90 | sub_allocator_->Free(region.ptr(), region.memory_size()); |
91 | } |
92 | |
93 | for (BinNum b = 0; b < kNumBins; b++) { |
94 | BinFromIndex(b)->~Bin(); |
95 | } |
96 | } |
97 | |
98 | BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) { |
99 | DCHECK_GE(h, 0); |
100 | DCHECK_LT(h, static_cast<int>(chunks_.size())); |
101 | return &(chunks_[h]); |
102 | } |
103 | |
104 | const BFCAllocator::Chunk* BFCAllocator::ChunkFromHandle(ChunkHandle h) const { |
105 | DCHECK_GE(h, 0); |
106 | DCHECK_LT(h, static_cast<int>(chunks_.size())); |
107 | return &(chunks_[h]); |
108 | } |
109 | |
110 | bool BFCAllocator::Extend(size_t alignment, size_t rounded_bytes) { |
111 | size_t available_bytes = memory_limit_ - total_region_allocated_bytes_; |
112 | // Rounds available_bytes down to the nearest multiple of kMinAllocationSize. |
113 | available_bytes = (available_bytes / kMinAllocationSize) * kMinAllocationSize; |
114 | |
115 | // Do we have enough space to handle the client's request? |
116 | // If not, fail immediately. |
117 | if (rounded_bytes > available_bytes) { |
118 | return false; |
119 | } |
120 | |
121 | // If curr_region_allocation_bytes_ is not enough to satisfy the |
122 | // allocation, keep multiplying by a power of two until that is |
123 | // sufficient. |
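// (e.g., a 2MiB region grows to 4MiB, then 8MiB, and so on until it is
// large enough to hold rounded_bytes.)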
124 | bool increased_allocation = false; |
125 | while (rounded_bytes > curr_region_allocation_bytes_) { |
126 | curr_region_allocation_bytes_ *= 2; |
127 | increased_allocation = true; |
128 | } |
129 | |
130 | // Try allocating. |
131 | size_t bytes = std::min(curr_region_allocation_bytes_, available_bytes); |
132 | size_t bytes_received; |
133 | void* mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received); |
134 | if (mem_addr == nullptr && !started_backpedal_) { |
135 | // Only backpedal once. |
136 | started_backpedal_ = true; |
137 | |
138 | static constexpr float kBackpedalFactor = 0.9; |
139 | |
140 | // Try allocating less memory. |
141 | while (mem_addr == nullptr) { |
142 | bytes = RoundedBytes(bytes * kBackpedalFactor); |
143 | if (bytes < rounded_bytes) break; |
144 | mem_addr = sub_allocator_->Alloc(alignment, bytes, &bytes_received); |
145 | } |
146 | } |
147 | |
148 | if (mem_addr == nullptr) { |
149 | return false; |
150 | } |
151 | |
152 | if (!increased_allocation) { |
153 | // Increase the region size of the next required allocation. |
154 | curr_region_allocation_bytes_ *= 2; |
155 | } |
156 | |
157 | VLOG(1) << "Extending allocation by " |
158 | << strings::HumanReadableNumBytes(bytes_received) << " bytes for " |
<< Name() << ".";
160 | |
161 | total_region_allocated_bytes_ += bytes_received; |
162 | VLOG(1) << "Total allocated bytes: " |
163 | << strings::HumanReadableNumBytes(total_region_allocated_bytes_); |
164 | |
165 | VLOG(1) << "Allocated memory at " << mem_addr << " to " |
166 | << static_cast<void*>(static_cast<char*>(mem_addr) + bytes_received); |
167 | |
168 | AllocationRegion* maybe_extended_region = nullptr; |
169 | if (coalesce_regions_) { |
170 | maybe_extended_region = |
171 | region_manager_.AddOrExtendAllocationRegion(mem_addr, bytes_received); |
172 | } else { |
173 | region_manager_.AddAllocationRegion(mem_addr, bytes_received); |
174 | } |
175 | |
176 | // Create one large chunk for the whole memory space that will |
177 | // be chunked later. |
178 | ChunkHandle h = AllocateChunk(); |
179 | BFCAllocator::Chunk* c = ChunkFromHandle(h); |
180 | c->ptr = mem_addr; |
181 | c->size = bytes_received; |
182 | c->allocation_id = -1; |
183 | c->prev = kInvalidChunkHandle; |
184 | c->next = kInvalidChunkHandle; |
185 | c->freed_at_count = 0; |
186 | |
187 | region_manager_.set_handle(c->ptr, h); |
188 | |
189 | // If the region was extended, then there exists a previous chunk that should |
190 | // be linked to the new chunk. |
191 | if (maybe_extended_region != nullptr) { |
192 | ChunkHandle prev = |
193 | maybe_extended_region->get_handle(maybe_extended_region->ptr()); |
194 | BFCAllocator::Chunk* prev_chunk = ChunkFromHandle(prev); |
195 | // Find the last recorded chunk in the extended region. |
196 | while (prev_chunk->next != kInvalidChunkHandle) { |
197 | prev = prev_chunk->next; |
198 | prev_chunk = ChunkFromHandle(prev); |
199 | } |
200 | c->prev = prev; |
201 | prev_chunk->next = h; |
202 | } |
203 | |
204 | // Maybe merge adjacent chunks and insert the chunk into the right bin. |
205 | InsertFreeChunkIntoBin(TryToCoalesce(h, /*ignore_freed_at=*/false)); |
206 | |
207 | return true; |
208 | } |
209 | |
210 | BFCAllocator::ChunkHandle BFCAllocator::AllocateChunk() { |
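// Chunk metadata lives in the chunks_ vector and handles are indices into
// it. Reuse a handle from the free list when one is available; otherwise
// grow the vector by one slot.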
211 | if (free_chunks_list_ != kInvalidChunkHandle) { |
212 | ChunkHandle h = free_chunks_list_; |
213 | Chunk* c = ChunkFromHandle(h); |
214 | free_chunks_list_ = c->next; |
215 | return h; |
216 | } else { |
217 | ChunkHandle h = chunks_.size(); |
218 | chunks_.resize(h + 1); |
219 | return h; |
220 | } |
221 | } |
222 | |
223 | void BFCAllocator::DeallocateChunk(ChunkHandle h) { |
224 | Chunk* c = ChunkFromHandle(h); |
225 | c->allocation_id = -1; |
226 | c->bin_num = kInvalidBinNum; |
227 | c->next = free_chunks_list_; |
228 | free_chunks_list_ = h; |
229 | } |
230 | |
231 | void* BFCAllocator::AllocateRawInternalWithRetry( |
232 | size_t unused_alignment, size_t num_bytes, |
233 | const AllocationAttributes& allocation_attr) { |
// Fast path: Try once to allocate without getting the retry_helper_
// involved.
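// freed_by_func, when provided, yields a deallocation timestamp; only
// chunks freed at or before that count may satisfy this request (see the
// freed_before check in FindChunkPtr).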
235 | uint64 freed_by_count = 0; |
236 | if (allocation_attr.freed_by_func != nullptr) { |
237 | freed_by_count = (*allocation_attr.freed_by_func)(); |
238 | } |
239 | void* r = |
240 | AllocateRawInternal(unused_alignment, num_bytes, false, freed_by_count); |
241 | if (r != nullptr) { |
242 | return r; |
243 | } else { |
244 | static const int64_t kMaxMillisToWait = 10000; // 10 seconds |
245 | r = retry_helper_.AllocateRaw( |
246 | [this, &allocation_attr](size_t a, size_t nb, bool v) { |
247 | uint64 freed_by_count = 0; |
248 | if (allocation_attr.freed_by_func != nullptr) { |
249 | freed_by_count = (*allocation_attr.freed_by_func)(); |
250 | } |
251 | return AllocateRawInternal(a, nb, v, freed_by_count); |
252 | }, |
253 | kMaxMillisToWait, unused_alignment, num_bytes); |
254 | return r; |
255 | } |
256 | } |
257 | |
258 | void* BFCAllocator::AllocateRaw(size_t unused_alignment, size_t num_bytes, |
259 | const AllocationAttributes& allocation_attr) { |
260 | VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes; |
261 | void* result = [&] { |
262 | if (!opts_.allow_retry_on_failure || !allocation_attr.retry_on_failure) { |
263 | // If we have globally disabled retry-on-failure and fail to allocate an |
264 | // "important" alloc, we want to print a log, because the program may be |
265 | // about to fail due to OOM. |
266 | // |
267 | // Bit of a hack: We deem "important" allocs as those which are retryable. |
268 | // In TF, *non*-retryable allocations are usually those which we can |
269 | // tolerate failing. For example, we allocate convolution scratch memory |
270 | // as non-retryable; if it fails, we'll just use a fallback algorithm that |
271 | // uses no scratch. |
272 | static std::atomic<int32> log_counter{0}; |
273 | constexpr int kMaxFailureLogs = 10; |
274 | bool dump_log_on_failure = |
275 | (/*retry is globally disabled*/ !opts_.allow_retry_on_failure && |
276 | /*alloc is "important"*/ allocation_attr.retry_on_failure && |
277 | log_counter.load(std::memory_order_relaxed) < kMaxFailureLogs) || |
278 | VLOG_IS_ON(2); |
279 | |
280 | uint64 freed_by_count = 0; |
281 | if (allocation_attr.freed_by_func != nullptr) { |
282 | freed_by_count = (*allocation_attr.freed_by_func)(); |
283 | } |
284 | void* res = AllocateRawInternal(unused_alignment, num_bytes, |
285 | dump_log_on_failure, freed_by_count); |
286 | if (res == nullptr) { |
287 | int32 counter_value = log_counter.load(std::memory_order_relaxed); |
288 | if (counter_value < kMaxFailureLogs) { |
289 | log_counter.store(counter_value + 1, std::memory_order_relaxed); |
290 | LOG(WARNING) |
291 | << "Allocator (" << Name() << ") ran out of memory trying " |
292 | << "to allocate " << strings::HumanReadableNumBytes(num_bytes) |
293 | << " with freed_by_count=" << freed_by_count << "." |
294 | << (!allocation_attr.retry_on_failure |
295 | ? " The caller indicates that this is not a failure, but" |
296 | " this may mean that there could be performance gains " |
297 | "if more memory were available." |
: "");
299 | } |
300 | } |
301 | return res; |
302 | } else { |
303 | return AllocateRawInternalWithRetry(unused_alignment, num_bytes, |
304 | allocation_attr); |
305 | } |
306 | }(); |
307 | VLOG(3) << "AllocateRaw " << Name() << " " << num_bytes << " " << result; |
308 | return result; |
309 | } |
310 | |
311 | // static |
312 | size_t BFCAllocator::RoundedBytes(size_t bytes) { |
313 | size_t rounded_bytes = |
314 | (kMinAllocationSize * |
315 | ((bytes + kMinAllocationSize - 1) / kMinAllocationSize)); |
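// e.g., assuming kMinAllocationSize == 256 (consistent with the +255 bin
// checks in the constructor), a request for 1000 bytes is rounded up to
// ((1000 + 255) / 256) * 256 == 1024 bytes.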
316 | DCHECK_EQ(size_t{0}, rounded_bytes % kMinAllocationSize); |
317 | return rounded_bytes; |
318 | } |
319 | |
320 | bool BFCAllocator::DeallocateFreeRegions(size_t rounded_bytes) |
321 | TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) { |
322 | // Do nothing if garbage collection is off. |
323 | if (!opts_.garbage_collection) { |
324 | return false; |
325 | } |
326 | |
// Search for free regions.
328 | absl::flat_hash_set<void*> free_region_ptrs; |
329 | size_t total_free_bytes = 0; |
330 | for (const AllocationRegion& region : region_manager_.regions()) { |
331 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
332 | bool any_use = false; |
333 | while (h != kInvalidChunkHandle) { |
334 | const Chunk* c = ChunkFromHandle(h); |
335 | if (c->in_use()) { |
336 | any_use = true; |
337 | break; |
338 | } |
339 | h = c->next; |
340 | } |
341 | |
342 | if (!any_use) { |
343 | VLOG(2) << "Found free region with ptr = " << region.ptr(); |
344 | free_region_ptrs.insert(region.ptr()); |
345 | total_free_bytes += region.memory_size(); |
346 | } |
347 | } |
348 | |
349 | if (total_free_bytes == 0) { |
350 | return false; |
351 | } |
352 | |
353 | // Rough estimation to check whether deallocation can help. |
354 | size_t available_bytes = |
355 | memory_limit_ - total_region_allocated_bytes_ + total_free_bytes; |
356 | if (rounded_bytes > available_bytes) { |
357 | return false; |
358 | } |
359 | |
LOG(WARNING) << "Garbage collection: deallocate free memory regions"
<< " (i.e., allocations) so that we can re-allocate a larger"
<< " region to avoid OOM due to memory fragmentation. If you"
<< " see this message frequently, you are running near the"
<< " threshold of the available device memory and re-allocation"
<< " may incur significant performance overhead. You may try"
<< " smaller batch sizes to observe the performance impact."
<< " Set TF_ENABLE_GPU_GARBAGE_COLLECTION=false if you'd like to"
<< " disable this feature.";
369 | |
370 | // Deallocate free regions. |
371 | DeallocateRegions(free_region_ptrs); |
372 | |
373 | return true; |
374 | } |
375 | |
376 | void BFCAllocator::DeallocateRegions( |
377 | const absl::flat_hash_set<void*>& region_ptrs) |
378 | TF_EXCLUSIVE_LOCKS_REQUIRED(lock_) { |
379 | // Explicitly remove the const qualifier as some compilers disallow passing |
380 | // const_iterator to std::vector::erase(), which is used in |
381 | // RemoveAllocationRegion(). |
382 | auto regions = |
383 | const_cast<std::vector<AllocationRegion>*>(®ion_manager_.regions()); |
384 | auto it = regions->begin(); |
385 | while (it != regions->end()) { |
386 | if (!region_ptrs.contains(it->ptr())) { |
387 | ++it; |
388 | continue; |
389 | } |
390 | |
391 | VLOG(2) << "Deallocate region with ptr = " << it->ptr(); |
392 | // Remove all chunk registrations from Bins. |
393 | ChunkHandle h = region_manager_.get_handle(it->ptr()); |
394 | while (h != kInvalidChunkHandle) { |
395 | const Chunk* c = ChunkFromHandle(h); |
396 | if (c->bin_num != kInvalidBinNum) { |
397 | RemoveFreeChunkFromBin(h); |
398 | } |
399 | auto h_to_delete = h; |
400 | h = c->next; |
401 | DeleteChunk(h_to_delete); |
402 | } |
403 | |
404 | // Deallocate the memory. |
405 | sub_allocator_->Free(it->ptr(), it->memory_size()); |
406 | total_region_allocated_bytes_ -= it->memory_size(); |
407 | it = region_manager_.RemoveAllocationRegion(it); |
408 | } |
409 | } |
410 | |
411 | void* BFCAllocator::AllocateRawInternal(size_t unused_alignment, |
412 | size_t num_bytes, |
413 | bool dump_log_on_failure, |
414 | uint64 freed_before) { |
415 | if (num_bytes == 0) { |
VLOG(2) << "tried to allocate 0 bytes";
417 | return nullptr; |
418 | } |
419 | // First, always allocate memory of at least kMinAllocationSize |
420 | // bytes, and always allocate multiples of kMinAllocationSize bytes |
421 | // so all memory addresses are nicely byte aligned. |
422 | size_t rounded_bytes = RoundedBytes(num_bytes); |
423 | |
424 | // The BFC allocator tries to find the best fit first. |
425 | BinNum bin_num = BinNumForSize(rounded_bytes); |
426 | |
427 | mutex_lock l(lock_); |
428 | if (!timestamped_chunks_.empty()) { |
429 | // Merge timestamped chunks whose counts have become safe for general use. |
430 | MergeTimestampedChunks(0); |
431 | } |
432 | void* ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
433 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
435 | return ptr; |
436 | } |
437 | |
438 | // Try to extend |
439 | if (Extend(unused_alignment, rounded_bytes)) { |
440 | ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
441 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
443 | return ptr; |
444 | } |
445 | } |
446 | |
447 | if ((freed_before == 0) && (!timestamped_chunks_.empty())) { |
448 | // We're unable to satisfy an allocation request without a specific |
449 | // timestamp requirement. Rather than fail, try merging any held-out |
450 | // timestamped chunks more aggressively until a free chunk of the necessary |
451 | // size is formed. |
452 | if (MergeTimestampedChunks(rounded_bytes)) { |
453 | ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
454 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
456 | return ptr; |
457 | } |
458 | } |
459 | } |
460 | |
// Reaching this point means that no chunks can satisfy the request. Also,
// the unallocated bytes cannot satisfy the request. Before giving up, let's
// try deallocating free regions so that the sub-allocator can combine them
// with the unallocated bytes and form a larger region.
465 | if (DeallocateFreeRegions(rounded_bytes) && |
466 | Extend(unused_alignment, rounded_bytes)) { |
467 | ptr = FindChunkPtr(bin_num, rounded_bytes, num_bytes, freed_before); |
468 | if (ptr != nullptr) { |
AddTraceMe("MemoryAllocation", ptr);
470 | return ptr; |
471 | } |
472 | } |
473 | |
// We searched all bins for an existing free chunk to use and
// couldn't find one. This means we must have run out of memory.
// Dump the memory log for analysis.
477 | MaybeWriteMemoryMap(); |
478 | if (dump_log_on_failure) { |
LOG(WARNING)
<< "Allocator (" << Name() << ") ran out of memory trying "
<< "to allocate " << strings::HumanReadableNumBytes(num_bytes)
<< " (rounded to " << rounded_bytes << ")"
<< " requested by op "
<< profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation()
.pending_op_name
<< "\nIf the cause is memory fragmentation maybe the environment "
<< "variable 'TF_GPU_ALLOCATOR=cuda_malloc_async' will "
<< "improve the situation.\nCurrent allocation summary follows.";
490 | DumpMemoryLog(rounded_bytes); |
491 | LOG(WARNING) << RenderOccupancy(); |
492 | } |
493 | return nullptr; |
494 | } |
495 | |
496 | int64_t BFCAllocator::LargestFreeChunk() { |
497 | for (int i = kNumBins - 1; i >= 0; i--) { |
498 | if (!BinFromIndex(i)->free_chunks.empty()) { |
499 | return ChunkFromHandle(*BinFromIndex(i)->free_chunks.rbegin())->size; |
500 | } |
501 | } |
502 | return 0; |
503 | } |
504 | |
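// Fragmentation is measured here as the fraction of free memory that is
// not in the single largest free chunk, i.e.
// 1.0 - LargestFreeChunk() / bytes_available; 0 means all free memory is
// contiguous.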
505 | double BFCAllocator::GetFragmentation() { |
506 | int64_t bytes_available = total_region_allocated_bytes_ - stats_.bytes_in_use; |
507 | DCHECK_GT(bytes_available, 0); |
508 | return static_cast<double>(bytes_available - LargestFreeChunk()) / |
509 | bytes_available; |
510 | } |
511 | |
512 | void BFCAllocator::AddTraceMe(absl::string_view traceme_name, const void* ptr) { |
513 | BFCAllocator::Chunk* chunk = ChunkFromHandle(region_manager_.get_handle(ptr)); |
514 | AddTraceMe(traceme_name, chunk->ptr, chunk->requested_size, chunk->size); |
515 | } |
516 | |
517 | void BFCAllocator::AddTraceMe(absl::string_view traceme_name, |
518 | const void* chunk_ptr, int64_t req_bytes, |
519 | int64_t alloc_bytes) { |
520 | tensorflow::profiler::TraceMe::InstantActivity( |
521 | [this, traceme_name, chunk_ptr, req_bytes, alloc_bytes]() |
522 | TF_NO_THREAD_SAFETY_ANALYSIS { |
523 | int64_t bytes_available = |
524 | memory_limit_ - stats_.bytes_reserved - stats_.bytes_in_use; |
525 | const auto& annotation = |
526 | profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation(); |
const auto op_name = annotation.pending_op_name
? annotation.pending_op_name
: "(null)";
const auto region_type = annotation.pending_region_type
? annotation.pending_region_type
: "(null)";
533 | return tensorflow::profiler::TraceMeEncode( |
traceme_name, {{"allocator_name", name_},
{"bytes_reserved", stats_.bytes_reserved},
{"bytes_allocated", stats_.bytes_in_use},
{"bytes_available", bytes_available},
{"fragmentation", GetFragmentation()},
{"peak_bytes_in_use", stats_.peak_bytes_in_use},
{"requested_bytes", req_bytes},
{"allocation_bytes", alloc_bytes},
{"addr", reinterpret_cast<uint64>(chunk_ptr)},
{"tf_op", op_name},
{"id", annotation.pending_step_id},
{"region_type", region_type},
{"data_type", annotation.pending_data_type},
{"shape", annotation.pending_shape_func()}});
548 | }, |
549 | /*level=*/profiler::TraceMeLevel::kInfo); |
550 | } |
551 | |
552 | void* BFCAllocator::FindChunkPtr(BinNum bin_num, size_t rounded_bytes, |
553 | size_t num_bytes, uint64 freed_before) { |
554 | // First identify the first bin that could satisfy rounded_bytes. |
555 | for (; bin_num < kNumBins; bin_num++) { |
556 | // Start searching from the first bin for the smallest chunk that fits |
557 | // rounded_bytes. |
558 | Bin* b = BinFromIndex(bin_num); |
559 | for (auto citer = b->free_chunks.begin(); citer != b->free_chunks.end(); |
560 | ++citer) { |
561 | const BFCAllocator::ChunkHandle h = (*citer); |
562 | BFCAllocator::Chunk* chunk = ChunkFromHandle(h); |
563 | DCHECK(!chunk->in_use()); |
564 | if (freed_before > 0 && freed_before < chunk->freed_at_count) { |
565 | continue; |
566 | } |
567 | if (chunk->size >= rounded_bytes) { |
// We found an existing chunk that fits and wasn't in use, so remove
// it from the free bin structure prior to using it.
570 | RemoveFreeChunkIterFromBin(&b->free_chunks, citer); |
571 | |
// If we can break the size of the chunk into two reasonably large
// pieces, do so. In any case don't waste more than
// max_internal_fragmentation_bytes on padding. If this threshold is
// not set by the user, then use 128MB as the default.
576 | const int64_t max_internal_fragmentation_bytes = |
577 | (opts_.fragmentation_fraction > 0.0) |
578 | ? opts_.fragmentation_fraction * memory_limit_ |
579 | : 128 << 20; |
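// e.g., with fragmentation_fraction == 0.01 and a 16GiB memory limit this
// threshold is roughly 164MiB; with the option unset it is 128MiB
// (128 << 20 bytes).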
580 | |
581 | if (chunk->size >= rounded_bytes * 2 || |
582 | static_cast<int64_t>(chunk->size) - rounded_bytes >= |
583 | max_internal_fragmentation_bytes) { |
584 | SplitChunk(h, rounded_bytes); |
585 | chunk = ChunkFromHandle(h); // Update chunk pointer in case it moved |
586 | } |
587 | |
588 | // The requested size of the returned chunk is what the user |
589 | // has allocated. |
590 | chunk->requested_size = num_bytes; |
591 | // Assign a unique id and increment the id counter, marking the |
592 | // chunk as being in use. |
593 | chunk->allocation_id = next_allocation_id_++; |
594 | |
595 | // Update stats. |
596 | ++stats_.num_allocs; |
597 | stats_.bytes_in_use += chunk->size; |
598 | if (stats_.bytes_in_use > stats_.peak_bytes_in_use) { |
599 | VLOG(2) << "New Peak memory usage of " << stats_.bytes_in_use |
600 | << " bytes for " << Name(); |
601 | } |
602 | stats_.peak_bytes_in_use = |
603 | std::max(stats_.peak_bytes_in_use, stats_.bytes_in_use); |
604 | stats_.largest_alloc_size = |
605 | std::max<std::size_t>(stats_.largest_alloc_size, chunk->size); |
606 | |
607 | #ifdef TENSORFLOW_MEM_DEBUG |
608 | if (ShouldRecordOpName()) { |
609 | const auto& annotation = |
610 | profiler::ScopedMemoryDebugAnnotation::CurrentAnnotation(); |
611 | if (annotation.pending_op_name != nullptr) { |
612 | chunk->op_name = annotation.pending_op_name; |
613 | } else { |
614 | LOG(INFO) << "missing pending_op_name for " << Name() |
615 | << " reading addr " |
616 | << static_cast<const void*>(&annotation.pending_op_name) |
617 | << "\n" |
618 | << CurrentStackTrace(); |
619 | chunk->op_name = nullptr; |
620 | } |
621 | chunk->action_count = ++action_counter_; |
622 | chunk->step_id = annotation.pending_step_id; |
623 | int slot = chunk->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; |
624 | size_history_[slot] = stats_.bytes_in_use; |
625 | } |
626 | #endif |
627 | |
628 | VLOG(4) << "Returning: " << chunk->ptr; |
629 | if (VLOG_IS_ON(4)) { |
630 | LOG(INFO) << "A: " << RenderOccupancy(); |
631 | } |
632 | return chunk->ptr; |
633 | } |
634 | } |
635 | } |
636 | |
637 | return nullptr; |
638 | } |
639 | |
640 | void BFCAllocator::SplitChunk(BFCAllocator::ChunkHandle h, size_t num_bytes) { |
641 | // Allocate the new chunk before we do any ChunkFromHandle |
642 | ChunkHandle h_new_chunk = AllocateChunk(); |
643 | |
644 | Chunk* c = ChunkFromHandle(h); |
645 | CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); |
646 | |
647 | // Create a new chunk starting num_bytes after c |
648 | BFCAllocator::Chunk* new_chunk = ChunkFromHandle(h_new_chunk); |
649 | new_chunk->ptr = static_cast<void*>(static_cast<char*>(c->ptr) + num_bytes); |
650 | region_manager_.set_handle(new_chunk->ptr, h_new_chunk); |
651 | |
652 | // Set the new sizes of the chunks. |
653 | new_chunk->size = c->size - num_bytes; |
654 | c->size = num_bytes; |
655 | |
656 | // The new chunk is not in use. |
657 | new_chunk->allocation_id = -1; |
658 | |
659 | // It inherits the freed time. |
660 | new_chunk->freed_at_count = c->freed_at_count; |
661 | |
662 | // Maintain the pointers. |
663 | // c <-> c_neighbor becomes |
664 | // c <-> new_chunk <-> c_neighbor |
665 | BFCAllocator::ChunkHandle h_neighbor = c->next; |
666 | new_chunk->prev = h; |
667 | new_chunk->next = h_neighbor; |
668 | c->next = h_new_chunk; |
669 | if (h_neighbor != kInvalidChunkHandle) { |
670 | Chunk* c_neighbor = ChunkFromHandle(h_neighbor); |
671 | c_neighbor->prev = h_new_chunk; |
672 | } |
673 | |
674 | // Add the newly free chunk to the free bin. |
675 | InsertFreeChunkIntoBin(h_new_chunk); |
676 | } |
677 | |
678 | void BFCAllocator::DeallocateRaw(void* ptr) { |
679 | VLOG(3) << "DeallocateRaw " << Name() << " " |
680 | << (ptr ? RequestedSize(ptr) : 0); |
681 | DeallocateRawInternal(ptr); |
682 | retry_helper_.NotifyDealloc(); |
683 | } |
684 | |
685 | void BFCAllocator::DeallocateRawInternal(void* ptr) { |
686 | if (ptr == nullptr) { |
VLOG(2) << "tried to deallocate nullptr";
688 | return; |
689 | } |
690 | mutex_lock l(lock_); |
691 | |
692 | // Find the chunk from the ptr. |
693 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
694 | CHECK(h != kInvalidChunkHandle); |
695 | // Record chunk information before it's freed. |
696 | Chunk* chunk = ChunkFromHandle(h); |
697 | void* chunk_ptr = chunk->ptr; |
698 | int64_t req_bytes = chunk->requested_size; |
699 | int64_t alloc_bytes = chunk->size; |
700 | |
701 | MarkFree(h); |
702 | |
703 | // Consider coalescing it. |
704 | if (timing_counter_) { |
705 | InsertFreeChunkIntoBin(h); |
706 | timestamped_chunks_.push_back(h); |
707 | } else { |
708 | InsertFreeChunkIntoBin(TryToCoalesce(h, false)); |
709 | } |
710 | |
711 | // TraceMe needs to be added after MarkFree and InsertFreeChunkIntoBin for |
712 | // correct aggregation stats (bytes_in_use, fragmentation). |
AddTraceMe("MemoryDeallocation", chunk_ptr, req_bytes, alloc_bytes);
714 | |
715 | if (VLOG_IS_ON(4)) { |
716 | LOG(INFO) << "F: " << RenderOccupancy(); |
717 | } |
718 | } |
719 | |
// Merges h1 and h2 when Chunk(h1)->next is h2 and Chunk(h2)->prev is h1.
// We merge Chunk(h2) into Chunk(h1).
722 | void BFCAllocator::Merge(BFCAllocator::ChunkHandle h1, |
723 | BFCAllocator::ChunkHandle h2) { |
724 | Chunk* c1 = ChunkFromHandle(h1); |
725 | Chunk* c2 = ChunkFromHandle(h2); |
726 | // We can only merge chunks that are not in use. |
727 | CHECK(!c1->in_use() && !c2->in_use()); |
728 | |
729 | // c1's prev doesn't change, still points to the same ptr, and is |
730 | // still not in use. |
731 | |
732 | // Fix up neighbor pointers |
733 | // |
734 | // c1 <-> c2 <-> c3 should become |
735 | // c1 <-> c3 |
736 | |
737 | BFCAllocator::ChunkHandle h3 = c2->next; |
738 | c1->next = h3; |
739 | CHECK(c2->prev == h1); |
740 | if (h3 != kInvalidChunkHandle) { |
741 | BFCAllocator::Chunk* c3 = ChunkFromHandle(h3); |
742 | c3->prev = h1; |
743 | } |
744 | |
745 | // Set the new size |
746 | c1->size += c2->size; |
747 | |
748 | // Pick latest free time. |
749 | c1->freed_at_count = std::max(c1->freed_at_count, c2->freed_at_count); |
750 | |
751 | DeleteChunk(h2); |
752 | } |
753 | |
754 | void BFCAllocator::DeleteChunk(ChunkHandle h) { |
755 | // Delete h and cleanup all state |
756 | Chunk* c = ChunkFromHandle(h); |
757 | // VLOG(4) << "Removing: " << c->ptr; |
758 | region_manager_.erase(c->ptr); |
759 | DeallocateChunk(h); |
760 | } |
761 | |
762 | void BFCAllocator::InsertFreeChunkIntoBin(BFCAllocator::ChunkHandle h) { |
763 | Chunk* c = ChunkFromHandle(h); |
764 | CHECK(!c->in_use() && (c->bin_num == kInvalidBinNum)); |
765 | BinNum bin_num = BinNumForSize(c->size); |
766 | Bin* new_bin = BinFromIndex(bin_num); |
767 | c->bin_num = bin_num; |
768 | new_bin->free_chunks.insert(h); |
769 | } |
770 | |
771 | void BFCAllocator::RemoveFreeChunkIterFromBin( |
772 | BFCAllocator::Bin::FreeChunkSet* free_chunks, |
773 | const BFCAllocator::Bin::FreeChunkSet::iterator& citer) { |
774 | ChunkHandle h = *citer; |
775 | Chunk* c = ChunkFromHandle(h); |
776 | CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); |
777 | free_chunks->erase(citer); |
778 | c->bin_num = kInvalidBinNum; |
779 | } |
780 | |
781 | void BFCAllocator::RemoveFreeChunkFromBin(BFCAllocator::ChunkHandle h) { |
782 | Chunk* c = ChunkFromHandle(h); |
783 | CHECK(!c->in_use() && (c->bin_num != kInvalidBinNum)); |
784 | CHECK_GT(BinFromIndex(c->bin_num)->free_chunks.erase(h), 0) |
<< "Could not find chunk in bin";
786 | c->bin_num = kInvalidBinNum; |
787 | } |
788 | |
789 | void BFCAllocator::MarkFree(BFCAllocator::ChunkHandle h) { |
790 | Chunk* c = ChunkFromHandle(h); |
791 | CHECK(c->in_use() && (c->bin_num == kInvalidBinNum)); |
792 | |
793 | // Mark the chunk as no longer in use. |
794 | c->allocation_id = -1; |
795 | |
796 | // Optionally record the free time. |
797 | if (timing_counter_) { |
798 | c->freed_at_count = timing_counter_->next(); |
799 | } |
800 | |
// Update the stats.
802 | stats_.bytes_in_use -= c->size; |
803 | |
804 | #ifdef TENSORFLOW_MEM_DEBUG |
805 | if (ShouldRecordOpName()) { |
806 | c->action_count = ++action_counter_; |
807 | int slot = c->action_count % MEM_DEBUG_SIZE_HISTORY_SIZE; |
808 | size_history_[slot] = stats_.bytes_in_use; |
809 | } |
810 | #endif |
811 | } |
812 | |
813 | BFCAllocator::ChunkHandle BFCAllocator::TryToCoalesce(ChunkHandle h, |
814 | bool ignore_freed_at) { |
815 | Chunk* c = ChunkFromHandle(h); |
816 | if ((!ignore_freed_at) && c->freed_at_count > 0) return h; |
817 | ChunkHandle coalesced_chunk = h; |
818 | |
819 | // If the next chunk is free, merge it into c and delete it. |
820 | if (c->next != kInvalidChunkHandle && !ChunkFromHandle(c->next)->in_use()) { |
821 | Chunk* n = ChunkFromHandle(c->next); |
822 | if ((n->freed_at_count == 0) || ignore_freed_at) { |
823 | VLOG(4) << "Merging c->next " << n->ptr << " with c " << c->ptr; |
824 | RemoveFreeChunkFromBin(c->next); |
825 | Merge(h, c->next); |
826 | } |
827 | } |
828 | |
829 | // If the previous chunk is free, merge c into it and delete c. |
830 | if (c->prev != kInvalidChunkHandle && !ChunkFromHandle(c->prev)->in_use()) { |
831 | Chunk* n = ChunkFromHandle(c->prev); |
832 | if ((n->freed_at_count == 0) || ignore_freed_at) { |
833 | VLOG(4) << "Merging c " << c->ptr << " into c->prev " << n->ptr; |
834 | coalesced_chunk = c->prev; |
835 | RemoveFreeChunkFromBin(c->prev); |
836 | Merge(c->prev, h); |
837 | } |
838 | } |
839 | |
840 | return coalesced_chunk; |
841 | } |
842 | |
843 | void BFCAllocator::SetSafeFrontier(uint64 count) { |
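// Monotonically advance safe_frontier_ with a CAS loop: a lost race simply
// reloads the current value and retries, and a successful advance may make
// timestamped chunks usable, so waiting allocations are notified.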
844 | uint64 current = safe_frontier_.load(std::memory_order_relaxed); |
845 | while (count > current) { |
846 | if (safe_frontier_.compare_exchange_strong(current, count)) { |
847 | retry_helper_.NotifyDealloc(); |
848 | return; |
849 | } else { |
850 | current = safe_frontier_.load(std::memory_order_relaxed); |
851 | } |
852 | } |
853 | } |
854 | |
855 | bool BFCAllocator::MergeTimestampedChunks(size_t required_bytes) { |
856 | VLOG(1) << "MergeTimestampedChunks queue_len=" << timestamped_chunks_.size() |
857 | << " required_bytes=" << required_bytes; |
858 | bool satisfied = (required_bytes == 0); |
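// First pass: partition timestamped_chunks_ into chunks that may be merged
// now (timestamp behind safe_frontier_, or any free chunk when
// required_bytes forces the issue) and chunks that must stay queued.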
859 | std::vector<void*> to_merge; |
860 | std::deque<ChunkHandle> new_ts_queue; |
861 | while (!timestamped_chunks_.empty()) { |
862 | ChunkHandle h = timestamped_chunks_.front(); |
863 | timestamped_chunks_.pop_front(); |
864 | DCHECK_NE(h, kInvalidChunkHandle); |
865 | Chunk* c = ChunkFromHandle(h); |
866 | // It's possible this chunk has already been merged so refetch and retest |
867 | // the handle. |
868 | h = region_manager_.get_handle(c->ptr); |
869 | if (h == kInvalidChunkHandle) { |
870 | continue; |
871 | } |
872 | if (c->in_use() || (c->bin_num == kInvalidBinNum)) { |
873 | // This chunk has already been reallocated. |
874 | continue; |
875 | } |
876 | if (c->freed_at_count == 0) { |
877 | to_merge.push_back(c->ptr); |
878 | continue; |
879 | } |
880 | // Chunk should be free and assigned to a bin. |
881 | DCHECK_NE(c->bin_num, kInvalidBinNum); |
882 | if (c->freed_at_count < safe_frontier_) { |
883 | c->freed_at_count = 0; |
884 | to_merge.push_back(c->ptr); |
885 | } else if (required_bytes > 0) { |
886 | to_merge.push_back(c->ptr); |
887 | } else { |
888 | new_ts_queue.push_back(h); |
889 | } |
890 | } |
891 | DCHECK(timestamped_chunks_.empty()); |
892 | std::swap(timestamped_chunks_, new_ts_queue); |
893 | |
894 | // At this point all candidate chunks have been moved from timestamped_chunks_ |
895 | // to to_merge. If this is a standard merge (required_bytes == 0) then |
896 | // merge them all, otherwise merge just until a Chunk of the required size |
897 | // is produced. |
898 | for (int ci = 0, end = to_merge.size(); ci < end; ++ci) { |
899 | void* ptr = to_merge[ci]; |
900 | // It's possible that the Chunk associated with this memory location got |
901 | // merged and deallocated in a prior iteration so refetch the handle and |
902 | // retest. |
903 | ChunkHandle h = region_manager_.get_handle(ptr); |
904 | if (h == kInvalidChunkHandle) continue; |
905 | if (required_bytes == 0 || !satisfied) { |
906 | Chunk* c = ChunkFromHandle(h); |
907 | DCHECK_NE(c->bin_num, kInvalidBinNum); |
908 | DCHECK(!c->in_use()); |
909 | RemoveFreeChunkFromBin(h); |
910 | ChunkHandle new_h = TryToCoalesce(h, (required_bytes > 0)); |
911 | InsertFreeChunkIntoBin(new_h); |
912 | if (required_bytes > 0) { |
913 | c = ChunkFromHandle(new_h); |
914 | if (new_h != h && c->freed_at_count > 0) { |
915 | timestamped_chunks_.push_back(new_h); |
916 | } |
917 | if (c->size >= required_bytes) { |
918 | satisfied = true; |
919 | } |
920 | } |
921 | } else { |
// We were force-merging Chunks with unsafe timestamps, but managed
// to create a satisfying Chunk so just requeue the rest.
924 | timestamped_chunks_.push_back(h); |
925 | } |
926 | } |
927 | return satisfied; |
928 | } |
929 | |
930 | bool BFCAllocator::TracksAllocationSizes() const { return true; } |
931 | |
932 | size_t BFCAllocator::RequestedSize(const void* ptr) const { |
933 | CHECK(ptr); |
934 | mutex_lock l(lock_); |
935 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
936 | CHECK(h != kInvalidChunkHandle) |
937 | << "Asked for requested size of pointer we never allocated: " << ptr; |
938 | const BFCAllocator::Chunk* c = ChunkFromHandle(h); |
939 | return c->requested_size; |
940 | } |
941 | |
942 | size_t BFCAllocator::AllocatedSize(const void* ptr) const { |
943 | mutex_lock l(lock_); |
944 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
945 | CHECK(h != kInvalidChunkHandle) |
946 | << "Asked for allocated size of pointer we never allocated: " << ptr; |
947 | const BFCAllocator::Chunk* c = ChunkFromHandle(h); |
948 | return c->size; |
949 | } |
950 | |
951 | int64_t BFCAllocator::AllocationId(const void* ptr) const { |
952 | mutex_lock l(lock_); |
953 | BFCAllocator::ChunkHandle h = region_manager_.get_handle(ptr); |
954 | CHECK(h != kInvalidChunkHandle) |
955 | << "Asked for allocation id of pointer we never allocated: " << ptr; |
956 | const BFCAllocator::Chunk* c = ChunkFromHandle(h); |
957 | return c->allocation_id; |
958 | } |
959 | |
960 | namespace { |
961 | |
962 | void RenderRegion(char* rendered, const size_t resolution, |
963 | const size_t total_render_size, const size_t offset, |
964 | const void* base_ptr, const void* ptr, const size_t size, |
965 | const char c) { |
966 | const char* base_ptr_c = static_cast<const char*>(base_ptr); |
967 | const char* ptr_c = static_cast<const char*>(ptr); |
968 | |
969 | size_t start_location = |
970 | ((ptr_c - base_ptr_c + offset) * resolution) / total_render_size; |
971 | CHECK_GE(start_location, 0); |
972 | CHECK_LT(start_location, resolution); |
973 | size_t end_location = |
974 | ((ptr_c + size - 1 - base_ptr_c + offset) * resolution) / |
975 | total_render_size; |
976 | CHECK_GE(end_location, 0); |
977 | CHECK_LT(end_location, resolution); |
978 | |
979 | for (size_t i = start_location; i <= end_location; ++i) { |
980 | rendered[i] = c; |
981 | } |
982 | } |
983 | |
984 | } // namespace |
985 | |
986 | string BFCAllocator::RenderOccupancy() { |
987 | // Make a buffer for the ASCII-art representation. |
988 | const size_t resolution = 100; |
989 | char rendered[resolution]; |
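// Legend: '_' marks free space, '*' marks client-requested bytes in use,
// and 'x' marks allocated-but-unrequested padding inside in-use chunks.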
990 | |
991 | // Compute the total region size to render over |
992 | size_t total_region_size = 0; |
993 | for (const auto& region : region_manager_.regions()) { |
994 | total_region_size += region.memory_size(); |
995 | } |
996 | |
997 | if (total_region_size == 0) { |
return "<allocator contains no memory>";
999 | } |
1000 | |
1001 | // Start out with everything empty |
1002 | RenderRegion(rendered, resolution, total_region_size, 0, nullptr, nullptr, |
1003 | total_region_size, '_'); |
1004 | |
1005 | size_t region_offset = 0; |
1006 | for (const auto& region : region_manager_.regions()) { |
1007 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1008 | // Then render each chunk left to right. |
1009 | while (h != kInvalidChunkHandle) { |
1010 | Chunk* c = ChunkFromHandle(h); |
1011 | if (c->in_use()) { |
1012 | // Render the wasted space |
1013 | size_t wasted = c->size - c->requested_size; |
1014 | if (wasted > 0) { |
1015 | RenderRegion(rendered, resolution, total_region_size, |
1016 | region_offset + c->requested_size, region.ptr(), c->ptr, |
1017 | wasted, 'x'); |
1018 | } |
1019 | // Then the occupied space |
1020 | RenderRegion(rendered, resolution, total_region_size, region_offset, |
1021 | region.ptr(), c->ptr, c->requested_size, '*'); |
1022 | } |
1023 | h = c->next; |
1024 | } |
1025 | region_offset += region.memory_size(); |
1026 | } |
1027 | |
1028 | return string(rendered, resolution); |
1029 | } |
1030 | |
1031 | void BFCAllocator::DumpMemoryLog(size_t num_bytes) { |
1032 | const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info(); |
1033 | LOG(INFO) << "BFCAllocator dump for " << Name(); |
1034 | for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { |
1035 | Bin* b = BinFromIndex(bin_num); |
1036 | const BinDebugInfo& bin_info = bin_infos[bin_num]; |
1037 | CHECK_EQ(b->free_chunks.size(), |
1038 | bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use); |
1039 | |
1040 | LOG(INFO) << "Bin (" << b->bin_size |
1041 | << "): \tTotal Chunks: " << bin_info.total_chunks_in_bin |
1042 | << ", Chunks in use: " << bin_info.total_chunks_in_use << ". " |
1043 | << strings::HumanReadableNumBytes(bin_info.total_bytes_in_bin) |
1044 | << " allocated for chunks. " |
1045 | << strings::HumanReadableNumBytes(bin_info.total_bytes_in_use) |
1046 | << " in use in bin. " |
1047 | << strings::HumanReadableNumBytes( |
1048 | bin_info.total_requested_bytes_in_use) |
<< " client-requested in use in bin.";
1050 | } |
1051 | |
1052 | // Find the bin that we would have liked to allocate in, so we |
1053 | // can get some further analysis about fragmentation. |
1054 | Bin* b = BinForSize(num_bytes); |
1055 | |
1056 | LOG(INFO) << "Bin for " << strings::HumanReadableNumBytes(num_bytes) |
1057 | << " was " << strings::HumanReadableNumBytes(b->bin_size) |
<< ", Chunk State: ";
1059 | |
1060 | for (ChunkHandle h : b->free_chunks) { |
1061 | Chunk* c = ChunkFromHandle(h); |
1062 | LOG(INFO) << c->DebugString(this, true); |
1063 | } |
1064 | |
1065 | // Next show the chunks that are in use, and also summarize their |
1066 | // number by size. |
1067 | std::map<size_t, int> in_use_by_size; |
1068 | for (const auto& region : region_manager_.regions()) { |
1069 | LOG(INFO) << "Next region of size " << region.memory_size(); |
1070 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1071 | while (h != kInvalidChunkHandle) { |
1072 | const Chunk* c = ChunkFromHandle(h); |
1073 | if (c->in_use()) { |
1074 | in_use_by_size[c->size]++; |
1075 | } |
string buf = strings::StrCat(
(c->in_use() ? "InUse" : "Free "), " at ",
strings::Hex(reinterpret_cast<uint64>(c->ptr)), " of size ", c->size);
1079 | #ifdef TENSORFLOW_MEM_DEBUG |
1080 | if (ShouldRecordOpName()) { |
strings::StrAppend(&buf, " by op ", c->op_name, " action_count ",
c->action_count, " step ", c->step_id);
1083 | } |
1084 | #endif |
strings::StrAppend(&buf, " next ", c->next);
1086 | if (timing_counter_) { |
strings::StrAppend(&buf, " freed_at_count ", c->freed_at_count);
1088 | } |
1089 | LOG(INFO) << buf; |
1090 | h = c->next; |
1091 | } |
1092 | } |
1093 | |
LOG(INFO) << " Summary of in-use Chunks by size: ";
1095 | size_t total_bytes = 0; |
1096 | for (auto& it : in_use_by_size) { |
1097 | LOG(INFO) << it.second << " Chunks of size " << it.first << " totalling " |
1098 | << strings::HumanReadableNumBytes(it.first * it.second); |
1099 | total_bytes += (it.first * it.second); |
1100 | } |
1101 | LOG(INFO) << "Sum Total of in-use chunks: " |
1102 | << strings::HumanReadableNumBytes(total_bytes); |
1103 | LOG(INFO) << "total_region_allocated_bytes_: " |
1104 | << total_region_allocated_bytes_ |
1105 | << " memory_limit_: " << memory_limit_ << " available bytes: " |
1106 | << (memory_limit_ - total_region_allocated_bytes_) |
1107 | << " curr_region_allocation_bytes_: " |
1108 | << curr_region_allocation_bytes_; |
1109 | LOG(INFO) << "Stats: \n" << stats_.DebugString(); |
1110 | } |
1111 | |
1112 | void BFCAllocator::MaybeWriteMemoryMap() { |
const char* gpu_memory_map_file = std::getenv("TF_BFC_MEMORY_DUMP");
1114 | if (gpu_memory_map_file != nullptr) { |
1115 | std::unique_ptr<WritableFile> dump_file; |
string file_name = strings::StrCat(gpu_memory_map_file, "_", Name(), ".",
Env::Default()->NowMicros());
1118 | Status status = Env::Default()->NewWritableFile(file_name, &dump_file); |
1119 | if (!status.ok()) { |
1120 | LOG(ERROR) << "Failed to open file " << file_name; |
1121 | return; |
1122 | } |
1123 | MemoryDump md = RecordMemoryMapInternal(); |
1124 | status = dump_file->Append(md.SerializeAsString()); |
1125 | if (!status.ok()) { |
1126 | LOG(ERROR) << "Error on writing to file " << gpu_memory_map_file << ": " |
1127 | << status; |
1128 | } |
1129 | } |
1130 | } |
1131 | |
1132 | MemoryDump BFCAllocator::RecordMemoryMap() { |
1133 | mutex_lock l(lock_); |
1134 | return RecordMemoryMapInternal(); |
1135 | } |
1136 | |
1137 | MemoryDump BFCAllocator::RecordMemoryMapInternal() { |
1138 | MemoryDump md; |
1139 | md.set_allocator_name(Name()); |
1140 | |
1141 | // Record the general stats |
1142 | MemAllocatorStats* mas = md.mutable_stats(); |
1143 | mas->set_num_allocs(stats_.num_allocs); |
1144 | mas->set_bytes_in_use(stats_.bytes_in_use); |
1145 | mas->set_peak_bytes_in_use(stats_.peak_bytes_in_use); |
1146 | mas->set_largest_alloc_size(stats_.largest_alloc_size); |
1147 | |
1148 | // Record summary data for every bin. |
1149 | const std::array<BinDebugInfo, kNumBins> bin_infos = get_bin_debug_info(); |
1150 | for (BinNum bin_num = 0; bin_num < kNumBins; bin_num++) { |
1151 | Bin* b = BinFromIndex(bin_num); |
1152 | const BinDebugInfo& bin_info = bin_infos[bin_num]; |
1153 | DCHECK_EQ(b->free_chunks.size(), |
1154 | bin_info.total_chunks_in_bin - bin_info.total_chunks_in_use); |
1155 | BinSummary* bs = md.add_bin_summary(); |
1156 | bs->set_bin(bin_num); |
1157 | bs->set_total_bytes_in_use(bin_info.total_bytes_in_use); |
1158 | bs->set_total_bytes_in_bin(bin_info.total_bytes_in_bin); |
1159 | bs->set_total_chunks_in_use(bin_info.total_chunks_in_use); |
1160 | bs->set_total_chunks_in_bin(bin_info.total_chunks_in_bin); |
1161 | } |
1162 | |
1163 | // Record state of every defined Chunk. |
1164 | for (const auto& region : region_manager_.regions()) { |
1165 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1166 | while (h != kInvalidChunkHandle) { |
1167 | const Chunk* c = ChunkFromHandle(h); |
1168 | MemChunk* mc = md.add_chunk(); |
1169 | mc->set_in_use(c->in_use()); |
1170 | mc->set_address(reinterpret_cast<uint64>(c->ptr)); |
1171 | mc->set_size(c->size); |
1172 | mc->set_requested_size(c->requested_size); |
1173 | mc->set_bin(c->bin_num); |
1174 | #ifdef TENSORFLOW_MEM_DEBUG |
mc->set_op_name(c->op_name ? string(c->op_name) : "UNKNOWN");
1176 | mc->set_step_id(c->step_id); |
1177 | mc->set_action_count(c->action_count); |
1178 | #endif |
1179 | if (timing_counter_) { |
1180 | mc->set_freed_at_count(c->in_use() ? 0 : c->freed_at_count); |
1181 | } |
1182 | h = c->next; |
1183 | } |
1184 | } |
1185 | |
1186 | mas->set_fragmentation_metric(GetFragmentation()); |
1187 | |
1188 | #ifdef TENSORFLOW_MEM_DEBUG |
1189 | // Record the recent size history |
1190 | int history_len = static_cast<int>(std::min( |
1191 | action_counter_, static_cast<long long>(MEM_DEBUG_SIZE_HISTORY_SIZE))); |
1192 | for (int i = action_counter_ - history_len; i < action_counter_; ++i) { |
1193 | SnapShot* ss = md.add_snap_shot(); |
1194 | ss->set_action_count(i); |
1195 | int slot = i % MEM_DEBUG_SIZE_HISTORY_SIZE; |
1196 | ss->set_size(size_history_[slot]); |
1197 | } |
1198 | #endif |
1199 | |
1200 | return md; |
1201 | } |
1202 | |
1203 | absl::optional<AllocatorStats> BFCAllocator::GetStats() { |
1204 | mutex_lock l(lock_); |
1205 | return stats_; |
1206 | } |
1207 | |
1208 | bool BFCAllocator::ClearStats() { |
1209 | mutex_lock l(lock_); |
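// bytes_in_use is deliberately left unchanged: live allocations persist
// across a stats reset, so the peak is reset to the current usage rather
// than to zero.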
1210 | stats_.num_allocs = 0; |
1211 | stats_.peak_bytes_in_use = stats_.bytes_in_use; |
1212 | stats_.largest_alloc_size = 0; |
1213 | return true; |
1214 | } |
1215 | |
1216 | std::array<BFCAllocator::BinDebugInfo, BFCAllocator::kNumBins> |
1217 | BFCAllocator::get_bin_debug_info() { |
1218 | std::array<BinDebugInfo, kNumBins> bin_infos; |
1219 | for (const auto& region : region_manager_.regions()) { |
1220 | ChunkHandle h = region_manager_.get_handle(region.ptr()); |
1221 | while (h != kInvalidChunkHandle) { |
1222 | const Chunk* c = ChunkFromHandle(h); |
1223 | BinNum bin_num = BinNumForSize(c->size); |
1224 | BinDebugInfo& bin_info = bin_infos[bin_num]; |
1225 | bin_info.total_bytes_in_bin += c->size; |
1226 | bin_info.total_chunks_in_bin++; |
1227 | if (c->in_use()) { |
1228 | bin_info.total_bytes_in_use += c->size; |
1229 | bin_info.total_requested_bytes_in_use += c->requested_size; |
1230 | bin_info.total_chunks_in_use++; |
1231 | } else { |
1232 | Bin* bin = BinFromIndex(bin_num); |
1233 | CHECK_EQ(bin->free_chunks.count(h), 1); |
1234 | CHECK_EQ(c->bin_num, bin_num); |
1235 | } |
1236 | h = c->next; |
1237 | } |
1238 | } |
1239 | return bin_infos; |
1240 | } |
1241 | |
1242 | AllocatorMemoryType BFCAllocator::GetMemoryType() const { |
1243 | return sub_allocator_->GetMemoryType(); |
1244 | } |
1245 | |
1246 | } // namespace tensorflow |
1247 | |