1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_TSL_FRAMEWORK_ALLOCATOR_H_ |
17 | #define TENSORFLOW_TSL_FRAMEWORK_ALLOCATOR_H_ |
18 | |
19 | #include <stdlib.h> |
20 | |
21 | #include <functional> |
22 | #include <limits> |
23 | |
24 | #include "absl/strings/string_view.h" |
25 | #include "absl/types/optional.h" |
26 | #include "tensorflow/tsl/framework/numeric_types.h" |
27 | #include "tensorflow/tsl/framework/type_traits.h" |
28 | #include "tensorflow/tsl/platform/logging.h" |
29 | #include "tensorflow/tsl/platform/macros.h" |
30 | #include "tensorflow/tsl/platform/numa.h" |
31 | #include "tensorflow/tsl/platform/types.h" |
32 | |
33 | namespace tsl { |
34 | |
35 | // Attributes for a single allocation call. Different calls to the same |
36 | // allocator could potentially have different allocation attributes. |
37 | struct AllocationAttributes { |
38 | AllocationAttributes() = default; |
39 | |
40 | AllocationAttributes(bool retry_on_failure, bool allocation_will_be_logged, |
41 | std::function<uint64()>* freed_by_func) |
42 | : retry_on_failure(retry_on_failure), |
43 | allocation_will_be_logged(allocation_will_be_logged), |
44 | freed_by_func(freed_by_func) {} |
45 | |
46 | // If the first attempt to allocate the memory fails, the allocation should |
47 | // wait and retry (with a timeout). |
48 | // |
49 | // This is usually set to true, but we may set it to false in cases where a |
50 | // failure has only performance impact (e.g. optional scratch space |
51 | // allocation). |
52 | bool retry_on_failure = true; |
53 | // If a Tensor is allocated without the following set to true, then |
54 | // it is logged as an unknown allocation. During execution Tensors |
55 | // should be allocated through the OpKernelContext which records |
56 | // which Op is performing the allocation, and sets this flag to |
57 | // true. |
58 | bool allocation_will_be_logged = false; |
59 | // EXPERIMENTAL: If provided, then evaluates to a timing count such that only |
60 | // a memory chunk whose freed_at_count is at this value or earlier may be |
61 | // returned. |
62 | std::function<uint64()>* freed_by_func = nullptr; // Not owned. |
63 | |
64 | TF_DISALLOW_COPY_AND_ASSIGN(AllocationAttributes); |
65 | }; |
66 | |
67 | // Runtime statistics collected by an allocator. Exactly the same as |
68 | // stream_executor::AllocatorStats, but independently defined to preserve the |
69 | // mutual independence of StreamExecutor and TensorFlow. |
70 | struct AllocatorStats { |
71 | int64_t num_allocs; // Number of allocations. |
72 | int64_t bytes_in_use; // Number of bytes in use. |
73 | int64_t peak_bytes_in_use; // The peak bytes in use. |
74 | int64_t largest_alloc_size; // The largest single allocation seen. |
75 | |
  // The upper limit of bytes of user-allocatable device memory, if such a
  // limit is known.
78 | absl::optional<int64_t> bytes_limit; |
79 | |
80 | // Stats for reserved memory usage. |
81 | int64_t bytes_reserved; // Number of bytes reserved. |
82 | int64_t peak_bytes_reserved; // The peak number of bytes reserved. |
  // The upper limit on the number of bytes of reservable memory,
  // if such a limit is known.
85 | absl::optional<int64_t> bytes_reservable_limit; |
86 | |
87 | int64_t largest_free_block_bytes; // Largest free block's size in heap. |
88 | |
89 | AllocatorStats() |
90 | : num_allocs(0), |
91 | bytes_in_use(0), |
92 | peak_bytes_in_use(0), |
93 | largest_alloc_size(0), |
94 | bytes_reserved(0), |
95 | peak_bytes_reserved(0), |
96 | largest_free_block_bytes(0) {} |
97 | |
98 | std::string DebugString() const; |
99 | }; |
100 | |
101 | // The type of the allocated memory. |
102 | enum class AllocatorMemoryType { |
103 | kUnknown = 0, // Memory type unknown. |
104 | kDevice = 1, // Memory on device. |
  kHostPageable = 2,  // Memory on host and it is pageable.
106 | kHostPinned = 3, // Memory on host and it is pinned. |
107 | }; |
108 | |
109 | // Allocator is an abstract interface for allocating and deallocating |
110 | // device memory. |
111 | class Allocator { |
112 | public: |
113 | // Align to 64 byte boundary. |
114 | static constexpr size_t kAllocatorAlignment = 64; |
115 | |
116 | virtual ~Allocator(); |
117 | |
  // Return a string identifying this allocator.
119 | virtual std::string Name() = 0; |
120 | |
121 | // Return an uninitialized block of memory that is "num_bytes" bytes |
122 | // in size. The returned pointer is guaranteed to be aligned to a |
123 | // multiple of "alignment" bytes. |
124 | // REQUIRES: "alignment" is a power of 2. |
125 | virtual void* AllocateRaw(size_t alignment, size_t num_bytes) = 0; |
126 | |
127 | // Return an uninitialized block of memory that is "num_bytes" bytes |
128 | // in size with specified allocation attributes. The returned pointer is |
129 | // guaranteed to be aligned to a multiple of "alignment" bytes. |
130 | // REQUIRES: "alignment" is a power of 2. |
131 | virtual void* AllocateRaw(size_t alignment, size_t num_bytes, |
132 | const AllocationAttributes& allocation_attr) { |
133 | // The default behavior is to use the implementation without any allocation |
134 | // attributes. |
135 | return AllocateRaw(alignment, num_bytes); |
136 | } |
137 | |
  // Deallocate the block of memory pointed to by "ptr".
  // REQUIRES: "ptr" was previously returned by a call to AllocateRaw.
140 | virtual void DeallocateRaw(void* ptr) = 0; |
141 | |
142 | // Returns true if this allocator tracks the sizes of allocations. |
143 | // RequestedSize and AllocatedSize must be overridden if |
144 | // TracksAllocationSizes is overridden to return true. |
145 | virtual bool TracksAllocationSizes() const { return false; } |
146 | |
147 | // Returns true if this allocator allocates an opaque handle rather than the |
148 | // requested number of bytes. |
149 | // |
150 | // This method returns false for most allocators, but may be used by |
151 | // special-case allocators that track tensor usage. If this method returns |
152 | // true, AllocateRaw() should be invoked for all values of `num_bytes`, |
153 | // including 0. |
154 | // |
155 | // NOTE: It is the caller's responsibility to track whether an allocated |
156 | // object is a buffer or an opaque handle. In particular, when this method |
157 | // returns `true`, users of this allocator must not run any constructors or |
158 | // destructors for complex objects, since there is no backing store for the |
159 | // tensor in which to place their outputs. |
160 | virtual bool AllocatesOpaqueHandle() const { return false; } |
161 | |
162 | // Returns the user-requested size of the data allocated at |
163 | // 'ptr'. Note that the actual buffer allocated might be larger |
164 | // than requested, but this function returns the size requested by |
165 | // the user. |
166 | // |
167 | // REQUIRES: TracksAllocationSizes() is true. |
168 | // |
169 | // REQUIRES: 'ptr!=nullptr' and points to a buffer previously |
170 | // allocated by this allocator. |
171 | virtual size_t RequestedSize(const void* ptr) const { |
172 | CHECK(false) << "allocator doesn't track sizes" ; |
173 | return size_t(0); |
174 | } |
175 | |
176 | // Returns the allocated size of the buffer at 'ptr' if known, |
177 | // otherwise returns RequestedSize(ptr). AllocatedSize(ptr) is |
178 | // guaranteed to be >= RequestedSize(ptr). |
179 | // |
180 | // REQUIRES: TracksAllocationSizes() is true. |
181 | // |
182 | // REQUIRES: 'ptr!=nullptr' and points to a buffer previously |
183 | // allocated by this allocator. |
184 | virtual size_t AllocatedSize(const void* ptr) const { |
185 | return RequestedSize(ptr); |
186 | } |
187 | |
188 | // Returns either 0 or an identifier assigned to the buffer at 'ptr' |
189 | // when the buffer was returned by AllocateRaw. If non-zero, the |
190 | // identifier differs from every other ID assigned by this |
191 | // allocator. |
192 | // |
193 | // REQUIRES: TracksAllocationSizes() is true. |
194 | // |
195 | // REQUIRES: 'ptr!=nullptr' and points to a buffer previously |
196 | // allocated by this allocator. |
197 | virtual int64_t AllocationId(const void* ptr) const { return 0; } |
198 | |
199 | // Returns the allocated size of the buffer at 'ptr' if known, |
200 | // otherwise returns 0. This method can be called when |
201 | // TracksAllocationSizes() is false, but can be extremely slow. |
202 | // |
203 | // REQUIRES: 'ptr!=nullptr' and points to a buffer previously |
204 | // allocated by this allocator. |
205 | virtual size_t AllocatedSizeSlow(const void* ptr) const { |
206 | if (TracksAllocationSizes()) { |
207 | return AllocatedSize(ptr); |
208 | } |
209 | return 0; |
210 | } |
211 | |
212 | // Fills in 'stats' with statistics collected by this allocator. |
213 | virtual absl::optional<AllocatorStats> GetStats() { return absl::nullopt; } |
214 | |
215 | // If implemented, clears the internal stats except for the `in_use` fields |
216 | // and sets the `peak_bytes_in_use` to be equal to the `bytes_in_use`. Returns |
217 | // true if implemented. |
218 | // |
219 | // REQUIRES: GetStats is overridden. |
220 | virtual bool ClearStats() TF_MUST_USE_RESULT { return false; } |
221 | |
222 | virtual void SetSafeFrontier(uint64 count) {} |
223 | |
  // For allocators that are stream aware, allows specifying the compute
  // stream this allocator is used for. This may also trigger memory
  // preallocation.
227 | virtual void SetStreamAndPreallocateMemory(void* stream) {} |
228 | |
229 | // Returns the type of the memory allocated by this allocator. |
230 | virtual AllocatorMemoryType GetMemoryType() const { |
231 | return AllocatorMemoryType::kUnknown; |
232 | } |
233 | }; |
234 | |
235 | // An implementation of Allocator that delegates all calls to another Allocator. |
236 | // |
237 | // Useful to clients who want to override part of the functionality of another |
238 | // allocator. |
239 | class AllocatorWrapper : public Allocator { |
240 | public: |
241 | explicit AllocatorWrapper(Allocator* wrapped) : wrapped_(wrapped) {} |
242 | |
243 | ~AllocatorWrapper() override {} |
244 | |
245 | // Returns the wrapped allocator to which all calls are delegated. |
246 | Allocator* wrapped() const { return wrapped_; } |
247 | |
248 | std::string Name() override { return wrapped_->Name(); } |
249 | |
250 | void* AllocateRaw(size_t alignment, size_t num_bytes) override { |
251 | return wrapped_->AllocateRaw(alignment, num_bytes); |
252 | } |
253 | |
254 | void* AllocateRaw(size_t alignment, size_t num_bytes, |
255 | const AllocationAttributes& allocation_attr) override { |
256 | return wrapped_->AllocateRaw(alignment, num_bytes, allocation_attr); |
257 | } |
258 | |
259 | void DeallocateRaw(void* ptr) override { wrapped_->DeallocateRaw(ptr); } |
260 | |
261 | bool TracksAllocationSizes() const override { |
262 | return wrapped_->TracksAllocationSizes(); |
263 | } |
264 | |
265 | bool AllocatesOpaqueHandle() const override { |
266 | return wrapped_->AllocatesOpaqueHandle(); |
267 | } |
268 | |
269 | size_t RequestedSize(const void* ptr) const override { |
270 | return wrapped_->RequestedSize(ptr); |
271 | } |
272 | |
273 | size_t AllocatedSize(const void* ptr) const override { |
274 | return wrapped_->AllocatedSize(ptr); |
275 | } |
276 | |
277 | int64_t AllocationId(const void* ptr) const override { |
278 | return wrapped_->AllocationId(ptr); |
279 | } |
280 | |
281 | size_t AllocatedSizeSlow(const void* ptr) const override { |
282 | return wrapped_->AllocatedSizeSlow(ptr); |
283 | } |
284 | |
285 | AllocatorMemoryType GetMemoryType() const override { |
286 | return wrapped_->GetMemoryType(); |
287 | } |
288 | |
289 | private: |
290 | Allocator* const wrapped_; |
291 | }; |
292 | |
293 | // A tensorflow Op may need access to different kinds of memory that |
294 | // are not simply a function of the device to which the Op has been |
295 | // assigned. For example, an Op executing on a GPU may still need |
296 | // to allocate CPU RAM for some purpose. Internal to the tensorflow |
// runtime we may choose to allocate CPU RAM from special regions
298 | // that have been prepared for higher performance in some use |
299 | // contexts, e.g. doing DMA with particular devices. For these |
300 | // reasons, the Device interface does not expose just one memory |
301 | // Allocator, but instead provides an accessor that takes a |
302 | // specification of the desired memory attributes in order to select |
303 | // an Allocator. |
304 | // |
305 | // Example use: |
306 | // // Allocator for ordinary device memory: |
307 | // Allocator* a = allocator(AllocatorAttributes()); |
308 | // ... |
309 | // // Allocator for CPU RAM, regardless of where Op is executing: |
310 | // AllocatorAttributes attr; |
311 | // attr.set_on_host(true); |
312 | // Allocator* a = allocator(attr); |
313 | struct AllocatorAttributes { |
314 | void set_on_host(bool v) { value |= (static_cast<int>(v)); } |
315 | bool on_host() const { return value & 0x1; } |
316 | void set_nic_compatible(bool v) { value |= (static_cast<int>(v) << 1); } |
317 | bool nic_compatible() const { return value & (0x1 << 1); } |
318 | void set_gpu_compatible(bool v) { value |= (static_cast<int>(v) << 2); } |
319 | bool gpu_compatible() const { return value & (0x1 << 2); } |
320 | void Merge(AllocatorAttributes other) { |
321 | value |= other.value; |
322 | if (scope_id != other.scope_id) { |
323 | CHECK(scope_id == 0 || other.scope_id == 0) |
324 | << "At least one scope_id should be zero to merge " |
325 | "AllocatorAttributes but found this.scope_id=" |
326 | << scope_id << " and other.scope_id=" << other.scope_id; |
327 | scope_id = scope_id == 0 ? other.scope_id : scope_id; |
328 | } |
329 | } |
  // Returns true if the fields set in *this are a subset of, or equal to,
  // those set in other.
332 | bool IsEqualOrLessRestrictiveThan(const AllocatorAttributes& other) const { |
333 | return (value | other.value) == other.value; |
334 | } |
335 | |
336 | // NOTE: The upper 8 bits of the value are reserved for |
337 | // device-specific uses. Implementors of a device can interpret these |
338 | // upper 8 bits in device-specific ways, and ops implemented for those |
339 | // devices are responsible for setting those 8 bits appropriately. |
340 | uint32 value = 0; |
341 | // EXPERIMENTAL: If this is greater than zero, then allocation is delegated to |
342 | // a named special-purpose allocator on the same device. |
343 | int32 scope_id = 0; |
344 | |
  // Returns a human-readable representation of this.
346 | std::string DebugString() const; |
347 | }; |
348 | |
349 | // Returns a trivial implementation of Allocator, which is a process singleton. |
350 | // Access through this function is only intended for use by restricted parts |
351 | // of the infrastructure. |
352 | Allocator* cpu_allocator_base(); |
353 | |
354 | // If available, calls ProcessState::GetCPUAllocator(numa_node). |
355 | // If not, falls back to cpu_allocator_base(). |
356 | // Intended for use in contexts where ProcessState is not visible at |
357 | // compile time. Where ProcessState is visible, it's preferable to |
358 | // call it directly. |
359 | Allocator* cpu_allocator(int numa_node = port::kNUMANoAffinity); |
360 | |
361 | // Enables AllocatorStats in the default CPU allocator implementation. By |
362 | // default, it's disabled. |
363 | void EnableCPUAllocatorStats(); |
364 | // Disables AllocatorStats in the default CPU allocator implementation. By |
365 | // default, it's disabled. |
366 | void DisableCPUAllocatorStats(); |
367 | bool CPUAllocatorStatsEnabled(); |
368 | |
369 | // Enables full statistics collection in the default CPU allocator |
370 | // implementation. By default, it's disabled. |
371 | void EnableCPUAllocatorFullStats(); |
372 | bool CPUAllocatorFullStatsEnabled(); |
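
// Example use (illustrative sketch): enable statistics before the allocations
// of interest, then inspect them via Allocator::GetStats().
//   EnableCPUAllocatorStats();
//   Allocator* a = cpu_allocator();
//   // ... allocations through `a` ...
//   if (absl::optional<AllocatorStats> stats = a->GetStats()) {
//     LOG(INFO) << stats->DebugString();
//   }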
373 | |
// An object that does the underlying suballoc/free of memory for a
// higher-level allocator. The expectation is that the higher-level allocator
// is doing some kind of cache or pool management so that it will call
// SubAllocator::Alloc and Free relatively infrequently, compared to the number
// of times its own AllocateRaw and DeallocateRaw methods are called.
379 | class SubAllocator { |
380 | public: |
381 | // Visitor gets called with a pointer to a memory area and its |
382 | // size in bytes. The index value will be numa_node for a CPU |
383 | // allocator and GPU id for a GPU allocator. |
384 | typedef std::function<void(void*, int index, size_t)> Visitor; |
385 | |
386 | SubAllocator(const std::vector<Visitor>& alloc_visitors, |
387 | const std::vector<Visitor>& free_visitors); |
388 | |
389 | virtual ~SubAllocator() {} |
  // Allocates at least num_bytes. Returns the actual number of bytes
  // allocated in bytes_received. The caller can safely use the full
  // bytes_received-sized buffer following the returned pointer.
393 | virtual void* Alloc(size_t alignment, size_t num_bytes, |
394 | size_t* bytes_received) = 0; |
395 | virtual void Free(void* ptr, size_t num_bytes) = 0; |
396 | |
397 | // Returns true if the BFC allocator can safely coalesce adjacent regions |
398 | // returned by this allocator. |
399 | virtual bool SupportsCoalescing() const = 0; |
400 | |
401 | // Returns the type of the memory allocated by this SubAllocator. |
402 | virtual AllocatorMemoryType GetMemoryType() const { |
403 | return AllocatorMemoryType::kUnknown; |
404 | } |
405 | |
406 | protected: |
  // Implementations of the Alloc() method must call this on the newly
  // allocated value.
409 | void VisitAlloc(void* ptr, int index, size_t num_bytes); |
410 | |
  // Implementations of the Free() method must call this on the value to be
  // freed, immediately before deallocation.
413 | void VisitFree(void* ptr, int index, size_t num_bytes); |
414 | |
415 | const std::vector<Visitor> alloc_visitors_; |
416 | const std::vector<Visitor> free_visitors_; |
417 | }; |
418 | |
419 | } // namespace tsl |
420 | |
421 | #endif // TENSORFLOW_TSL_FRAMEWORK_ALLOCATOR_H_ |
422 | |