1 | /* Copyright 2019 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include <atomic> |
17 | |
18 | #include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h" |
19 | #include "tensorflow/core/profiler/lib/traceme.h" |
20 | #include "tensorflow/tsl/framework/allocator.h" |
21 | #include "tensorflow/tsl/framework/allocator_registry.h" |
22 | #include "tensorflow/tsl/framework/tracking_allocator.h" |
23 | #include "tensorflow/tsl/platform/mem.h" |
24 | #include "tensorflow/tsl/platform/mutex.h" |
25 | #include "tensorflow/tsl/platform/strcat.h" |
26 | #include "tensorflow/tsl/platform/stringprintf.h" |
27 | #include "tensorflow/tsl/platform/types.h" |
28 | |
29 | namespace tsl { |
30 | |
// If true, cpu allocator collects more stats.
//
// This flag is flipped by EnableCPUAllocatorStats()/DisableCPUAllocatorStats()
// and read on every AllocateRaw()/DeallocateRaw() call, potentially from many
// threads concurrently. A plain bool would be a data race under the C++ memory
// model, so use std::atomic<bool>; stats collection is best-effort, so relaxed
// visibility of a toggle is acceptable.
static std::atomic<bool> cpu_allocator_collect_stats{false};

// Enables collection of CPU allocator statistics (see CPUAllocator below).
void EnableCPUAllocatorStats() { cpu_allocator_collect_stats = true; }
// Disables collection of CPU allocator statistics.
void DisableCPUAllocatorStats() { cpu_allocator_collect_stats = false; }
// Returns whether CPU allocator statistics are currently being collected.
bool CPUAllocatorStatsEnabled() { return cpu_allocator_collect_stats; }
37 | |
// Emit at most this many "total allocated memory" warnings per process.
static const int kMaxTotalAllocationWarnings = 1;

// Emit at most this many "large single allocation" warnings per process.
static const int kMaxSingleAllocationWarnings = 5;

// If cpu_allocator_collect_stats is true, warn when the total allocated memory
// exceeds this threshold (expressed as a fraction of available system RAM).
static const double kTotalAllocationWarningThreshold = 0.5;

// Individual allocations larger than this fraction of available system RAM
// will trigger a warning (even when stats collection is disabled).
static const double kLargeAllocationWarningThreshold = 0.1;
48 | |
49 | // Cache first invocation to port::AvailableRam, as it can be expensive. |
50 | static int64_t LargeAllocationWarningBytes() { |
51 | static int64_t value = static_cast<int64_t>(port::AvailableRam() * |
52 | kLargeAllocationWarningThreshold); |
53 | return value; |
54 | } |
55 | |
56 | static int64_t TotalAllocationWarningBytes() { |
57 | static int64_t value = static_cast<int64_t>(port::AvailableRam() * |
58 | kTotalAllocationWarningThreshold); |
59 | return value; |
60 | } |
61 | |
62 | namespace { |
63 | |
64 | // A default Allocator for CPU devices. ProcessState::GetCPUAllocator() will |
65 | // return a different version that may perform better, but may also lack the |
66 | // optional stats triggered by the functions above. TODO(tucker): migrate all |
67 | // uses of cpu_allocator() except tests to use ProcessState instead. |
68 | class CPUAllocator : public Allocator { |
69 | public: |
70 | CPUAllocator() |
71 | : single_allocation_warning_count_(0), |
72 | total_allocation_warning_count_(0) {} |
73 | |
74 | ~CPUAllocator() override {} |
75 | |
76 | string Name() override { return "cpu" ; } |
77 | |
78 | void* AllocateRaw(size_t alignment, size_t num_bytes) override { |
79 | if (num_bytes > static_cast<size_t>(LargeAllocationWarningBytes()) && |
80 | single_allocation_warning_count_ < kMaxSingleAllocationWarnings) { |
81 | ++single_allocation_warning_count_; |
82 | LOG(WARNING) << "Allocation of " << num_bytes << " exceeds " |
83 | << 100 * kLargeAllocationWarningThreshold |
84 | << "% of free system memory." ; |
85 | } |
86 | |
87 | void* p = port::AlignedMalloc(num_bytes, alignment); |
88 | if (cpu_allocator_collect_stats) { |
89 | const std::size_t alloc_size = port::MallocExtension_GetAllocatedSize(p); |
90 | mutex_lock l(mu_); |
91 | ++stats_.num_allocs; |
92 | stats_.bytes_in_use += alloc_size; |
93 | stats_.peak_bytes_in_use = |
94 | std::max<int64_t>(stats_.peak_bytes_in_use, stats_.bytes_in_use); |
95 | stats_.largest_alloc_size = |
96 | std::max<int64_t>(stats_.largest_alloc_size, alloc_size); |
97 | |
98 | if (stats_.bytes_in_use > TotalAllocationWarningBytes() && |
99 | total_allocation_warning_count_ < kMaxTotalAllocationWarnings) { |
100 | ++total_allocation_warning_count_; |
101 | LOG(WARNING) << "Total allocated memory " << stats_.bytes_in_use |
102 | << "exceeds " << 100 * kTotalAllocationWarningThreshold |
103 | << "% of free system memory" ; |
104 | } |
105 | if (p != nullptr) { |
106 | AddTraceMe("MemoryAllocation" , p, num_bytes, alloc_size); |
107 | } |
108 | } |
109 | return p; |
110 | } |
111 | |
112 | void DeallocateRaw(void* ptr) override { |
113 | if (cpu_allocator_collect_stats) { |
114 | const std::size_t alloc_size = |
115 | port::MallocExtension_GetAllocatedSize(ptr); |
116 | mutex_lock l(mu_); |
117 | stats_.bytes_in_use -= alloc_size; |
118 | AddTraceMe("MemoryDeallocation" , ptr, 0, alloc_size); |
119 | } |
120 | port::AlignedFree(ptr); |
121 | } |
122 | |
123 | void AddTraceMe(absl::string_view traceme_name, const void* chunk_ptr, |
124 | std::size_t req_bytes, std::size_t alloc_bytes) { |
125 | tensorflow::profiler::TraceMe::InstantActivity( |
126 | [this, traceme_name, chunk_ptr, req_bytes, |
127 | alloc_bytes]() TF_NO_THREAD_SAFETY_ANALYSIS { |
128 | const auto& annotation = tensorflow::profiler:: |
129 | ScopedMemoryDebugAnnotation::CurrentAnnotation(); |
130 | return tensorflow::profiler::TraceMeEncode( |
131 | traceme_name, {{"allocator_name" , Name()}, |
132 | {"bytes_reserved" , stats_.bytes_reserved}, |
133 | {"bytes_allocated" , stats_.bytes_in_use}, |
134 | {"peak_bytes_in_use" , stats_.peak_bytes_in_use}, |
135 | {"requested_bytes" , req_bytes}, |
136 | {"allocation_bytes" , alloc_bytes}, |
137 | {"addr" , reinterpret_cast<uint64>(chunk_ptr)}, |
138 | {"tf_op" , annotation.pending_op_name}, |
139 | {"id" , annotation.pending_step_id}, |
140 | {"region_type" , annotation.pending_region_type}, |
141 | {"data_type" , annotation.pending_data_type}, |
142 | {"shape" , annotation.pending_shape_func()}}); |
143 | }, |
144 | /*level=*/tensorflow::profiler::TraceMeLevel::kInfo); |
145 | } |
146 | |
147 | absl::optional<AllocatorStats> GetStats() override { |
148 | if (!cpu_allocator_collect_stats) return absl::nullopt; |
149 | mutex_lock l(mu_); |
150 | return stats_; |
151 | } |
152 | |
153 | bool ClearStats() override { |
154 | if (!cpu_allocator_collect_stats) return false; |
155 | mutex_lock l(mu_); |
156 | stats_.num_allocs = 0; |
157 | stats_.peak_bytes_in_use = stats_.bytes_in_use; |
158 | stats_.largest_alloc_size = 0; |
159 | return true; |
160 | } |
161 | |
162 | size_t AllocatedSizeSlow(const void* ptr) const override { |
163 | return port::MallocExtension_GetAllocatedSize(ptr); |
164 | } |
165 | |
166 | AllocatorMemoryType GetMemoryType() const override { |
167 | return AllocatorMemoryType::kHostPageable; |
168 | } |
169 | |
170 | private: |
171 | mutex mu_; |
172 | AllocatorStats stats_ TF_GUARDED_BY(mu_); |
173 | |
174 | // Use <atomic> for single allocations to avoid mutex contention when |
175 | // statistics are disabled. |
176 | std::atomic<int> single_allocation_warning_count_; |
177 | int total_allocation_warning_count_ TF_GUARDED_BY(mu_); |
178 | |
179 | TF_DISALLOW_COPY_AND_ASSIGN(CPUAllocator); |
180 | }; |
181 | |
182 | class CPUAllocatorFactory : public AllocatorFactory { |
183 | public: |
184 | Allocator* CreateAllocator() override { return new CPUAllocator; } |
185 | |
186 | SubAllocator* CreateSubAllocator(int numa_node) override { |
187 | return new CPUSubAllocator(new CPUAllocator); |
188 | } |
189 | |
190 | private: |
191 | class CPUSubAllocator : public SubAllocator { |
192 | public: |
193 | explicit CPUSubAllocator(CPUAllocator* cpu_allocator) |
194 | : SubAllocator({}, {}), cpu_allocator_(cpu_allocator) {} |
195 | |
196 | void* Alloc(size_t alignment, size_t num_bytes, |
197 | size_t* bytes_received) override { |
198 | *bytes_received = num_bytes; |
199 | return cpu_allocator_->AllocateRaw(alignment, num_bytes); |
200 | } |
201 | |
202 | void Free(void* ptr, size_t num_bytes) override { |
203 | cpu_allocator_->DeallocateRaw(ptr); |
204 | } |
205 | |
206 | bool SupportsCoalescing() const override { return false; } |
207 | |
208 | AllocatorMemoryType GetMemoryType() const override { |
209 | return cpu_allocator_->GetMemoryType(); |
210 | } |
211 | |
212 | private: |
213 | CPUAllocator* cpu_allocator_; |
214 | }; |
215 | }; |
216 | |
// Registers this factory as the default CPU allocator under the name
// "DefaultCPUAllocator" with priority 100; presumably other registrations
// with different priorities can override it — confirm semantics in
// allocator_registry.h.
REGISTER_MEM_ALLOCATOR("DefaultCPUAllocator" , 100, CPUAllocatorFactory);
218 | } // namespace |
219 | |
220 | } // namespace tsl |
221 | |