/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
15 | |
#include "tensorflow/core/common_runtime/process_state.h"

#include <atomic>
#include <cstring>
#include <vector>

#include "absl/base/call_once.h"
#include "absl/memory/memory.h"
#include "tensorflow/core/common_runtime/bfc_allocator.h"
#include "tensorflow/core/common_runtime/pool_allocator.h"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/log_memory.h"
#include "tensorflow/core/framework/tracking_allocator.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/mutex.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/env_var.h"
33 | |
34 | namespace tensorflow { |
35 | |
36 | /*static*/ ProcessState* ProcessState::singleton() { |
37 | static ProcessState* instance = new ProcessState; |
38 | static absl::once_flag f; |
39 | absl::call_once(f, []() { |
40 | AllocatorFactoryRegistry::singleton()->process_state_ = instance; |
41 | }); |
42 | |
43 | return instance; |
44 | } |
45 | |
46 | ProcessState::ProcessState() |
47 | : numa_enabled_(false), cpu_allocators_cached_(0) {} |
48 | |
49 | string ProcessState::MemDesc::DebugString() { |
50 | return strings::StrCat((loc == CPU ? "CPU " : "GPU " ), dev_index, |
51 | ", dma: " , gpu_registered, ", nic: " , nic_registered); |
52 | } |
53 | |
54 | ProcessState::MemDesc ProcessState::PtrType(const void* ptr) { |
55 | if (FLAGS_brain_gpu_record_mem_types) { |
56 | auto iter = mem_desc_map_.find(ptr); |
57 | if (iter != mem_desc_map_.end()) { |
58 | return iter->second; |
59 | } |
60 | } |
61 | return MemDesc(); |
62 | } |
63 | |
// Returns the CPU Allocator for `numa_node`, creating it (and allocators
// for all lower-numbered nodes) lazily on first use.  Fast path is a
// lock-free cache lookup; slow path takes mu_ and constructs allocators.
Allocator* ProcessState::GetCPUAllocator(int numa_node) {
  // Fold "NUMA disabled" and "no affinity" onto node 0 so cpu_allocators_
  // can be indexed densely from zero.
  if (!numa_enabled_ || numa_node == port::kNUMANoAffinity) numa_node = 0;

  // Check if allocator for the numa node is in lock-free cache.
  // The acquire load pairs with the release fetch_add below: any cache
  // slot published before the counter bump is visible to this reader.
  if (numa_node < cpu_allocators_cached_.load(std::memory_order_acquire)) {
    return cpu_allocators_cache_[numa_node];
  }

  mutex_lock lock(mu_);
  // Create allocators for every node up to and including numa_node.
  while (cpu_allocators_.size() <= static_cast<size_t>(numa_node)) {
    // If visitors have been defined we need an Allocator built from
    // a SubAllocator. Prefer BFCAllocator, but fall back to PoolAllocator
    // depending on env var setting.
    const bool alloc_visitors_defined =
        (!cpu_alloc_visitors_.empty() || !cpu_free_visitors_.empty());
    bool use_bfc_allocator = false;
    // BFC defaults to on whenever visitors exist; overridable via env var.
    Status status = ReadBoolFromEnvVar(
        "TF_CPU_ALLOCATOR_USE_BFC", alloc_visitors_defined, &use_bfc_allocator);
    if (!status.ok()) {
      LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
    }
    Allocator* allocator = nullptr;
    // A SubAllocator is only needed when NUMA placement, visitors, or BFC
    // are in play; otherwise the plain base CPU allocator is used directly.
    SubAllocator* sub_allocator =
        (numa_enabled_ || alloc_visitors_defined || use_bfc_allocator)
            ? new BasicCPUAllocator(
                  numa_enabled_ ? numa_node : port::kNUMANoAffinity,
                  cpu_alloc_visitors_, cpu_free_visitors_)
            : nullptr;
    if (use_bfc_allocator) {
      // TODO(reedwm): evaluate whether 64GB by default is the best choice.
      int64_t cpu_mem_limit_in_mb = -1;
      Status status = ReadInt64FromEnvVar("TF_CPU_BFC_MEM_LIMIT_IN_MB",
                                          1LL << 16 /*64GB max by default*/,
                                          &cpu_mem_limit_in_mb);
      if (!status.ok()) {
        LOG(ERROR) << "GetCPUAllocator: " << status.error_message();
      }
      int64_t cpu_mem_limit = cpu_mem_limit_in_mb * (1LL << 20);
      DCHECK(sub_allocator);

      BFCAllocator::Options allocator_opts;
      allocator_opts.allow_growth = true;
      // BFCAllocator takes ownership of sub_allocator.
      allocator = new BFCAllocator(
          absl::WrapUnique(sub_allocator), cpu_mem_limit,
          /*name=*/"bfc_cpu_allocator_for_gpu", allocator_opts);

      VLOG(2) << "Using BFCAllocator with memory limit of "
              << cpu_mem_limit_in_mb << " MB for ProcessState CPU allocator";
    } else if (sub_allocator) {
      DCHECK(sub_allocator);
      allocator =
          new PoolAllocator(/*pool_size_limit=*/100, /*auto_resize=*/true,
                            sub_allocator, new NoopRounder, "cpu_pool");
      VLOG(2) << "Using PoolAllocator for ProcessState CPU allocator "
              << "numa_enabled_=" << numa_enabled_
              << " numa_node=" << numa_node;
    } else {
      DCHECK(!sub_allocator);
      allocator = cpu_allocator_base();
    }
    if (LogMemory::IsEnabled() && !allocator->TracksAllocationSizes()) {
      // Wrap the allocator to track allocation ids for better logging
      // at the cost of performance.
      allocator = new TrackingAllocator(allocator, true);
    }
    cpu_allocators_.push_back(allocator);
    // Publish into the lock-free cache: write the slot first, then bump
    // the counter with release so concurrent fast-path readers observe a
    // fully-written entry.
    if (cpu_allocators_.size() < cpu_allocators_cache_.max_size()) {
      cpu_allocators_cache_[cpu_allocators_.size() - 1] = allocator;
      cpu_allocators_cached_.fetch_add(1, std::memory_order_release);
    }
    if (!sub_allocator) {
      // The base allocator path is only valid when no visitors were
      // registered before this allocator was built.
      DCHECK(cpu_alloc_visitors_.empty() && cpu_free_visitors_.empty());
    }
  }
  return cpu_allocators_[numa_node];
}
140 | |
141 | void ProcessState::AddCPUAllocVisitor(SubAllocator::Visitor visitor) { |
142 | VLOG(1) << "AddCPUAllocVisitor" ; |
143 | mutex_lock lock(mu_); |
144 | CHECK_EQ(0, cpu_allocators_.size()) // Crash OK |
145 | << "AddCPUAllocVisitor must be called prior to first call to " |
146 | "ProcessState::GetCPUAllocator" ; |
147 | cpu_alloc_visitors_.push_back(std::move(visitor)); |
148 | } |
149 | |
150 | void ProcessState::AddCPUFreeVisitor(SubAllocator::Visitor visitor) { |
151 | mutex_lock lock(mu_); |
152 | CHECK_EQ(0, cpu_allocators_.size()) // Crash OK |
153 | << "AddCPUFreeVisitor must be called prior to first call to " |
154 | "ProcessState::GetCPUAllocator" ; |
155 | cpu_free_visitors_.push_back(std::move(visitor)); |
156 | } |
157 | |
158 | void ProcessState::TestOnlyReset() { |
159 | mutex_lock lock(mu_); |
160 | // Don't delete this value because it's static. |
161 | Allocator* default_cpu_allocator = cpu_allocator_base(); |
162 | mem_desc_map_.clear(); |
163 | for (Allocator* a : cpu_allocators_) { |
164 | if (a != default_cpu_allocator) delete a; |
165 | } |
166 | cpu_allocators_.clear(); |
167 | for (Allocator* a : cpu_al_) { |
168 | delete a; |
169 | } |
170 | cpu_al_.clear(); |
171 | } |
172 | |
173 | } // namespace tensorflow |
174 | |