1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ |
17 | #define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ |
18 | |
19 | #include <memory> |
20 | #include <string> |
21 | #include <vector> |
22 | |
23 | #include "absl/base/macros.h" |
24 | #include "absl/strings/string_view.h" |
25 | #include "tensorflow/core/framework/device_attributes.pb.h" |
26 | #include "tensorflow/core/framework/tensor.h" |
27 | #include "tensorflow/core/lib/core/errors.h" |
28 | #include "tensorflow/core/lib/core/refcount.h" |
29 | #include "tensorflow/core/lib/core/status.h" |
30 | #include "tensorflow/core/lib/core/stringpiece.h" |
31 | #include "tensorflow/core/platform/logging.h" |
32 | #include "tensorflow/core/platform/threadpool.h" |
33 | #include "tensorflow/core/util/device_name_utils.h" |
34 | |
35 | namespace Eigen { |
36 | struct ThreadPoolDevice; |
37 | } // end namespace Eigen |
38 | |
39 | namespace stream_executor { |
40 | class Stream; |
41 | } // namespace stream_executor |
42 | |
43 | namespace tsl { |
44 | class Env; |
45 | namespace thread { |
46 | class ThreadPool; |
47 | } // namespace thread |
48 | } // namespace tsl |
49 | namespace tensorflow { |
50 | |
51 | class Device; |
52 | class DeviceAttributes; |
53 | class EventMgr; |
54 | class OpKernelContext; |
55 | class ResourceMgr; |
56 | class ScopedAllocatorMgr; |
57 | class TensorProto; |
58 | |
59 | // A wrapper for an Eigen Gpu Device that includes per-op state. The |
60 | // class is defined even for non-GPU devices since the |
61 | // OpKernelContext::Params structure wants to fill it in. |
62 | class PerOpGpuDevice { |
63 | public: |
64 | virtual ~PerOpGpuDevice() {} |
65 | virtual const Eigen::GpuDevice& device() const = 0; |
66 | }; |
67 | |
68 | // A class that devices can subclass to pass around |
69 | // Device-specific context to OpKernels. |
70 | class DeviceContext : public core::RefCounted { |
71 | public: |
72 | ~DeviceContext() override {} |
73 | virtual stream_executor::Stream* stream() const { return nullptr; } |
74 | virtual void MaintainLifetimeOnStream(const Tensor* t, |
75 | stream_executor::Stream* stream) const { |
76 | } |
77 | |
78 | // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into |
79 | // "device_tensor" which is on a non-CPU device "device". "device_tensor" |
80 | // must be allocated to be of the same size as "cpu_tensor". |
81 | virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device, |
82 | Tensor* device_tensor, StatusCallback done, |
83 | bool sync_dst_compute = true) const { |
84 | done(errors::Internal("Unrecognized device type in CPU-to-device Copy" )); |
85 | } |
86 | |
87 | // Same as CopyCPUTensorToDevice, but in a synchronous way. |
88 | Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device, |
89 | Tensor* device_tensor) const; |
90 | |
91 | // Copies a tensor in this device. |
92 | virtual void CopyTensorInSameDevice(const Tensor* input_tensor, |
93 | Device* device, Tensor* output_tensor, |
94 | StatusCallback done) const { |
95 | done(errors::Unimplemented("Copy in same device not implemented." )); |
96 | } |
97 | |
98 | // "device_tensor" is a tensor on a non-CPU device. Copies |
99 | // device_tensor into "cpu_tensor". "cpu_tensor" must be allocated |
100 | // to be of the same size as "device_tensor". |
101 | virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor, |
102 | StringPiece tensor_name, Device* device, |
103 | Tensor* cpu_tensor, StatusCallback done) { |
104 | done(errors::Internal("Unrecognized device type in device-to-CPU Copy" )); |
105 | } |
106 | |
107 | // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done. |
108 | Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor, |
109 | StringPiece tensor_name, Device* device, |
110 | Tensor* cpu_tensor); |
111 | |
112 | // If possible, wait for all events on *stream to complete then execute func. |
113 | // A non-OK Status is returned otherwise. The stream argument should be the |
114 | // one provided by AcceleratorDeviceInfo. This function is not applicable to |
115 | // devices that don't provide such a value. |
116 | virtual Status ThenExecute(Device* device, stream_executor::Stream* stream, |
117 | std::function<void()> func) { |
118 | return errors::Internal("ThenExecute not supported by device" ); |
119 | } |
120 | |
121 | // check if device is a pluggable device |
122 | virtual bool IsPluggableDevice() { return false; } |
123 | |
124 | // Returns the pinned host memory allocator for the device. |
125 | virtual Allocator* host_memory_allocator() const { return nullptr; } |
126 | }; |
127 | |
// Base class shared by all device implementations (CPU, GPU, TPU, plugins).
// Holds per-device state that the runtime and OpKernels query: worker thread
// pools, allocators, Eigen devices, and accelerator stream/context info.
class DeviceBase {
 public:
  // "env" must outlive this object; not owned.
  explicit DeviceBase(tsl::Env* env) : env_(env) {}
  virtual ~DeviceBase();

  tsl::Env* env() const { return env_; }

  // Thread pool used for CPU-side work on behalf of this device.
  struct CpuWorkerThreads {
    int num_threads = 0;
    tsl::thread::ThreadPool* workers = nullptr;  // not owned
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  // CHECK-fails if set_tensorflow_cpu_worker_threads() was never called.
  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
  // using a single stream.)
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  struct AcceleratorDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };

  // Does not take ownership.
  void set_tensorflow_accelerator_device_info(
      AcceleratorDeviceInfo* device_info) {
    accelerator_device_info_ = device_info;
  }

  // Returns nullptr for devices that never called
  // set_tensorflow_accelerator_device_info() (e.g. plain CPU devices).
  virtual const AcceleratorDeviceInfo* tensorflow_accelerator_device_info()
      const {
    return accelerator_device_info_;
  }

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual tsl::thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

  // Return the Allocator implementation to use based on the allocator
  // attributes requested. See allocator.h for more details.
  // LOG(FATAL)s here; concrete devices are expected to override.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }

  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    // The ResourceMgr argument is ignored; kept only for source compatibility.
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization. LOG(FATAL)s here; devices that support scoped allocation
  // must override.
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64_t step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  // Returns nullptr unless the device supports scoped allocation.
  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  // True iff set_eigen_cpu_device() has been called at least once.
  virtual bool has_eigen_cpu_device() const {
    return !eigen_cpu_devices_.empty();
  }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  // For wrapper devices (e.g. remote/renamed): the device doing the real
  // work. Base implementation returns `this`.
  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return OkStatus();
  }

  // Unimplemented by default; implemented by Device (see declaration notes).
  virtual const DeviceAttributes& attributes() const;
  // NUMA node of this device, derived from attributes().locality().
  virtual int NumaNode() const { return attributes().locality().numa_node(); }
  virtual const std::string& name() const;
  virtual const DeviceNameUtils::ParsedName& parsed_name() const;

  // Updates `attributes()`, indicating the XLA global ID associated with this
  // device. This ID is unique across clients in a multi-client setup. For TPUs
  // this does not happen until the TPU system has been initialized.
  //
  // Implemented in Device; no-op here.
  virtual void set_xla_global_id(int64_t id) {}

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory. Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }

  // Some devices (i.e. GPUs) may free device memory prior to its actual use
  // being completed on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses. If this function returns a
  // non-zero value it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses. For this to be useful the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }

  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
  // device. This function assumes that `output_tensor` has already been
  // allocated with a buffer that is large enough to hold `input_tensor`'s data.
  // Calls `done` from a device-specific thread after copy is finished, which
  // may be the same as calling thread.
  //
  // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy
  // is discouraged and should not be used in OpKernels.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Tensor* output_tensor,
                                      const DeviceContext* device_context,
                                      StatusCallback done) {
    done(errors::Internal("Device ", name(), " does not implement ",
                          "CopyTensorInSameDevice"));
  }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(tsl::thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  tsl::Env* const env_;                             // not owned
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;  // not owned
  // Set by GPUs as well as by TPU devices.
  AcceleratorDeviceInfo* accelerator_device_info_ = nullptr;  // not owned
  tsl::thread::ThreadPool* device_thread_pool_ = nullptr;     // not owned
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
};
297 | |
// Methods to create and check for Symbolic execution devices.
// Such devices are mostly used for TF-XLA bridge. TF should not treat these as
// normal devices.
//
// Registers `device_name` as a symbolic execution device. Presumably
// idempotent for repeated names — confirm in the implementation.
void AddSymbolicExecutionDevice(absl::string_view device_name);
// Returns true iff `device_name` was previously registered via
// AddSymbolicExecutionDevice().
bool IsSymbolicExecutionDevice(absl::string_view device_name);
303 | |
304 | } // namespace tensorflow |
305 | |
306 | #endif // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_ |
307 | |