/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_
#define TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_

#include <memory>
#include <string>
#include <vector>

#include "absl/base/macros.h"
#include "absl/strings/string_view.h"
#include "tensorflow/core/framework/device_attributes.pb.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/lib/core/stringpiece.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/threadpool.h"
#include "tensorflow/core/util/device_name_utils.h"

namespace Eigen {
struct ThreadPoolDevice;
}  // end namespace Eigen

namespace stream_executor {
class Stream;
}  // namespace stream_executor

namespace tsl {
class Env;
namespace thread {
class ThreadPool;
}  // namespace thread
}  // namespace tsl

namespace tensorflow {

class Device;
class DeviceAttributes;
class EventMgr;
class OpKernelContext;
class ResourceMgr;
class ScopedAllocatorMgr;
class TensorProto;

// A wrapper for an Eigen Gpu Device that includes per-op state. The
// class is defined even for non-GPU devices since the
// OpKernelContext::Params structure wants to fill it in.
class PerOpGpuDevice {
 public:
  virtual ~PerOpGpuDevice() {}
  virtual const Eigen::GpuDevice& device() const = 0;
};

// A class that devices can subclass to pass around
// Device-specific context to OpKernels.
class DeviceContext : public core::RefCounted {
 public:
  ~DeviceContext() override {}
  virtual stream_executor::Stream* stream() const { return nullptr; }
  virtual void MaintainLifetimeOnStream(const Tensor* t,
                                        stream_executor::Stream* stream) const {
  }

  // "cpu_tensor" is a tensor on a CPU. Copies "cpu_tensor" into
  // "device_tensor", which is on a non-CPU device "device". "device_tensor"
  // must be allocated to be of the same size as "cpu_tensor".
  virtual void CopyCPUTensorToDevice(const Tensor* cpu_tensor, Device* device,
                                     Tensor* device_tensor, StatusCallback done,
                                     bool sync_dst_compute = true) const {
    done(errors::Internal("Unrecognized device type in CPU-to-device Copy"));
  }

  // Same as `CopyCPUTensorToDevice`, but blocks until the copy is done.
  Status CopyCPUTensorToDeviceSync(const Tensor* cpu_tensor, Device* device,
                                   Tensor* device_tensor) const;
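
  // Example (an illustrative sketch only, not part of this API): inside an
  // OpKernel::Compute, `ctx` is the OpKernelContext, and the kernel's device
  // is assumed to be a full `Device` whose DeviceContext implements this copy.
  // All names below other than the methods of this class are assumptions.
  //
  //   Tensor cpu_tensor(DT_FLOAT, TensorShape({2, 2}));
  //   Tensor device_tensor;
  //   OP_REQUIRES_OK(ctx, ctx->allocate_temp(DT_FLOAT, cpu_tensor.shape(),
  //                                          &device_tensor));
  //   OP_REQUIRES_OK(ctx, ctx->op_device_context()->CopyCPUTensorToDeviceSync(
  //                           &cpu_tensor, static_cast<Device*>(ctx->device()),
  //                           &device_tensor));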

  // Copies a tensor in this device.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Device* device, Tensor* output_tensor,
                                      StatusCallback done) const {
    done(errors::Unimplemented("Copy in same device not implemented."));
  }

  // "device_tensor" is a tensor on a non-CPU device. Copies
  // "device_tensor" into "cpu_tensor". "cpu_tensor" must be allocated
  // to be of the same size as "device_tensor".
  virtual void CopyDeviceTensorToCPU(const Tensor* device_tensor,
                                     StringPiece tensor_name, Device* device,
                                     Tensor* cpu_tensor, StatusCallback done) {
    done(errors::Internal("Unrecognized device type in device-to-CPU Copy"));
  }

  // Same as `CopyDeviceTensorToCPU`, but blocks until the copy is done.
  Status CopyDeviceTensorToCPUSync(const Tensor* device_tensor,
                                   StringPiece tensor_name, Device* device,
                                   Tensor* cpu_tensor);

  // If possible, wait for all events on *stream to complete, then execute
  // func. A non-OK Status is returned otherwise. The stream argument should be
  // the one provided by AcceleratorDeviceInfo. This function is not applicable
  // to devices that don't provide such a value.
  virtual Status ThenExecute(Device* device, stream_executor::Stream* stream,
                             std::function<void()> func) {
    return errors::Internal("ThenExecute not supported by device");
  }
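
  // Example (an illustrative sketch only): defer work until the device's
  // compute stream has drained. `device`, `device_context`, and `buffer` are
  // assumptions, and the device is assumed to expose an AcceleratorDeviceInfo
  // (e.g. a GPU device).
  //
  //   const DeviceBase::AcceleratorDeviceInfo* info =
  //       device->tensorflow_accelerator_device_info();
  //   if (info != nullptr) {
  //     Status s = device_context->ThenExecute(
  //         device, info->stream, [buffer] { /* release `buffer` here */ });
  //   }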

  // Returns true if this device context is for a pluggable device.
  virtual bool IsPluggableDevice() { return false; }

  // Returns the pinned host memory allocator for the device.
  virtual Allocator* host_memory_allocator() const { return nullptr; }
};

class DeviceBase {
 public:
  explicit DeviceBase(tsl::Env* env) : env_(env) {}
  virtual ~DeviceBase();

  tsl::Env* env() const { return env_; }

  struct CpuWorkerThreads {
    int num_threads = 0;
    tsl::thread::ThreadPool* workers = nullptr;
  };

  // Does not take ownership.
  void set_tensorflow_cpu_worker_threads(CpuWorkerThreads* t) {
    cpu_worker_threads_ = t;
  }

  virtual const CpuWorkerThreads* tensorflow_cpu_worker_threads() const {
    CHECK(cpu_worker_threads_ != nullptr);
    return cpu_worker_threads_;
  }
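
  // Example (an illustrative sketch only) of scheduling a closure on the
  // device's CPU worker pool; `device` is an assumption and stands for any
  // DeviceBase whose worker threads have been set.
  //
  //   const DeviceBase::CpuWorkerThreads* wt =
  //       device->tensorflow_cpu_worker_threads();
  //   wt->workers->Schedule([] { /* ... do some CPU work ... */ });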

  // "stream" is used in special circumstances (such as the
  // constructors of Ops) where there is no available OpKernelContext.
  // "default_context" is used by OpKernelContext whenever a device does not
  // supply a DeviceContext for an op in TryGetDeviceContext() (e.g. when only
  // using a single stream.)
  // "event_mgr" is used to delay deallocation of temporary GPU buffers.
  // TODO(pbar) Work out how to move this out of DeviceBase.
  struct AcceleratorDeviceInfo {
    // Make sure all the defaults are NULL, so we can spot missing assignments.
    stream_executor::Stream* stream = nullptr;
    DeviceContext* default_context = nullptr;
    EventMgr* event_mgr = nullptr;
    int gpu_id = -1;
  };
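
  // Example (an illustrative sketch only) of how a device implementation might
  // populate this struct; `my_stream`, `my_default_context`, and `my_event_mgr`
  // are hypothetical members owned by that device, and the struct itself must
  // outlive its use because the setter below does not take ownership.
  //
  //   auto* info = new DeviceBase::AcceleratorDeviceInfo;
  //   info->stream = my_stream;
  //   info->default_context = my_default_context;
  //   info->event_mgr = my_event_mgr;
  //   info->gpu_id = 0;
  //   set_tensorflow_accelerator_device_info(info);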

  // Does not take ownership.
  void set_tensorflow_accelerator_device_info(
      AcceleratorDeviceInfo* device_info) {
    accelerator_device_info_ = device_info;
  }

  virtual const AcceleratorDeviceInfo* tensorflow_accelerator_device_info()
      const {
    return accelerator_device_info_;
  }

  // The preferred thread pool for this device. If it is nullptr, the system
  // automatically assigns a thread pool for execution.
  virtual tsl::thread::ThreadPool* tensorflow_device_thread_pool() {
    return device_thread_pool_;
  }

  // Does not take ownership.
  void set_eigen_cpu_device(Eigen::ThreadPoolDevice* d);

  // Return the Allocator implementation to use based on the allocator
  // attributes requested. See allocator.h for more details.
  virtual Allocator* GetAllocator(AllocatorAttributes /*attr*/) {
    LOG(FATAL) << "GetAllocator() is not implemented.";
    return nullptr;
  }
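
  // Example (an illustrative sketch only): requesting a host-memory-compatible
  // allocator from a device that implements GetAllocator(); `device` is an
  // assumption.
  //
  //   AllocatorAttributes attr;
  //   attr.set_on_host(true);         // ask for host (CPU-visible) memory
  //   attr.set_gpu_compatible(true);  // pinned so GPU DMA can use it
  //   Allocator* allocator = device->GetAllocator(attr);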

  // This method is provided for backwards compatibility, and will be removed
  // in a future release.
  ABSL_DEPRECATED("Use `this->GetAllocator()` or `this->GetScopedAllocator()`.")
  Allocator* GetStepAllocator(AllocatorAttributes attr, ResourceMgr*) {
    return GetAllocator(attr);
  }

  // Return an Allocator prepared for use in particular places by graph
  // optimization.
  virtual Allocator* GetScopedAllocator(AllocatorAttributes attr,
                                        int64_t step_id) {
    LOG(FATAL) << "Device does not implement GetScopedAllocator()";
    return nullptr;
  }

  virtual ScopedAllocatorMgr* GetScopedAllocatorMgr() const { return nullptr; }

  virtual bool has_eigen_cpu_device() const {
    return !eigen_cpu_devices_.empty();
  }

  virtual const Eigen::ThreadPoolDevice* eigen_cpu_device();
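
  // Example (an illustrative sketch only) of evaluating an Eigen expression on
  // this device's CPU thread pool; `in` and `out` are hypothetical float
  // Tensors of the same shape.
  //
  //   const Eigen::ThreadPoolDevice& d = *device->eigen_cpu_device();
  //   out.flat<float>().device(d) = in.flat<float>() * 2.0f;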

  // Caller owns the return value. The OpKernelContext calls this even
  // for devices that do not implement an eigen_gpu_device. Overridden
  // by GPU devices to return a derived type.
  virtual PerOpGpuDevice* MakeGpuDevice() { return nullptr; }

  virtual DeviceBase* UnderlyingDevice() { return this; }
  virtual const DeviceBase* UnderlyingDevice() const { return this; }

  // This is overridden by GPU devices to reinitialize the derived
  // type returned by MakeGpuDevice.
  virtual Status ReinitializeGpuDevice(OpKernelContext* /*context*/,
                                       PerOpGpuDevice* /*device*/,
                                       DeviceContext* /*dc*/,
                                       Allocator* /*allocator*/) {
    return OkStatus();
  }

  // Unimplemented by default
  virtual const DeviceAttributes& attributes() const;
  virtual int NumaNode() const { return attributes().locality().numa_node(); }
  virtual const std::string& name() const;
  virtual const DeviceNameUtils::ParsedName& parsed_name() const;

  // Updates `attributes()`, indicating the XLA global ID associated with this
  // device. This ID is unique across clients in a multi-client setup. For TPUs
  // this does not happen until the TPU system has been initialized.
  //
  // Implemented in Device.
  virtual void set_xla_global_id(int64_t id) {}

  // Materializes the given TensorProto into 'tensor' stored in Device
  // memory. Most devices will want to override this.
  //
  // TODO(vrv): We should be able to put this function into
  // OpKernelContext and handle the copies from device memory via send
  // and receive nodes, instead of requiring that each device handle
  // the copies here as well as in copy ops.
  virtual Status MakeTensorFromProto(const TensorProto& tensor_proto,
                                     const AllocatorAttributes alloc_attrs,
                                     Tensor* tensor) {
    return errors::Internal("Device does not implement MakeTensorFromProto()");
  }
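
  // Example (an illustrative sketch only): materializing a scalar float proto
  // into device memory on a device that overrides MakeTensorFromProto();
  // `device` is an assumption.
  //
  //   TensorProto proto;
  //   proto.set_dtype(DT_FLOAT);
  //   TensorShape({}).AsProto(proto.mutable_tensor_shape());
  //   proto.add_float_val(1.0f);
  //   Tensor tensor;
  //   Status s =
  //       device->MakeTensorFromProto(proto, AllocatorAttributes(), &tensor);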

  // Some devices (e.g. GPUs) may free device memory prior to its actual use
  // being completed on the assumption that subsequent allocations can only be
  // used serially with respect to pending uses. If this function returns a
  // non-zero value it is the value of a device-specific counter such that any
  // device memory tagged with an earlier freed-at count is really unencumbered
  // by pending uses. For this to be useful the device memory allocator must
  // be tagging deallocated memory chunks using the same counter.
  virtual uint64 SafeAllocFrontier(uint64 old_value) { return 0; }

  // Copies `input_tensor` to `output_tensor`, where both tensors are on this
  // device. This function assumes that `output_tensor` has already been
  // allocated with a buffer that is large enough to hold `input_tensor`'s data.
  // Calls `done` from a device-specific thread after the copy is finished,
  // which may be the same as the calling thread.
  //
  // NOTE(ayushd): This function is for TensorFlow internal use only. Deep copy
  // is discouraged and should not be used in OpKernels.
  virtual void CopyTensorInSameDevice(const Tensor* input_tensor,
                                      Tensor* output_tensor,
                                      const DeviceContext* device_context,
                                      StatusCallback done) {
    done(errors::Internal("Device ", name(), " does not implement ",
                          "CopyTensorInSameDevice"));
  }

 protected:
  // Does not take ownership.
  void set_tensorflow_device_thread_pool(tsl::thread::ThreadPool* thread_pool) {
    device_thread_pool_ = thread_pool;
  }

 private:
  tsl::Env* const env_;
  CpuWorkerThreads* cpu_worker_threads_ = nullptr;
  // Set by GPUs as well as by TPU devices.
  AcceleratorDeviceInfo* accelerator_device_info_ = nullptr;
  tsl::thread::ThreadPool* device_thread_pool_ = nullptr;
  std::vector<Eigen::ThreadPoolDevice*> eigen_cpu_devices_;
};

// Methods to create and check for Symbolic execution devices.
// Such devices are mostly used for the TF-XLA bridge. TF should not treat
// these as normal devices.
void AddSymbolicExecutionDevice(absl::string_view device_name);
bool IsSymbolicExecutionDevice(absl::string_view device_name);
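
// Example (an illustrative sketch only; the device name below is made up):
//
//   constexpr absl::string_view kXlaDevice =
//       "/job:localhost/replica:0/task:0/device:XLA_CPU:0";
//   AddSymbolicExecutionDevice(kXlaDevice);
//   CHECK(IsSymbolicExecutionDevice(kXlaDevice));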

}  // namespace tensorflow

#endif  // TENSORFLOW_CORE_FRAMEWORK_DEVICE_BASE_H_