/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.

#define EIGEN_USE_THREADS

#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
#define EIGEN_USE_GPU
#endif

#include "tensorflow/core/kernels/constant_op.h"

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/allocator.h"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/node_def.pb.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor.pb.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/framework/variant_op_registry.h"
#include "tensorflow/core/graph/graph_node_util.h"
#include "tensorflow/core/kernels/fill_functor.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/profiler/lib/scoped_memory_debug_annotation.h"

namespace tensorflow {

namespace {

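// Returns a NodeDef with the same name, op, device, dtype, and debug info as
// `ctx->def()`, but without the (potentially large) "value" attr. The kernel
// keeps the materialized tensor in `tensor_`, so retaining the original proto
// in the stored NodeDef would hold a second copy of the constant data.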
NodeDef StripTensorDataFromNodeDef(OpKernelConstruction* ctx) {
  const NodeDef& original = ctx->def();
  if (std::is_base_of<protobuf::Message, NodeDef>()) {
    DCHECK_EQ(reinterpret_cast<const protobuf::Message*>(&original)
                  ->GetDescriptor()
                  ->field_count(),
              7)
        << "The NodeDef format has changed, and the attr-stripping code may "
           "need to be updated.";
  }
  NodeDef ret;
  ret.set_name(original.name());
  ret.set_op(original.op());
  ret.set_device(original.device());
  // Strip the "value" attr from the returned NodeDef.
  // NOTE(mrry): The present implementation of `OpKernel::OpKernel()` only uses
  // attrs that affect the cardinality of list-typed inputs and outputs, so it
  // is safe to drop other attrs from the NodeDef.
  AddNodeAttr("dtype", ctx->output_type(0), &ret);
  MergeDebugInfo(original, &ret);
  if (original.has_experimental_type()) {
    *ret.mutable_experimental_type() = original.experimental_type();
  }
  return ret;
}

}  // namespace

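// A ConstantOp parses the "value" attr into `tensor_` exactly once, at kernel
// construction time, using the device's MakeTensorFromProto. Every subsequent
// Compute call simply re-emits that tensor.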
ConstantOp::ConstantOp(OpKernelConstruction* ctx)
    : OpKernel(ctx, StripTensorDataFromNodeDef(ctx), false),
      tensor_(ctx->output_type(0)) {
  const TensorProto* proto = nullptr;
  profiler::ScopedMemoryDebugAnnotation op_annotation(name_view().data());
  OP_REQUIRES_OK(ctx, ctx->GetAttr("value", &proto));
  OP_REQUIRES_OK(ctx, ctx->device()->MakeTensorFromProto(
                          *proto, AllocatorAttributes(), &tensor_));
  OP_REQUIRES(
      ctx, ctx->output_type(0) == tensor_.dtype(),
      errors::InvalidArgument("Type mismatch between value (",
                              DataTypeString(tensor_.dtype()), ") and dtype (",
                              DataTypeString(ctx->output_type(0)), ")"));
}

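// Outputs the tensor materialized in the constructor. When allocation
// tracking is enabled, the tensor's bytes are recorded as persistent memory.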
void ConstantOp::Compute(OpKernelContext* ctx) {
  ctx->set_output(0, tensor_);
  if (TF_PREDICT_FALSE(ctx->track_allocations())) {
    ctx->record_persistent_memory_allocation(tensor_.AllocatedBytes());
  }
}

ConstantOp::~ConstantOp() {}

REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_CPU), ConstantOp);
REGISTER_KERNEL_BUILDER(Name("Const").Device(DEVICE_TPU_SYSTEM), ConstantOp);

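// On CUDA/ROCm builds, Const is additionally registered directly on the GPU
// for the element types below, so constants of these types can be materialized
// in device memory.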
#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
#define REGISTER_KERNEL(D, TYPE)                                      \
  REGISTER_KERNEL_BUILDER(                                            \
      Name("Const").Device(DEVICE_##D).TypeConstraint<TYPE>("dtype"), \
      ConstantOp);
REGISTER_KERNEL(GPU, Eigen::half);
REGISTER_KERNEL(GPU, bfloat16);
REGISTER_KERNEL(GPU, float);
REGISTER_KERNEL(GPU, double);
REGISTER_KERNEL(GPU, uint8);
REGISTER_KERNEL(GPU, int8);
REGISTER_KERNEL(GPU, qint8);
REGISTER_KERNEL(GPU, uint16);
REGISTER_KERNEL(GPU, int16);
REGISTER_KERNEL(GPU, qint16);
REGISTER_KERNEL(GPU, quint16);
REGISTER_KERNEL(GPU, uint32);
REGISTER_KERNEL(GPU, qint32);
REGISTER_KERNEL(GPU, int64_t);
REGISTER_KERNEL(GPU, uint64);
REGISTER_KERNEL(GPU, complex64);
REGISTER_KERNEL(GPU, complex128);
REGISTER_KERNEL(GPU, bool);
REGISTER_KERNEL(GPU, Variant);
#undef REGISTER_KERNEL
#endif

#define REGISTER_DEFAULT_KERNEL(TYPE)                                     \
  REGISTER_KERNEL_BUILDER(                                                \
      Name("Const").Device(DEVICE_DEFAULT).TypeConstraint<TYPE>("dtype"), \
      ConstantOp);
TF_CALL_NUMBER_TYPES_NO_INT32(REGISTER_DEFAULT_KERNEL);
TF_CALL_QUANTIZED_TYPES(REGISTER_DEFAULT_KERNEL);
TF_CALL_qint16(REGISTER_DEFAULT_KERNEL);
TF_CALL_quint16(REGISTER_DEFAULT_KERNEL);
TF_CALL_bool(REGISTER_DEFAULT_KERNEL);
TF_CALL_variant(REGISTER_DEFAULT_KERNEL);
#undef REGISTER_DEFAULT_KERNEL

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

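// Fill takes a 1-D "dims" tensor describing the output shape and a scalar
// "value", and produces a tensor of that shape with every element set to the
// value. For example, dims = [2, 3] and value = 9 yield a 2x3 tensor of 9s.
// `Index` is the integer type of the "dims" input (int32 or int64).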
template <typename Device, typename T, typename Index>
class FillOp : public OpKernel {
 public:
  explicit FillOp(OpKernelConstruction* context) : OpKernel(context) {}

  void Compute(OpKernelContext* context) override {
    const Tensor& Tdims = context->input(0);
    OP_REQUIRES(
        context,
        // TODO(rmlarsen): Disallow legacy use of scalars to represent shape.
        (TensorShapeUtils::IsVector(Tdims.shape()) ||
         TensorShapeUtils::IsScalar(Tdims.shape())),
        errors::InvalidArgument("dims must represent a vector, got shape ",
                                Tdims.shape().DebugString()));
    const Tensor& Tvalue = context->input(1);
    OP_REQUIRES(
        context,
        // TODO(rmlarsen): Disallow legacy use of length-1 vector to represent
        // scalar.
        TensorShapeUtils::IsScalar(Tvalue.shape()) ||
            (TensorShapeUtils::IsVector(Tvalue.shape()) &&
             Tvalue.shape().dim_size(0) == 1),
        errors::InvalidArgument("value must represent a scalar, got shape ",
                                Tvalue.shape().DebugString()));
    auto dims = Tdims.flat<Index>();
    TensorShape shape;
    OP_REQUIRES_OK(context, TensorShapeUtils::MakeShape(
                                reinterpret_cast<const Index*>(dims.data()),
                                dims.size(), &shape));
    Tensor* out = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, shape, &out));
    functor::FillFunctor<Device, T> functor;
    functor(context->eigen_device<Device>(), out->flat<T>(),
            Tvalue.scalar<T>());
  }
};

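// Each element type gets two Fill registrations, one per "index_type" (int32
// and int64) of the "dims" input. The "dims" input is always kept in host
// memory because the kernel reads it directly to compute the output shape.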
#define REGISTER_KERNEL(D, TYPE)                                     \
  REGISTER_KERNEL_BUILDER(Name("Fill")                               \
                              .Device(DEVICE_##D)                    \
                              .TypeConstraint<TYPE>("T")             \
                              .TypeConstraint<int32>("index_type")   \
                              .HostMemory("dims"),                   \
                          FillOp<D##Device, TYPE, int32>);           \
  REGISTER_KERNEL_BUILDER(Name("Fill")                               \
                              .Device(DEVICE_##D)                    \
                              .TypeConstraint<TYPE>("T")             \
                              .TypeConstraint<int64_t>("index_type") \
                              .HostMemory("dims"),                   \
                          FillOp<D##Device, TYPE, int64_t>);

#define REGISTER_CPU_KERNEL(TYPE) REGISTER_KERNEL(CPU, TYPE)
TF_CALL_ALL_TYPES(REGISTER_CPU_KERNEL);
// TODO(b/28917570): Add a test for this. Currently python 3 is not happy about
// the conversion from uint8 to quint8.
REGISTER_KERNEL(CPU, quint8);
REGISTER_KERNEL(CPU, quint16);
REGISTER_KERNEL(CPU, qint8);
REGISTER_KERNEL(CPU, qint16);
REGISTER_KERNEL(CPU, qint32);
#undef REGISTER_CPU_KERNEL

#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
REGISTER_KERNEL(GPU, Eigen::half);
REGISTER_KERNEL(GPU, bfloat16);
REGISTER_KERNEL(GPU, float);
REGISTER_KERNEL(GPU, double);
REGISTER_KERNEL(GPU, complex64);
REGISTER_KERNEL(GPU, complex128);
REGISTER_KERNEL(GPU, uint8);
REGISTER_KERNEL(GPU, int8);
REGISTER_KERNEL(GPU, uint16);
REGISTER_KERNEL(GPU, int16);
REGISTER_KERNEL(GPU, int64_t);
REGISTER_KERNEL(GPU, bool);
// Currently we do not support filling strings on GPU.

// A special DEVICE_DEFAULT kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("Fill")
                            .Device(DEVICE_DEFAULT)
                            .TypeConstraint<int32>("T")
                            .TypeConstraint<int32>("index_type")
                            .HostMemory("dims")
                            .HostMemory("value")
                            .HostMemory("output"),
                        FillOp<CPUDevice, int32, int32>);
#endif

#undef REGISTER_KERNEL

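// ZerosLike returns a tensor of zeros with the same shape and dtype as its
// input. Variant inputs must be scalars: the output is allocated on the host
// and produced via the registered ZEROS_LIKE variant unary op. For all other
// types the kernel reuses the input buffer when possible and zero-fills it.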
template <typename Device, typename T>
class ZerosLikeOp : public OpKernel {
 public:
  explicit ZerosLikeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    const Device& d = ctx->eigen_device<Device>();
    if (std::is_same<T, Variant>::value) {
      OP_REQUIRES(
          ctx, input.dims() == 0,
          errors::InvalidArgument("ZerosLike non-scalar Tensor with "
                                  "dtype=DT_VARIANT is not supported."));
      const Variant& v = input.scalar<Variant>()();
      // DT_VARIANT tensors must be allocated on CPU since they wrap C++
      // objects which cannot be efficiently represented in GPU memory.
      int numa_node = ctx->device()->NumaNode();
      Tensor out(cpu_allocator(numa_node), DT_VARIANT, TensorShape({}));
      Variant* out_v = &(out.scalar<Variant>()());
      OP_REQUIRES_OK(ctx, UnaryOpVariant<Device>(
                              ctx, ZEROS_LIKE_VARIANT_UNARY_OP, v, out_v));
      ctx->set_output(0, out);
    } else {
      Tensor* out = nullptr;
      OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
                              {0}, 0, input.shape(), &out));
      functor::SetZeroFunctor<Device, T> f;
      f(d, out->flat<T>());
    }
  }
};

#define REGISTER_KERNEL(type, dev)                                      \
  REGISTER_KERNEL_BUILDER(                                              \
      Name("ZerosLike").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
      ZerosLikeOp<dev##Device, type>)

#define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU)
TF_CALL_POD_STRING_TYPES(REGISTER_CPU);
REGISTER_CPU(Variant);
#undef REGISTER_CPU

#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
#if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
REGISTER_KERNEL(bool, GPU);
REGISTER_KERNEL(Eigen::half, GPU);
REGISTER_KERNEL(float, GPU);
REGISTER_KERNEL(double, GPU);
REGISTER_KERNEL(int64_t, GPU);
REGISTER_KERNEL(complex64, GPU);
REGISTER_KERNEL(complex128, GPU);
#endif

REGISTER_KERNEL(bfloat16, GPU);
REGISTER_KERNEL(Variant, GPU);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#undef REGISTER_KERNEL

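// Like the DEVICE_DEFAULT Fill registration above, int32 ZerosLike on non-CPU
// devices keeps its output in host memory and uses the CPU kernel
// implementation.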
REGISTER_KERNEL_BUILDER(Name("ZerosLike")
                            .Device(DEVICE_DEFAULT)
                            .TypeConstraint<int32>("T")
                            .HostMemory("y"),
                        ZerosLikeOp<CPUDevice, int32>);

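// OnesLike returns a tensor with the same shape and dtype as its input, with
// every element set to one. The input buffer is reused when possible.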
template <typename Device, typename T>
class OnesLikeOp : public OpKernel {
 public:
  explicit OnesLikeOp(OpKernelConstruction* ctx) : OpKernel(ctx) {}

  void Compute(OpKernelContext* ctx) override {
    const Tensor& input = ctx->input(0);
    Tensor* out = nullptr;
    OP_REQUIRES_OK(ctx, ctx->forward_input_or_allocate_output(
                            {0}, 0, input.shape(), &out));
    functor::SetOneFunctor<Device, T> f;
    f(ctx->eigen_device<Device>(), out->flat<T>());
  }
};

#define REGISTER_KERNEL(type, dev)                                     \
  REGISTER_KERNEL_BUILDER(                                             \
      Name("OnesLike").Device(DEVICE_##dev).TypeConstraint<type>("T"), \
      OnesLikeOp<dev##Device, type>)

#define REGISTER_CPU(type) REGISTER_KERNEL(type, CPU)
TF_CALL_POD_TYPES(REGISTER_CPU);
#undef REGISTER_CPU

#if (defined(GOOGLE_CUDA) && GOOGLE_CUDA) || \
    (defined(TENSORFLOW_USE_ROCM) && TENSORFLOW_USE_ROCM)
#if !defined(MLIR_GENERATED_GPU_KERNELS_ENABLED)
REGISTER_KERNEL(bool, GPU);
REGISTER_KERNEL(Eigen::half, GPU);
REGISTER_KERNEL(float, GPU);
REGISTER_KERNEL(double, GPU);
REGISTER_KERNEL(int64_t, GPU);
REGISTER_KERNEL(complex64, GPU);
REGISTER_KERNEL(complex128, GPU);
#endif
REGISTER_KERNEL(bfloat16, GPU);
REGISTER_KERNEL_BUILDER(Name("OnesLike")
                            .Device(DEVICE_DEFAULT)
                            .TypeConstraint<int32>("T")
                            .HostMemory("y"),
                        OnesLikeOp<CPUDevice, int32>);
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#undef REGISTER_KERNEL

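// A Placeholder holds no value of its own; it must always be fed at run time.
// Compute therefore unconditionally fails, reporting the expected dtype and,
// when known, the expected shape.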
PlaceholderOp::PlaceholderOp(OpKernelConstruction* ctx) : OpKernel(ctx) {
  OP_REQUIRES_OK(ctx, ctx->GetAttr("shape", &expected_shape_));
}

void PlaceholderOp::Compute(OpKernelContext* ctx) {
  if (expected_shape_.dims() > 0) {
    OP_REQUIRES(ctx, false,
                errors::InvalidArgument(
                    "You must feed a value for placeholder tensor '", name(),
                    "' with dtype ", DataTypeString(output_type(0)),
                    " and shape ", expected_shape_.DebugString()));
  } else {
    OP_REQUIRES(ctx, false,
                errors::InvalidArgument(
                    "You must feed a value for placeholder tensor '", name(),
                    "' with dtype ", DataTypeString(output_type(0))));
  }
}

REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE_CPU), PlaceholderOp);
REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE_CPU),
                        PlaceholderOp);
// The following GPU/Default kernel registrations cover the case where a
// placeholder is created in a GPU device context with soft placement disabled.
// Since a placeholder should never be executed, adding these kernels has no
// effect on graph execution.
REGISTER_KERNEL_BUILDER(Name("Placeholder").Device(DEVICE_DEFAULT),
                        PlaceholderOp);
REGISTER_KERNEL_BUILDER(Name("PlaceholderV2").Device(DEVICE_DEFAULT),
                        PlaceholderOp);
}  // namespace tensorflow