/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/data_flow_ops.cc.

#define EIGEN_USE_THREADS

#include <limits>
#include <vector>
// TODO(b/31496047): Fix non-standard include order.
#include <numeric>  // clang-format off

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/resource_mgr.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_util.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/concat_lib.h"
#include "tensorflow/core/kernels/split_lib.h"
#include "tensorflow/core/kernels/tensor_array.h"
#include "tensorflow/core/lib/core/errors.h"
#include "tensorflow/core/lib/core/refcount.h"
#include "tensorflow/core/lib/strings/strcat.h"
#include "tensorflow/core/platform/dynamic_annotations.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/thread_annotations.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/ptr_util.h"

typedef Eigen::ThreadPoolDevice CPUDevice;
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
typedef Eigen::GpuDevice GPUDevice;
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// clang-format on

namespace tensorflow {

Status GetHandle(OpKernelContext* ctx, string* container, string* ta_handle) {
  {
    Tensor tensor;
    // Assuming that handle is the input at index 0.
    if (IsRefType(ctx->input_dtype(0))) {
      tensor = ctx->mutable_input(0, false);
    } else {
      tensor = ctx->input(0);
    }
    if (tensor.NumElements() != 2) {
      return errors::InvalidArgument(
          "Tensor array handle must be 2-element vector, but had shape: ",
          tensor.shape().DebugString());
    }
    auto h = tensor.flat<tstring>();
    *container = h(0);
    *ta_handle = h(1);
  }
  return OkStatus();
}

Status GetTensorArray(OpKernelContext* ctx, TensorArray** tensor_array) {
  string container;
  string ta_handle;
  if (ctx->input_dtype(0) != DT_RESOURCE) {
    TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &ta_handle));
    ResourceMgr* rm = ctx->resource_manager();
    if (rm == nullptr) return errors::Internal("No resource manager.");
    TF_RETURN_IF_ERROR(
        ctx->step_container()->Lookup(rm, container + ta_handle, tensor_array));
    return OkStatus();
  } else {
    return LookupResource(ctx, HandleFromInput(ctx, 0), tensor_array);
  }
}

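// The TensorArray ops thread a dummy scalar float "flow" value through the
// graph: ops that mutate the array take "flow_in" and emit "flow_out", and
// readers take "flow_in". The value itself is meaningless; it exists only so
// the executor sees data dependencies between operations on the same
// TensorArray and runs them in the intended order. Roughly (illustrative
// Python sketch, not part of this file):
//
//   ta = tf.TensorArray(tf.float32, size=2)  # TensorArray(V3) kernel
//   ta = ta.write(0, x)                      # TensorArrayWrite(V3): new flow
//   y = ta.read(0)                           # TensorArrayRead(V3): uses flow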
Status SetupFlowControlInputs(OpKernelContext* ctx, bool set_output) {
  const Tensor* flow_control;
  TF_RETURN_IF_ERROR(ctx->input("flow_in", &flow_control));
  if (set_output) {
    TF_RETURN_IF_ERROR(ctx->set_output("flow_out", *flow_control));
  }
  return OkStatus();
}

// CREATION *******************************************************************

// Virtual class for shared behavior between TensorArrayOp and
// TensorArrayGradOp.
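// Compute() allocates the two-string handle tensor on the host, delegates the
// actual construction to the subclass's CreateTensorArray(), and then emits
// the handle output (as a ref, a DT_STRING handle, or a DT_RESOURCE handle,
// depending on the expected output dtype) plus an optional scalar float flow
// output.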
class TensorArrayCreationOp : public OpKernel {
 public:
  explicit TensorArrayCreationOp(OpKernelConstruction* context)
      : OpKernel(context), device_type_(context->device_type()) {}

  void Compute(OpKernelContext* ctx) override {
    Tensor tensor_array_output_handle;

    AllocatorAttributes alloc_attr;
    alloc_attr.set_on_host(true);
    OP_REQUIRES_OK(ctx, ctx->allocate_temp(
                            tensorflow::DT_STRING, tensorflow::TensorShape({2}),
                            &tensor_array_output_handle, alloc_attr));
    // Store the handle in a per-step container of the RM.
    ResourceMgr* rm = ctx->resource_manager();
    OP_REQUIRES(ctx, rm != nullptr, errors::Internal("No resource manager."));

    TensorArray* output_tensor_array;
    OP_REQUIRES_OK(ctx, CreateTensorArray(ctx, rm, &tensor_array_output_handle,
                                          &output_tensor_array));
    if (IsRefType(ctx->expected_output_dtype(0))) {
      ctx->set_output_ref(0, output_tensor_array->mu(),
                          output_tensor_array->handle());
    } else if (ctx->expected_output_dtype(0) == DT_STRING) {
      ctx->set_output(0, *output_tensor_array->handle());
    } else {
      Tensor* handle;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &handle));
      handle->flat<ResourceHandle>()(0) =
          output_tensor_array->resource_handle(ctx);
    }
    if (ctx->num_outputs() == 2) {
      // Create the flow output.
      Tensor* flow;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(1, TensorShape({}), &flow));
      if (device_type_ == DEVICE_CPU) {
        // Value doesn't matter, but this makes msan not complain about
        // copying an uninitialized value. To do this on GPU would require
        // a kernel launch or a host->device memcpy, so we avoid that.
        flow->flat<float>()(0) = 0;
      }
    }
  }

 protected:
  virtual Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                                   Tensor* tensor_array_output_handle,
                                   TensorArray** output_tensor_array) = 0;

 private:
  const DeviceType device_type_;
};

// A per-run local tensor array. The tensor array uses a "per-step" resource
// manager, which ensures correct garbage collection on error or successful
// completion.
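// The handle produced here is a 2-element DT_STRING vector: element 0 is the
// container name ("_tensor_arrays") and element 1 is a unique TensorArray
// name; their concatenation is the key under which the TensorArray is stored
// in the per-step resource container.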
class TensorArrayOp : public TensorArrayCreationOp {
 public:
  explicit TensorArrayOp(OpKernelConstruction* context)
      : TensorArrayCreationOp(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
    OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
    OP_REQUIRES_OK(context, context->GetAttr("dynamic_size", &dynamic_size_));
    // The HasAttr check is for backwards compatibility with older op
    // versions which do not have this attribute.
    if (context->HasAttr("identical_element_shapes")) {
      OP_REQUIRES_OK(context, context->GetAttr("identical_element_shapes",
                                               &identical_element_shapes_));
    } else {
      identical_element_shapes_ = false;
    }
    OP_REQUIRES_OK(context,
                   context->GetAttr("clear_after_read", &clear_after_read_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("tensor_array_name", &tensor_array_name_));
    if (tensor_array_name_.empty()) tensor_array_name_ = name();
  }

  Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                           Tensor* tensor_array_output_handle,
                           TensorArray** output_tensor_array) override {
    const Tensor* tensor_size;
    TF_RETURN_IF_ERROR(ctx->input("size", &tensor_size));

    if (!TensorShapeUtils::IsScalar(tensor_size->shape())) {
      return errors::InvalidArgument(
          "TensorArray size must be scalar, but had shape: ",
          tensor_size->shape().DebugString());
    }
    const int32_t size = tensor_size->scalar<int32>()();
    if (size < 0) {
      return errors::InvalidArgument("Size should be >= 0.");
    }

    auto handle = tensor_array_output_handle->flat<tstring>();
    string unique_tensor_array_name =
        strings::StrCat(tensor_array_name_, "_",
                        TensorArray::tensor_array_counter.fetch_add(1));
    handle(0) = "_tensor_arrays";
    handle(1) = unique_tensor_array_name;

    auto key = strings::StrCat(handle(0), unique_tensor_array_name);

    TensorArray* tensor_array = new TensorArray(
        key, dtype_, *tensor_array_output_handle, size, element_shape_,
        identical_element_shapes_, dynamic_size_,
        false /* multiple_writes_aggregate */, false /* is_grad */,
        -1 /* marked_size */, clear_after_read_);

    TF_RETURN_IF_ERROR(ctx->step_container()->Create(rm, key, tensor_array));

    *output_tensor_array = tensor_array;

    return OkStatus();
  }

 private:
  DataType dtype_;
  PartialTensorShape element_shape_;
  bool identical_element_shapes_;
  bool dynamic_size_;
  bool clear_after_read_;
  string tensor_array_name_;  // The name used to create the TensorArray.

  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayOp);
};

REGISTER_KERNEL_BUILDER(Name("TensorArray").Device(DEVICE_CPU), TensorArrayOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayV2").Device(DEVICE_CPU),
                        TensorArrayOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayV3").Device(DEVICE_CPU),
                        TensorArrayOp);

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                   \
  REGISTER_KERNEL_BUILDER(Name("TensorArray")                \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<type>("dtype") \
                              .HostMemory("size")            \
                              .HostMemory("handle"),         \
                          TensorArrayOp);                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayV2")              \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<type>("dtype") \
                              .HostMemory("size")            \
                              .HostMemory("handle"),         \
                          TensorArrayOp);                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayV3")              \
                              .Device(DEVICE_GPU)            \
                              .TypeConstraint<type>("dtype") \
                              .HostMemory("size")            \
                              .HostMemory("handle"),         \
                          TensorArrayOp);

TF_CALL_int64(REGISTER_GPU);
TF_CALL_bfloat16(REGISTER_GPU);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// GRADIENT *******************************************************************
// Note that this op may have an optional third input. If present, it
// represents a shape value. It indicates that the element shape of this
// gradient array is that shape value concatenated with the element shape of
// the original tensor array. See TensorArrayGradWithShape.
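// For example, if the forward TensorArray has element shape [3, 4] and the
// shape input is [2], each element of the gradient TensorArray has shape
// [2, 3, 4].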
class TensorArrayGradOp : public TensorArrayCreationOp {
 public:
  explicit TensorArrayGradOp(OpKernelConstruction* context)
      : TensorArrayCreationOp(context) {
    OP_REQUIRES_OK(context, context->GetAttr("source", &source_));
  }

  Status CreateTensorArray(OpKernelContext* ctx, ResourceMgr* rm,
                           Tensor* tensor_array_output_handle,
                           TensorArray** output_tensor_array) override {
    string container;
    string tensor_array_name;
    if (ctx->input_dtype(0) != DT_RESOURCE) {
      TF_RETURN_IF_ERROR(GetHandle(ctx, &container, &tensor_array_name));
      if (container != "_tensor_arrays") {
        return errors::InvalidArgument(
            "Input container should be '_tensor_arrays', but received '",
            container, "'");
      }
    } else {
      container = "_tensor_arrays";
      const auto& resource = ctx->input(0).flat<ResourceHandle>()(0);
      if (StringPiece(resource.name()).substr(0, container.size()) !=
          container) {
        return errors::InvalidArgument("Wrong input container. ",
                                       resource.name());
      }
      tensor_array_name =
          string(StringPiece(resource.name()).substr(container.size()));
    }

    auto output_handle = tensor_array_output_handle->flat<tstring>();
    output_handle(0) = "_tensor_array_grads";
    output_handle(1) = strings::StrCat(tensor_array_name, "@", source_);

    TensorArray* tensor_array;
    TF_RETURN_IF_ERROR(ctx->step_container()->Lookup(
        rm, strings::StrCat(container, tensor_array_name), &tensor_array));
    core::ScopedUnref unref(tensor_array);

    // Once gradients are being calculated, the forward TensorArray
    // may no longer be resized by new Writes.
    tensor_array->DisableDynamicSize();

    int32_t array_size = 0;
    int32_t marked_size = 0;
    TF_RETURN_IF_ERROR(tensor_array->Size(&array_size));
    TF_RETURN_IF_ERROR(tensor_array->MarkedSize(&marked_size));

    if (array_size < 0) {
      return errors::InvalidArgument("ArraySize should be >= 0.");
    }
    if (!tensor_array->GradientsAllowed()) {
      return errors::InvalidArgument(
          "Unable to create a gradients TensorArray for ", tensor_array_name,
          ". Perhaps you used the multiple_writes_aggregate flag on a "
          "previous write? Gradient calculation is impossible when multiple "
          "writes are performed to the same index.");
    }
    TensorShape shape_to_prepend;
    auto element_shape = PartialTensorShape();
    if (ctx->num_inputs() > 2) {
      TF_RETURN_IF_ERROR(tensor::MakeShape(ctx->input(2), &shape_to_prepend));
      auto ta_element_shape = tensor_array->ElemShape();
      if (!ta_element_shape.unknown_rank()) {
        std::vector<int64_t> dims;
        for (auto dim : shape_to_prepend) {
          dims.push_back(dim.size);
        }
        for (auto dim : ta_element_shape) {
          dims.push_back(dim.size);
        }
        TF_RETURN_IF_ERROR(TensorShapeUtils::MakeShape(
            gtl::ArraySlice<int64_t>(dims), &element_shape));
      }
    } else {
      element_shape = tensor_array->ElemShape();
    }

    const auto key = strings::StrCat(output_handle(0), output_handle(1));
    auto creator = [key, tensor_array, array_size, marked_size, element_shape,
                    shape_to_prepend,
                    tensor_array_output_handle](TensorArray** ret) -> Status {
      *ret = new TensorArray(
          key, tensor_array->ElemType(), *tensor_array_output_handle,
          array_size, element_shape, tensor_array->HasIdenticalElementShapes(),
          false /* dynamic_size */, true /* multiple_writes_aggregate */,
          true /* is_grad */, marked_size /* marked_size */,
          true /* clear_after_read */);
      return (*ret)->CopyShapesFrom(tensor_array, &shape_to_prepend);
    };

    Status s = ctx->step_container()->LookupOrCreate<TensorArray>(
        rm, key, output_tensor_array, creator);
    (*output_tensor_array)->Unref();

    return s;
  }

 private:
  // The gradient source for creating the given
  // gradient TensorArray. This should be unique to each gradients
  // call. Typical values look like "gradients", "gradients_1", ...
  string source_;

  TF_DISALLOW_COPY_AND_ASSIGN(TensorArrayGradOp);
};

REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape").Device(DEVICE_CPU),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGrad")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradV3")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayGradWithShape")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("shape_to_prepend")
                            .HostMemory("grad_handle"),
                        TensorArrayGradOp);

// WRITE **********************************************************************

template <typename Device, typename T>
class TensorArrayWriteOp : public OpKernel {
 public:
  explicit TensorArrayWriteOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));

    const Tensor* tensor_index;
    const Tensor* tensor_value;
    OP_REQUIRES_OK(ctx, ctx->input("index", &tensor_index));
    OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));

    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_index->shape()),
                errors::InvalidArgument(
                    "TensorArray index must be scalar, but had shape: ",
                    tensor_index->shape().DebugString()));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    const int32_t index = tensor_index->scalar<int32>()();
    OP_REQUIRES(
        ctx, tensor_value->dtype() == tensor_array->ElemType(),
        errors::InvalidArgument("TensorArray dtype is ",
                                DataTypeString(tensor_array->ElemType()),
                                " but Op is trying to write dtype ",
                                DataTypeString(tensor_value->dtype()), "."));
    Status s =
        tensor_array->WriteOrAggregate<Device, T>(ctx, index, tensor_value);
    OP_REQUIRES_OK(ctx, s);
  }
};

#define REGISTER_WRITE(type)                                                   \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayWrite").Device(DEVICE_CPU).TypeConstraint<type>("T"),   \
      TensorArrayWriteOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayWriteV2").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArrayWriteOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayWriteV3").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArrayWriteOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER_WRITE);

#undef REGISTER_WRITE

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                      \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayWrite")              \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("handle")             \
                              .HostMemory("index"),             \
                          TensorArrayWriteOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV2")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("handle")             \
                              .HostMemory("index"),             \
                          TensorArrayWriteOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayWriteV3")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("handle")             \
                              .HostMemory("index"),             \
                          TensorArrayWriteOp<GPUDevice, type>);

TF_CALL_bfloat16(REGISTER_GPU);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// READ ***********************************************************************

template <typename Device, typename T>
class TensorArrayReadOp : public OpKernel {
 public:
  explicit TensorArrayReadOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
  }

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));

    const Tensor* tensor_index;
    OP_REQUIRES_OK(ctx, ctx->input("index", &tensor_index));

    OP_REQUIRES(ctx, TensorShapeUtils::IsScalar(tensor_index->shape()),
                errors::InvalidArgument(
                    "TensorArray index must be scalar, but had shape: ",
                    tensor_index->shape().DebugString()));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);

    const int32_t index = tensor_index->scalar<int32>()();
    OP_REQUIRES(
        ctx, dtype_ == tensor_array->ElemType(),
        errors::InvalidArgument(
            "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
            " but Op requested dtype ", DataTypeString(dtype_), "."));
    Tensor value;
    Status s = tensor_array->Read<Device, T>(ctx, index, &value);
    OP_REQUIRES_OK(ctx, s);
    ctx->set_output(0, value);
  }

 private:
  DataType dtype_;
};

#define REGISTER_READ(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayRead")              \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<type>("dtype"),  \
                          TensorArrayReadOp<CPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2")            \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<type>("dtype"),  \
                          TensorArrayReadOp<CPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV3")            \
                              .Device(DEVICE_CPU)              \
                              .TypeConstraint<type>("dtype"),  \
                          TensorArrayReadOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER_READ)

#undef REGISTER_READ

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                     \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayRead")              \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<type>("dtype")   \
                              .HostMemory("handle")            \
                              .HostMemory("index"),            \
                          TensorArrayReadOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV2")            \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<type>("dtype")   \
                              .HostMemory("handle")            \
                              .HostMemory("index"),            \
                          TensorArrayReadOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayReadV3")            \
                              .Device(DEVICE_GPU)              \
                              .TypeConstraint<type>("dtype")   \
                              .HostMemory("handle")            \
                              .HostMemory("index"),            \
                          TensorArrayReadOp<GPUDevice, type>);

TF_CALL_int64(REGISTER_GPU);
TF_CALL_bfloat16(REGISTER_GPU);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// PACK and GATHER ************************************************************

// Concatenate the elements in a TensorArray. All elements must be
// defined and have the same shape.
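// For example, gathering indices {0, 1, 2} from a TensorArray whose elements
// have shape [3, 4] produces a single tensor of shape [3, 3, 4]: the elements
// are stacked along a new leading dimension of size num_indices.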
template <typename Device, typename T, bool LEGACY_PACK>
class TensorArrayPackOrGatherOp : public OpKernel {
 public:
  typedef typename TTypes<T, 2>::ConstMatrix ConstMatrix;
  typedef std::vector<std::unique_ptr<ConstMatrix> > ConstMatrixVector;

  explicit TensorArrayPackOrGatherOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
    OP_REQUIRES_OK(context, context->GetAttr("element_shape", &element_shape_));
  }

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));

    core::ScopedUnref unref(tensor_array);
    OP_REQUIRES(
        ctx, dtype_ == tensor_array->ElemType(),
        errors::InvalidArgument(
            "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
            " but Op requested dtype ", DataTypeString(dtype_), "."));

    // Ensure new element shape is compatible with the one stored in the
    // TensorArray.
    OP_REQUIRES_OK(ctx, tensor_array->SetElemShape(element_shape_));

    int32_t num_indices;
    std::vector<Tensor> values;
    std::vector<int32> indices;
    if (LEGACY_PACK) {
      OP_REQUIRES_OK(ctx, tensor_array->PackOrConcatSize(&num_indices));
      indices.resize(num_indices);
      std::iota(indices.begin(), indices.end(), 0);
    } else {
      const Tensor* tensor_indices;
      OP_REQUIRES_OK(ctx, ctx->input("indices", &tensor_indices));
      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_indices->shape()),
                  errors::InvalidArgument(
                      "Expected indices to be a vector, but received shape: ",
                      tensor_indices->shape().DebugString()));
      const auto indices_t = tensor_indices->vec<int32>();
      num_indices = tensor_indices->NumElements();
      indices.resize(num_indices);
      std::copy(indices_t.data(), indices_t.data() + num_indices,
                indices.begin());
    }

    // If there are no elements to return, return a zero-element Tensor with
    // shape [0] + element_shape_
    if (num_indices == 0) {
      OP_REQUIRES(ctx, element_shape_.IsFullyDefined(),
                  errors::Unimplemented(
                      "TensorArray has size zero, but element shape ",
                      element_shape_.DebugString(),
                      " is not fully defined. "
                      "Currently only static shapes are supported when packing "
                      "zero-size TensorArrays."));
      TensorShape empty_shape;
      element_shape_.AsTensorShape(&empty_shape);
      empty_shape.InsertDim(0, 0);
      Tensor* empty_unused;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &empty_unused));
      return;
    }

    // Read all the Tensors into a vector to keep track of their memory.
    Status s = tensor_array->ReadMany<Device, T>(ctx, indices, &values);
    OP_REQUIRES_OK(ctx, s);

    const Tensor* value_0_t = &values[0];

    OP_REQUIRES(
        ctx, element_shape_.IsCompatibleWith(value_0_t->shape()),
        errors::InvalidArgument("TensorArray was passed element_shape ",
                                element_shape_.DebugString(),
                                " which does not match the Tensor at index 0: ",
                                value_0_t->shape().DebugString()));

    TensorShape output_shape(value_0_t->shape());
    output_shape.InsertDim(0, num_indices);

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));

    // If output_tensor is empty, there is nothing to concatenate so return it.
    if (output_shape.num_elements() == 0) {
      return;
    }

    ConstMatrixVector input_tensors_flat;
    input_tensors_flat.reserve(num_indices);
    auto output_flat =
        output_tensor->shaped<T, 2>({1, output_shape.num_elements()});

    // Insert the first value
    input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
        value_0_t->shaped<T, 2>({1, value_0_t->NumElements()})));

    for (int i = 1; i < num_indices; ++i) {
      const Tensor* value_t = &values[i];
      OP_REQUIRES(
          ctx, value_0_t->shape() == value_t->shape(),
          errors::InvalidArgument(
              "TensorArray has inconsistent shapes. Index 0 has shape: ",
              value_0_t->shape().DebugString(), " but index ", i,
              " has shape: ", value_t->shape().DebugString()));
      input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
          value_t->shaped<T, 2>({1, value_t->NumElements()})));
    }

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    if (std::is_same<Device, GPUDevice>::value) {
      ConcatGPU<T>(ctx, input_tensors_flat, output_tensor, &output_flat);
      return;
    }
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
    ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
  }

 private:
  DataType dtype_;
  PartialTensorShape element_shape_;
};

#define REGISTER_GATHER_AND_PACK(type)                                      \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayPack")                                               \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, true /* LEGACY_PACK */>);  \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGather")                                             \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV2")                                           \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV3")                                           \
          .Device(DEVICE_CPU)                                               \
          .TypeConstraint<type>("dtype"),                                   \
      TensorArrayPackOrGatherOp<CPUDevice, type, false /* LEGACY_PACK */>);

TF_CALL_POD_STRING_TYPES(REGISTER_GATHER_AND_PACK);
TF_CALL_variant(REGISTER_GATHER_AND_PACK);
REGISTER_GATHER_AND_PACK(quint8);
REGISTER_GATHER_AND_PACK(qint8);
REGISTER_GATHER_AND_PACK(qint32);

#undef REGISTER_GATHER_AND_PACK

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                                  \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayPack")                                               \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, true /* LEGACY_PACK */>);  \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGather")                                             \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("indices")                                            \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV2")                                           \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("indices")                                            \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>); \
  REGISTER_KERNEL_BUILDER(                                                  \
      Name("TensorArrayGatherV3")                                           \
          .Device(DEVICE_GPU)                                               \
          .TypeConstraint<type>("dtype")                                    \
          .HostMemory("indices")                                            \
          .HostMemory("handle"),                                            \
      TensorArrayPackOrGatherOp<GPUDevice, type, false /* LEGACY_PACK */>);

TF_CALL_bfloat16(REGISTER_GPU);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayGather")
        .Device(DEVICE_GPU)
        .TypeConstraint<int32>("dtype")
        .HostMemory("indices")
        .HostMemory("handle"),
    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayGatherV2")
        .Device(DEVICE_GPU)
        .TypeConstraint<int32>("dtype")
        .HostMemory("indices")
        .HostMemory("handle"),
    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayGatherV3")
        .Device(DEVICE_GPU)
        .TypeConstraint<int32>("dtype")
        .HostMemory("indices")
        .HostMemory("handle"),
    TensorArrayPackOrGatherOp<CPUDevice, int32, false /* LEGACY_PACK */>);

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// CONCAT *********************************************************************

// Concatenate the elements in a TensorArray. All elements must be
// defined and (excepting the first dimension) have the same shape.
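// For example, concatenating elements of shapes [2, 4] and [3, 4] produces a
// [5, 4] tensor; the second output, "lengths", records the first-dimension
// sizes of the inputs ([2, 3] here).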
template <typename Device, typename T>
class TensorArrayConcatOp : public OpKernel {
 public:
  typedef typename TTypes<T, 2>::ConstMatrix ConstMatrix;
  typedef std::vector<std::unique_ptr<ConstMatrix> > ConstMatrixVector;

  explicit TensorArrayConcatOp(OpKernelConstruction* context)
      : OpKernel(context) {
    OP_REQUIRES_OK(context, context->GetAttr("dtype", &dtype_));
    OP_REQUIRES_OK(context, context->GetAttr("element_shape_except0",
                                             &element_shape_except0_));
  }

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, false));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    OP_REQUIRES(
        ctx, dtype_ == tensor_array->ElemType(),
        errors::InvalidArgument(
            "TensorArray dtype is ", DataTypeString(tensor_array->ElemType()),
            " but Op requested dtype ", DataTypeString(dtype_), "."));

    int32_t array_size;
    OP_REQUIRES_OK(ctx, tensor_array->PackOrConcatSize(&array_size));

    // If there are no elements, return a zero-element Tensor with
    // shape [0] + element_shape_except0_
    if (array_size == 0) {
      OP_REQUIRES(
          ctx, element_shape_except0_.IsFullyDefined(),
          errors::Unimplemented(
              "TensorArray has size zero, but element_shape_except0 ",
              element_shape_except0_.DebugString(),
              " is not fully defined. "
              "Currently only static shapes are supported when concatenating "
              "zero-size TensorArrays."));
      TensorShape empty_shape;
      element_shape_except0_.AsTensorShape(&empty_shape);
      empty_shape.InsertDim(0, 0);
      Tensor* empty_unused;
      OP_REQUIRES_OK(ctx, ctx->allocate_output(0, empty_shape, &empty_unused));
      OP_REQUIRES_OK(ctx, ctx->allocate_output(1, {0}, &empty_unused));
      return;
    }

    // Read all the Tensors into a vector to keep track of their memory.
    std::vector<Tensor> values;
    std::vector<int32> indices(array_size);
    std::iota(indices.begin(), indices.end(), 0);
    Status s = tensor_array->ReadMany<Device, T>(ctx, indices, &values);
    OP_REQUIRES_OK(ctx, s);

    Tensor* lengths_tensor = nullptr;
    OP_REQUIRES_OK(ctx,
                   ctx->allocate_output(
                       1, TensorShape({static_cast<int64_t>(values.size())}),
                       &lengths_tensor));
    auto lengths_tensor_t = lengths_tensor->vec<int64_t>();

    TensorShape output_shape;
    TensorShape output_shape_except0;
    for (std::size_t i = 0; i < values.size(); ++i) {
      TensorShape value_shape_t = values[i].shape();

      OP_REQUIRES(
          ctx, TensorShapeUtils::IsVectorOrHigher(value_shape_t),
          errors::InvalidArgument(
              "Concat saw a scalar shape at index ", i,
              " but requires at least vectors. Did you mean to call pack?"));

      lengths_tensor_t(i) = value_shape_t.dim_size(0);

      TensorShape value_shape_t_except0 = value_shape_t;
      value_shape_t_except0.RemoveDim(0);
      if (i == 0) {
        output_shape = value_shape_t;
        output_shape_except0 = value_shape_t_except0;
        OP_REQUIRES(
            ctx, element_shape_except0_.IsCompatibleWith(output_shape_except0),
            errors::InvalidArgument(
                "TensorArray was passed element_shape_except0 ",
                element_shape_except0_.DebugString(),
                " but index 0 has (excepting dimension 0) shape: ",
                value_shape_t_except0.DebugString(), " which does not match."));
      } else {
        OP_REQUIRES(ctx, output_shape_except0 == value_shape_t_except0,
                    errors::InvalidArgument(
                        "TensorArray has inconsistent shapes. Index 0 has "
                        "(excepting dimension 0) shape: ",
                        output_shape_except0.DebugString(), " but index ", i,
                        " has (excepting dimension 0) shape: ",
                        value_shape_t_except0.DebugString()));
        // Accumulate the total size of the output's first dimension.
        output_shape.set_dim(
            0, output_shape.dim_size(0) + value_shape_t.dim_size(0));
      }
    }

    Tensor* output_tensor = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, output_shape, &output_tensor));
    ConstMatrixVector input_tensors_flat;
    input_tensors_flat.reserve(values.size());
    for (size_t i = 0; i < values.size(); ++i) {
      const Tensor* value_t = &values[i];
      if (value_t->NumElements() > 0) {
        input_tensors_flat.push_back(MakeUnique<ConstMatrix>(
            value_t->shaped<T, 2>({1, value_t->NumElements()})));
      }
    }

    if (output_shape.num_elements() > 0) {
      auto output_flat =
          output_tensor->shaped<T, 2>({1, output_shape.num_elements()});
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      if (std::is_same<Device, GPUDevice>::value) {
        ConcatGPU<T>(ctx, input_tensors_flat, output_tensor, &output_flat);
        return;
      }
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
      ConcatCPU<T>(ctx->device(), input_tensors_flat, &output_flat);
    }
  }

 private:
  DataType dtype_;
  PartialTensorShape element_shape_except0_;
};

#define REGISTER_CONCAT(type)                                    \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")              \
                              .Device(DEVICE_CPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<CPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")            \
                              .Device(DEVICE_CPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<CPUDevice, type>)  \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")            \
                              .Device(DEVICE_CPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<CPUDevice, type>)

TF_CALL_POD_STRING_TYPES(REGISTER_CONCAT);
REGISTER_CONCAT(quint8);
REGISTER_CONCAT(qint8);
REGISTER_CONCAT(qint32);

#undef REGISTER_CONCAT

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                       \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")              \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")            \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<GPUDevice, type>)  \
  REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")            \
                              .Device(DEVICE_GPU)                \
                              .TypeConstraint<type>("dtype")     \
                              .HostMemory("lengths")             \
                              .HostMemory("handle"),             \
                          TensorArrayConcatOp<GPUDevice, type>)

TF_CALL_bfloat16(REGISTER_GPU);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

// A special GPU kernel for int32.
// TODO(b/25387198): Also enable int32 in device memory. This kernel
// registration requires all int32 inputs and outputs to be in host memory.
REGISTER_KERNEL_BUILDER(Name("TensorArrayConcat")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("dtype")
                            .HostMemory("lengths")
                            .HostMemory("handle"),
                        TensorArrayConcatOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV2")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("dtype")
                            .HostMemory("lengths")
                            .HostMemory("handle"),
                        TensorArrayConcatOp<CPUDevice, int32>);
REGISTER_KERNEL_BUILDER(Name("TensorArrayConcatV3")
                            .Device(DEVICE_GPU)
                            .TypeConstraint<int32>("dtype")
                            .HostMemory("lengths")
                            .HostMemory("handle"),
                        TensorArrayConcatOp<CPUDevice, int32>);

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// UNPACK and SCATTER *********************************************************

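// Splits the "value" input along dimension 0 and writes each slice to the
// TensorArray: unpack writes slice i to index i, while scatter writes slice i
// to the index given by indices[i] (resizing the array first if it is
// dynamically sized).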
template <typename Device, typename T, bool LEGACY_UNPACK>
class TensorArrayUnpackOrScatterOp : public OpKernel {
 public:
  explicit TensorArrayUnpackOrScatterOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    const Tensor* tensor_value;
    OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
    TensorShape element_shape(tensor_value->shape());

    OP_REQUIRES(ctx,
                FastBoundsCheck(element_shape.dim_size(0),
                                std::numeric_limits<int32>::max()),
                errors::InvalidArgument("tensor dim0 too large to unpack"));

    OP_REQUIRES(
        ctx, tensor_value->dtype() == tensor_array->ElemType(),
        errors::InvalidArgument("TensorArray dtype is ",
                                DataTypeString(tensor_array->ElemType()),
                                " but Op is trying to write dtype ",
                                DataTypeString(tensor_value->dtype()), "."));
    OP_REQUIRES(ctx, element_shape.dims() > 0,
                errors::InvalidArgument("Input value for unpack must be at "
                                        "least a vector but received shape: ",
                                        element_shape.DebugString()));
    int32_t array_size;
    OP_REQUIRES_OK(ctx, tensor_array->Size(&array_size));

    int32_t max_index;
    int32_t num_values;
    std::vector<int32> write_indices;
    if (LEGACY_UNPACK) {
      num_values = element_shape.dim_size(0);
      max_index = num_values - 1;
      write_indices.resize(num_values);
      std::iota(write_indices.begin(), write_indices.end(), 0);
    } else {
      const Tensor* tensor_indices;
      OP_REQUIRES_OK(ctx, ctx->input("indices", &tensor_indices));
      OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_indices->shape()),
                  errors::InvalidArgument(
                      "Expected indices to be a vector, but received shape: ",
                      tensor_indices->shape().DebugString()));
      OP_REQUIRES(ctx,
                  tensor_indices->NumElements() == element_shape.dim_size(0),
                  errors::InvalidArgument(
                      "Expected len(indices) == values.shape[0], but saw: ",
                      tensor_indices->NumElements(), " vs. ",
                      element_shape.dim_size(0)));
      const auto indices_t = tensor_indices->vec<int32>();
      num_values = tensor_indices->NumElements();
      max_index = (num_values == 0)
                      ? -1
                      : *std::max_element(indices_t.data(),
                                          indices_t.data() + num_values);
      write_indices.resize(num_values);
      // Copy into write_indices.
      std::copy(indices_t.data(), indices_t.data() + num_values,
                write_indices.begin());
    }

    bool dynamic_size = tensor_array->HasDynamicSize();

    // If dynamic size, we may have to resize the TensorArray to fit.
    if (dynamic_size && array_size < max_index + 1) {
      array_size = static_cast<int32>(max_index + 1);
    }

    if (LEGACY_UNPACK) {
      OP_REQUIRES(
          ctx, element_shape.dim_size(0) == array_size,
          errors::InvalidArgument(
              "Input value must have first dimension equal to the array size (",
              element_shape.dim_size(0), " vs. ", array_size, ")"));
    } else {
      OP_REQUIRES(
          ctx, max_index < array_size,
          errors::InvalidArgument("Max scatter index must be < array size (",
                                  max_index, " vs. ", array_size, ")"));
    }
    element_shape.RemoveDim(0);

    auto tensor_value_t = tensor_value->shaped<T, 3>(
        {1, num_values, element_shape.num_elements()});

    Eigen::DSizes<Eigen::DenseIndex, 3> indices{0, 0, 0};
    Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
        1, 1, static_cast<Eigen::DenseIndex>(element_shape.num_elements())};

    std::vector<Tensor> write_values;
    write_values.reserve(num_values);

    for (int i = 0; i < num_values; ++i) {
      Tensor tensor_value_i;
      OP_REQUIRES_OK(ctx, ctx->allocate_temp(tensor_array->ElemType(),
                                             element_shape, &tensor_value_i));
      auto tensor_value_i_t =
          tensor_value_i.shaped<T, 3>({1, 1, element_shape.num_elements()});
      indices[1] = i;

      if (element_shape.num_elements() > 0) {
        functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
                                       tensor_value_i_t, tensor_value_t,
                                       indices, sizes);
      }

      write_values.push_back(tensor_value_i);
    }

    // Record the pack size of the TensorArray.
    if (LEGACY_UNPACK) {
      OP_REQUIRES_OK(ctx, tensor_array->SetMarkedSize(array_size));
    }

    Status s = tensor_array->WriteOrAggregateMany<Device, T>(ctx, write_indices,
                                                             &write_values);
    OP_REQUIRES_OK(ctx, s);
  }
};

#define REGISTER_SCATTER_AND_UNPACK(type)                                      \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayUnpack").Device(DEVICE_CPU).TypeConstraint<type>("T"),  \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   true /* LEGACY_UNPACK */>);                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayScatter").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   false /* LEGACY_UNPACK */>);                \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayScatterV2")                                             \
          .Device(DEVICE_CPU)                                                  \
          .TypeConstraint<type>("T"),                                          \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   false /* LEGACY_UNPACK */>);                \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArrayScatterV3")                                             \
          .Device(DEVICE_CPU)                                                  \
          .TypeConstraint<type>("T"),                                          \
      TensorArrayUnpackOrScatterOp<CPUDevice, type,                            \
                                   false /* LEGACY_UNPACK */>);

TF_CALL_ALL_TYPES(REGISTER_SCATTER_AND_UNPACK);
#undef REGISTER_SCATTER_AND_UNPACK

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                       \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("TensorArrayUnpack")                                  \
          .Device(DEVICE_GPU)                                    \
          .TypeConstraint<type>("T")                             \
          .HostMemory("handle"),                                 \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,              \
                                   true /* LEGACY_UNPACK */>);   \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("TensorArrayScatter")                                 \
          .Device(DEVICE_GPU)                                    \
          .TypeConstraint<type>("T")                             \
          .HostMemory("indices")                                 \
          .HostMemory("handle"),                                 \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,              \
                                   false /* LEGACY_UNPACK */>);  \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("TensorArrayScatterV2")                               \
          .Device(DEVICE_GPU)                                    \
          .TypeConstraint<type>("T")                             \
          .HostMemory("indices")                                 \
          .HostMemory("handle"),                                 \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,              \
                                   false /* LEGACY_UNPACK */>);  \
  REGISTER_KERNEL_BUILDER(                                       \
      Name("TensorArrayScatterV3")                               \
          .Device(DEVICE_GPU)                                    \
          .TypeConstraint<type>("T")                             \
          .HostMemory("indices")                                 \
          .HostMemory("handle"),                                 \
      TensorArrayUnpackOrScatterOp<GPUDevice, type,              \
                                   false /* LEGACY_UNPACK */>);

TF_CALL_int64(REGISTER_GPU);
TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// SPLIT *********************************************************************

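// Splits the "value" input along dimension 0 into len(lengths) pieces, where
// piece i has lengths[i] rows, and writes piece i to index i of the
// TensorArray. The sum of "lengths" must equal value.shape[0].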
template <typename Device, typename T>
class TensorArraySplitOp : public OpKernel {
 public:
  explicit TensorArraySplitOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    OP_REQUIRES_OK(ctx, SetupFlowControlInputs(ctx, true));

    TensorArray* tensor_array = nullptr;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    const Tensor* tensor_value;
    OP_REQUIRES_OK(ctx, ctx->input("value", &tensor_value));
    const Tensor* tensor_lengths;
    OP_REQUIRES_OK(ctx, ctx->input("lengths", &tensor_lengths));

    OP_REQUIRES(ctx, TensorShapeUtils::IsVector(tensor_lengths->shape()),
                errors::InvalidArgument(
                    "Expected lengths to be a vector, received shape: ",
                    tensor_lengths->shape().DebugString()));
    OP_REQUIRES(ctx,
                FastBoundsCheck(tensor_lengths->NumElements(),
                                std::numeric_limits<int32>::max()),
                errors::InvalidArgument(
                    "Expected lengths to have < max int32 entries"));

    int32_t num_tensors = static_cast<int32>(tensor_lengths->NumElements());
    auto tensor_lengths_t = tensor_lengths->vec<int64_t>();
    std::vector<int64_t> cumulative_lengths;
    cumulative_lengths.reserve(num_tensors);
    int64_t total_length = 0;
    for (int i = 0; i < num_tensors; ++i) {
      total_length += tensor_lengths_t(i);
      cumulative_lengths.push_back(total_length);
    }

    OP_REQUIRES(
        ctx, TensorShapeUtils::IsVectorOrHigher(tensor_value->shape()),
        errors::InvalidArgument(
            "Expected value to be at least a vector, but received shape: ",
            tensor_value->shape().DebugString()));

    OP_REQUIRES(
        ctx, total_length == tensor_value->shape().dim_size(0),
        errors::InvalidArgument("Expected sum of lengths to be equal to "
                                "values.shape[0], but sum of lengths is ",
                                total_length, " and value's shape is: ",
                                tensor_value->shape().DebugString()));
    int64_t elements_per_row =
        (total_length == 0) ? 0 : (tensor_value->NumElements() / total_length);

    int32_t array_size;
    OP_REQUIRES_OK(ctx, tensor_array->Size(&array_size));
    bool dynamic_size = tensor_array->HasDynamicSize();

    std::vector<TensorShape> element_shapes(num_tensors, tensor_value->shape());
    for (int32_t i = 0; i < num_tensors; ++i) {
      element_shapes[i].set_dim(0, tensor_lengths_t(i));
    }

    // If dynamic size, we may have to resize the TensorArray to fit.
    if (dynamic_size && array_size < num_tensors) {
      array_size = num_tensors;
    }

    OP_REQUIRES(
        ctx, array_size == num_tensors,
        errors::InvalidArgument(
            "TensorArray's size is not equal to the size of lengths (",
            array_size, " vs. ", num_tensors, "), and the TensorArray is not ",
            "marked as dynamically resizeable"));

    OP_REQUIRES(
        ctx, tensor_value->dtype() == tensor_array->ElemType(),
        errors::InvalidArgument("TensorArray dtype is ",
                                DataTypeString(tensor_array->ElemType()),
                                " but Op is trying to write dtype ",
                                DataTypeString(tensor_value->dtype()), "."));

    auto tensor_value_t =
        tensor_value->shaped<T, 3>({1, total_length, elements_per_row});

    std::vector<Tensor> write_values;
    write_values.reserve(array_size);

    for (int i = 0; i < array_size; ++i) {
      Tensor tensor_value_i;

      int64_t previous_length = (i == 0) ? 0 : cumulative_lengths[i - 1];
      Eigen::DSizes<Eigen::DenseIndex, 3> indices{
          0, static_cast<Eigen::DenseIndex>(previous_length), 0};
      Eigen::DSizes<Eigen::DenseIndex, 3> sizes{
          1, static_cast<Eigen::DenseIndex>(tensor_lengths_t(i)),
          static_cast<Eigen::DenseIndex>(elements_per_row)};

      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(tensor_array->ElemType(), element_shapes[i],
                                  &tensor_value_i));

      if (tensor_lengths_t(i) > 0) {
        auto tensor_value_i_t = tensor_value_i.shaped<T, 3>(
            {1, tensor_lengths_t(i), elements_per_row});

        functor::Split<Device, T, 3>()(ctx->eigen_device<Device>(),
                                       tensor_value_i_t, tensor_value_t,
                                       indices, sizes);
      }

      write_values.push_back(tensor_value_i);
    }

    // Record the concat size of the TensorArray.
    OP_REQUIRES_OK(ctx, tensor_array->SetMarkedSize(array_size));

    std::vector<int32> indices(array_size);
    std::iota(indices.begin(), indices.end(), 0);

    Status s = tensor_array->WriteOrAggregateMany<Device, T>(ctx, indices,
                                                             &write_values);
    OP_REQUIRES_OK(ctx, s);
  }
};

#define REGISTER_SPLIT(type)                                                   \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArraySplit").Device(DEVICE_CPU).TypeConstraint<type>("T"),   \
      TensorArraySplitOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArraySplitV2").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArraySplitOp<CPUDevice, type>);                                    \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("TensorArraySplitV3").Device(DEVICE_CPU).TypeConstraint<type>("T"), \
      TensorArraySplitOp<CPUDevice, type>);

TF_CALL_ALL_TYPES(REGISTER_SPLIT);
#undef REGISTER_SPLIT

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU(type)                                      \
  REGISTER_KERNEL_BUILDER(Name("TensorArraySplit")              \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("lengths")            \
                              .HostMemory("handle"),            \
                          TensorArraySplitOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV2")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("lengths")            \
                              .HostMemory("handle"),            \
                          TensorArraySplitOp<GPUDevice, type>); \
  REGISTER_KERNEL_BUILDER(Name("TensorArraySplitV3")            \
                              .Device(DEVICE_GPU)               \
                              .TypeConstraint<type>("T")        \
                              .HostMemory("lengths")            \
                              .HostMemory("handle"),            \
                          TensorArraySplitOp<GPUDevice, type>);

TF_CALL_GPU_NUMBER_TYPES(REGISTER_GPU);
TF_CALL_COMPLEX_TYPES(REGISTER_GPU);
#undef REGISTER_GPU

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// SIZE ***********************************************************************

// Get the size of the TensorArray
class TensorArraySizeOp : public OpKernel {
 public:
  explicit TensorArraySizeOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    TensorArray* tensor_array;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    Tensor* output = nullptr;
    OP_REQUIRES_OK(ctx, ctx->allocate_output(0, TensorShape({}), &output));
    OP_REQUIRES_OK(ctx, tensor_array->Size(&(output->scalar<int32>()())));
  }
};

REGISTER_KERNEL_BUILDER(Name("TensorArraySize").Device(DEVICE_CPU),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV2").Device(DEVICE_CPU),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV3").Device(DEVICE_CPU),
                        TensorArraySizeOp);

REGISTER_KERNEL_BUILDER(Name("TensorArraySize")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("size"),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV2")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("size"),
                        TensorArraySizeOp);
REGISTER_KERNEL_BUILDER(Name("TensorArraySizeV3")
                            .Device(DEVICE_GPU)
                            .HostMemory("handle")
                            .HostMemory("size"),
                        TensorArraySizeOp);

// CLOSE **********************************************************************

// Delete the TensorArray from its resource container. This enables
// the user to close and release the resource in the middle of a step/run.
// TODO(ebrevdo): decide whether closing the grad op should happen
// here or on the python side.
class TensorArrayCloseOp : public OpKernel {
 public:
  explicit TensorArrayCloseOp(OpKernelConstruction* context)
      : OpKernel(context) {}

  void Compute(OpKernelContext* ctx) override {
    TensorArray* tensor_array;
    OP_REQUIRES_OK(ctx, GetTensorArray(ctx, &tensor_array));
    core::ScopedUnref unref(tensor_array);
    // Instead of deleting this TA from the ResourceManager, we just
    // clear it away and mark it as closed. The only remaining memory it
    // consumes stores its mutex and handle Tensor. This will be cleared
    // out at the end of the step anyway, so it's fine to keep it around
    // until then. Further calls to the TensorArray will fail because the
    // TensorArray checks internally whether it has been closed.
    tensor_array->ClearAndMarkClosed();
  }
};

REGISTER_KERNEL_BUILDER(Name("TensorArrayClose").Device(DEVICE_CPU),
                        TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayCloseV2").Device(DEVICE_CPU),
                        TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(Name("TensorArrayCloseV3").Device(DEVICE_CPU),
                        TensorArrayCloseOp);

REGISTER_KERNEL_BUILDER(
    Name("TensorArrayClose").Device(DEVICE_GPU).HostMemory("handle"),
    TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayCloseV2").Device(DEVICE_GPU).HostMemory("handle"),
    TensorArrayCloseOp);
REGISTER_KERNEL_BUILDER(
    Name("TensorArrayCloseV3").Device(DEVICE_GPU).HostMemory("handle"),
    TensorArrayCloseOp);

}  // namespace tensorflow