/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

// See docs in ../ops/array_ops.cc.
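//
// Where returns the coordinates of the true (nonzero) elements of its input.
// For example, for the 2x2 bool input [[true, false], [false, true]], the op
// produces the int64 matrix [[0, 0], [1, 1]]: one row of coordinates per true
// element, in row-major order.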

#define EIGEN_USE_THREADS

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#define EIGEN_USE_GPU
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#include "tensorflow/core/kernels/where_op.h"

#include <memory>
#include <numeric>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/macros.h"
#include "tensorflow/core/platform/types.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
#include "tensorflow/core/common_runtime/gpu/gpu_event_mgr.h"
#include "tensorflow/core/util/gpu_solvers.h"
#if GOOGLE_CUDA
#include "tensorflow/compiler/xla/stream_executor/cuda/cuda_activation.h"
using stream_executor::cuda::ScopedActivateExecutorContext;
#elif TENSORFLOW_USE_ROCM
#include "tensorflow/core/platform/rocm.h"
using stream_executor::rocm::ScopedActivateExecutorContext;
#endif  // TENSORFLOW_USE_ROCM
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

namespace functor {

namespace {
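// Counts the "true" elements in [begin, end): nonzero values for numeric
// types, and true values for bool via the specialization below (summing a
// range of bools counts the true entries).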
template <typename T>
int64_t CountAccumulator(const T* begin, const T* end) {
  return std::accumulate(begin, end, 0LL, [](int64_t accum, const T& val) {
    return accum + (val != T(0));
  });
}

template <>
int64_t CountAccumulator<bool>(const bool* begin, const bool* end) {
  return std::accumulate(begin, end, 0LL);
}

}  // namespace

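// CPU specialization of NumTrue: a sequential count over the flattened input,
// written into the scalar `num_true`.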
template <typename T>
struct NumTrue<CPUDevice, T, int64_t> {
  static Status Compute(OpKernelContext* ctx, const CPUDevice& d,
                        typename TTypes<T>::ConstFlat input,
                        TTypes<int64_t>::UnalignedScalar num_true) {
    num_true() = CountAccumulator<T>(input.data(), input.data() + input.size());
    return OkStatus();
  }
};

template <int DIMS, typename T, typename TIndex>
struct Where<CPUDevice, DIMS, T, TIndex> {
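  // Decomposes the flat row-major offset `index` into per-dimension
  // coordinates and writes them as row `true_n` of `output`. `strides[i]` is
  // the number of elements spanned by a single step along dimension i.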
  EIGEN_ALWAYS_INLINE static void WriteIndexRowMajor(
      typename TTypes<int64_t>::Matrix output,
      const typename Eigen::DSizes<TIndex, DIMS>& strides, TIndex true_n,
      TIndex index) {
    for (int i = 0; i < DIMS; ++i) {
      output(true_n, i) = index / strides[i];
      index -= output(true_n, i) * strides[i];
    }
  }

  EIGEN_ALWAYS_INLINE static Status Compute(
      OpKernelContext* ctx, const CPUDevice& d,
      typename TTypes<T, DIMS>::ConstTensor input,
      typename TTypes<int64_t>::Matrix output, TIndex* found_true) {
    Eigen::DSizes<Eigen::DenseIndex, DIMS> dims = input.dimensions();
    Eigen::DSizes<TIndex, DIMS> strides;

    EIGEN_STATIC_ASSERT((static_cast<int>(decltype(input)::Layout) ==
                         static_cast<int>(Eigen::RowMajor)),
                        INTERNAL_ERROR_INPUT_SHOULD_BE_ROWMAJOR);

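    // Build row-major strides: strides[DIMS - 1] == 1, and
    // strides[i] == dims[i + 1] * ... * dims[DIMS - 1].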
    strides[DIMS - 1] = 1;
    for (int i = DIMS - 2; i >= 0; --i) {
      strides[i] = strides[i + 1] * dims[i + 1];
    }

    Eigen::DenseIndex output_size = output.dimension(0);
    for (Eigen::DenseIndex n = 0; n < input.size(); ++n) {
      if (input.data()[n] != T(0)) {
        if (FastBoundsCheck(*found_true, output_size)) {
          WriteIndexRowMajor(output, strides, *found_true, n);
        }
        ++*found_true;
      }
    }
    return OkStatus();
  }
};

}  // namespace functor

template <typename T>
class WhereCPUOp : public OpKernel {
 public:
  explicit WhereCPUOp(OpKernelConstruction* context) : OpKernel(context) {}

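  // Two passes over the input: NumTrue counts the nonzero elements so the
  // output can be allocated with exactly [num_true, input_dims] entries, then
  // the rank-specialized Where functor writes the coordinates of each one.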
  void Compute(OpKernelContext* context) override {
    const Tensor& input = context->input(0);

    OP_REQUIRES(
        context, input.dtype() != DT_HALF,
        errors::Unimplemented("No WhereOp available for float16/half type on "
                              "CPU; dying in CPU WhereOp to avoid silently "
                              "creating costly copies from device."));

    const int input_dims = input.dims();

    int64_t num_true;
    TTypes<int64_t>::UnalignedScalar num_true_t(&num_true);

    Status s = functor::NumTrue<CPUDevice, T, int64_t>::Compute(
        context, context->eigen_device<CPUDevice>(), input.flat<T>(),
        num_true_t);
    OP_REQUIRES_OK(context, s);
    TensorShape output_shape({num_true, input_dims});
    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, output_shape, &output));

    // TODO(ebrevdo): Replace single-threaded copy with a multithreaded block
    // copy by getting block counts above instead of a global NumTrue, then
    // having each block filled in separate threads below.
    int64_t found_true = 0;

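// Dispatches to the rank-specialized Where functor; the switch below handles
// ranks 1 through 8.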
#define HANDLE_DIM(NDIM)                                                      \
  case NDIM: {                                                                \
    Status s = functor::Where<CPUDevice, NDIM, T, int64_t>::Compute(          \
        context, context->eigen_device<CPUDevice>(), input.tensor<T, NDIM>(), \
        output->matrix<int64_t>(), &found_true);                              \
    OP_REQUIRES_OK(context, s);                                               \
  } break;

    switch (input_dims) {
      HANDLE_DIM(1);
      HANDLE_DIM(2);
      HANDLE_DIM(3);
      HANDLE_DIM(4);
      HANDLE_DIM(5);
      HANDLE_DIM(6);
      HANDLE_DIM(7);
      HANDLE_DIM(8);

      default:
        OP_REQUIRES(context, false,
                    errors::InvalidArgument(
                        "WhereOp: Unhandled input dimensions: ", input_dims));
    }
#undef HANDLE_DIM

    OP_REQUIRES(
        context, found_true == num_true_t(),
        errors::InvalidArgument(
            "WhereOp: Race condition between counting the number of true "
            "elements and writing them. When counting, saw ",
            num_true_t(), " elements; but when writing their indices, saw ",
            found_true, " elements."));
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(WhereCPUOp);
};

#define REGISTER_WHERE_OP(T) \
  REGISTER_KERNEL_BUILDER(   \
      Name("Where").Device(DEVICE_CPU).TypeConstraint<T>("T"), WhereCPUOp<T>);

TF_CALL_NUMBER_TYPES(REGISTER_WHERE_OP);
TF_CALL_bool(REGISTER_WHERE_OP);

#undef REGISTER_WHERE_OP

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace functor {

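// Forward declarations of the GPU specializations of NumTrue and Where; their
// definitions are compiled in this op's separate GPU translation units.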
#define DECLARE_GPU_NUMTRUE(T, Tindex)                                      \
  template <>                                                               \
  Status NumTrue<GPUDevice, T, Tindex>::Compute(                            \
      OpKernelContext* ctx, const GPUDevice& d, TTypes<T>::ConstFlat input, \
      TTypes<Tindex>::UnalignedScalar num_true);                            \
  extern template struct NumTrue<GPUDevice, T, Tindex>

#define DECLARE_GPU_NUMTRUE_TYPE(T) \
  DECLARE_GPU_NUMTRUE(T, int32);    \
  DECLARE_GPU_NUMTRUE(T, int64_t);

TF_CALL_NUMBER_TYPES(DECLARE_GPU_NUMTRUE_TYPE);
TF_CALL_bool(DECLARE_GPU_NUMTRUE_TYPE);

#undef DECLARE_GPU_NUMTRUE_TYPE
#undef DECLARE_GPU_NUMTRUE

#define DECLARE_GPU_WHERE_INDEX(Dims, T, Tindex)                    \
  template <>                                                       \
  Status Where<GPUDevice, Dims, T, Tindex>::Compute(                \
      OpKernelContext* ctx, const GPUDevice& d,                     \
      typename TTypes<T, Dims>::ConstTensor input,                  \
      typename TTypes<int64_t>::Matrix output, Tindex* found_true); \
  extern template struct Where<GPUDevice, Dims, T, Tindex>;
#define DECLARE_GPU_WHERE(Dims, T)         \
  DECLARE_GPU_WHERE_INDEX(Dims, T, int32); \
  DECLARE_GPU_WHERE_INDEX(Dims, T, int64_t);

#define DECLARE_GPU_WHERE_TYPES(T) \
  DECLARE_GPU_WHERE(1, T);         \
  DECLARE_GPU_WHERE(2, T);         \
  DECLARE_GPU_WHERE(3, T);         \
  DECLARE_GPU_WHERE(4, T);         \
  DECLARE_GPU_WHERE(5, T);         \
  DECLARE_GPU_WHERE(6, T);         \
  DECLARE_GPU_WHERE(7, T);         \
  DECLARE_GPU_WHERE(8, T);

TF_CALL_WHERE_GPU_TYPES(DECLARE_GPU_WHERE_TYPES);

#undef DECLARE_GPU_WHERE_TYPES
#undef DECLARE_GPU_WHERE
#undef DECLARE_GPU_WHERE_INDEX

}  // namespace functor

template <typename T>
class WhereGPUOp : public AsyncOpKernel {
 public:
  explicit WhereGPUOp(OpKernelConstruction* context) : AsyncOpKernel(context) {}

  void ComputeAsync(OpKernelContext* context, DoneCallback done) override {
    const Tensor& input = context->input(0);
    const int input_dims = input.dims();

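    // Use 32-bit internal indexing when the whole input fits in int32;
    // narrower index arithmetic is typically cheaper on GPUs, and the output
    // coordinates are written as int64 either way.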
    if (input.NumElements() < std::numeric_limits<int32>::max()) {
      ComputeAsyncType<int32>(input, input_dims, context, done);
    } else {
      ComputeAsyncType<int64_t>(input, input_dims, context, done);
    }
  }

  template <typename Tindex>
  void ComputeAsyncType(const Tensor& input, const int input_dims,
                        OpKernelContext* context, DoneCallback done) {
    // Step 0: allocate nnz scratch space.
    // Step 1: launch the nnz (NumTrue) kernel.
    // Step 2: once it completes, allocate the output.
    // Step 3: launch the where kernel.

    // Allocate pinned memory for `num_true`. This memory is accessible on
    // host and device.
    ScratchSpace<Tindex> num_true(context, 1, /*on_host=*/true);
    typename TTypes<Tindex>::UnalignedScalar num_true_t(
        num_true.mutable_data());

    // Push a kernel onto the stream that counts the true elements.
    const GPUDevice& d = context->eigen_device<GPUDevice>();
    Status s = functor::NumTrue<GPUDevice, T, Tindex>::Compute(
        context, d, input.flat<T>(), num_true_t);
    OP_REQUIRES_OK_ASYNC(context, s, done);

    auto create_and_check_output = [context, &d, &input, input_dims,
                                    num_true = std::move(num_true), done]() {
      // Ensure that within the callback, the proper GPU settings are
      // configured.
      auto stream = context->op_device_context()->stream();
      ScopedActivateExecutorContext scoped_activation{stream->parent()};

      // TODO(ebrevdo): Properly copy back found_true value to CPU for
      // validation checking. Currently Where<GPUDevice>::Compute()
      // does not perform this copy back to CPU.
      Tindex found_true = -1;

      // Steps 2 and 3: Allocate the output, then perform the selection/copy.
      Tensor* output = nullptr;
      OP_REQUIRES_OK_ASYNC(
          context,
          context->allocate_output(
              0, TensorShape({*num_true.data(), input_dims}), &output),
          done);

#define HANDLE_DIM(NDIM)                                                \
  case NDIM: {                                                          \
    Status s = functor::Where<GPUDevice, NDIM, T, Tindex>::Compute(     \
        context, d, input.tensor<T, NDIM>(), output->matrix<int64_t>(), \
        &found_true);                                                   \
    OP_REQUIRES_OK_ASYNC(context, s, done);                             \
  } break;

      switch (input_dims) {
        HANDLE_DIM(1);
        HANDLE_DIM(2);
        HANDLE_DIM(3);
        HANDLE_DIM(4);
        HANDLE_DIM(5);
        HANDLE_DIM(6);
        HANDLE_DIM(7);
        HANDLE_DIM(8);

        default:
          OP_REQUIRES_ASYNC(
              context, false,
              errors::InvalidArgument("WhereOp: Unhandled input dimensions: ",
                                      input_dims),
              done);
      }
#undef HANDLE_DIM

      // TODO(ebrevdo): Fix the copy back to host.

      // OP_REQUIRES_ASYNC(
      //     context, found_true == num_true,
      //     errors::InvalidArgument(
      //         "WhereOp: Race condition between counting the number of true "
      //         "elements and writing them. When counting, saw ",
      //         num_true, " elements; but when writing their indices, saw ",
      //         found_true, " elements."),
      //     done);

      done();
    };

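    // ThenExecute() runs the callback only after all work already enqueued on
    // the stream (including the NumTrue kernel above) has completed, so the
    // pinned `num_true` value is valid on the host inside the callback.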
    auto stream = context->op_device_context()->stream();
    context->device()
        ->tensorflow_accelerator_device_info()
        ->event_mgr->ThenExecute(stream, create_and_check_output);
  }

 private:
  TF_DISALLOW_COPY_AND_ASSIGN(WhereGPUOp);
};

#define REGISTER_GPU_WHERE_OP(T) \
  REGISTER_KERNEL_BUILDER(       \
      Name("Where").Device(DEVICE_GPU).TypeConstraint<T>("T"), WhereGPUOp<T>);

TF_CALL_WHERE_GPU_TYPES(REGISTER_GPU_WHERE_OP);
#undef REGISTER_GPU_WHERE_OP

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

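// For int32 inputs on non-CPU devices, fall back to the CPU kernel, with both
// the input and the "index" output pinned to host memory.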
REGISTER_KERNEL_BUILDER(Name("Where")
                            .Device(DEVICE_DEFAULT)
                            .TypeConstraint<int32>("T")
                            .HostMemory("input")
                            .HostMemory("index"),
                        WhereCPUOp<int32>);

}  // namespace tensorflow