/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#define EIGEN_USE_THREADS

#include "tensorflow/core/kernels/depthwise_conv_op.h"

#include <algorithm>
#include <cmath>
#include <type_traits>
#include <vector>

#include "tensorflow/core/framework/bounds_check.h"
#include "tensorflow/core/framework/kernel_shape_util.h"
#include "tensorflow/core/framework/numeric_op.h"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/register_types.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_shape.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/framework/types.h"
#include "tensorflow/core/kernels/conv_ops.h"
#include "tensorflow/core/lib/core/status.h"
#include "tensorflow/core/platform/errors.h"
#include "tensorflow/core/platform/logging.h"
#include "tensorflow/core/platform/types.h"
#include "tensorflow/core/util/padding.h"
#include "tensorflow/core/util/tensor_format.h"
#include "tensorflow/core/util/use_cudnn.h"
#include "tensorflow/core/util/work_sharder.h"

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#if GOOGLE_CUDA
#include "third_party/gpus/cudnn/cudnn.h"
#endif

#include "tensorflow/core/platform/stream_executor.h"
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

namespace tensorflow {
// In depthwise convolution, each input channel is convolved into
// depth_multiplier output channels, and the outputs are not summed over
// channels as they would be in regular convolution.
// However, filters are applied to the input exactly as in regular
// convolution. Please refer to the regular convolution kernels for more
// details.
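//
// For example, an NHWC input of shape [batch, rows, cols, 8] convolved with
// a depthwise filter of shape [3, 3, 8, 2] produces an output of depth
// 8 * 2 = 16.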

typedef Eigen::ThreadPoolDevice CPUDevice;
typedef Eigen::GpuDevice GPUDevice;

// Computes the vectorized product of 'input_buffer' and 'filter' and stores
// the result in 'output' at the location specified by 'out_r' and 'out_c'.
//
// EX:
//   in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
//   Both 'input_buffer' and 'filter' are padded to register-width boundaries.
//
//   input_buffer [rows, cols, in_depth, depth_multiplier]
//     [a0, a0, a1, a1] [a2, a2, 0, 0] [b0, b0, b1, b1] [b2, b2, 0, 0]
//     [e0, e0, e1, e1] [e2, e2, 0, 0] [f0, f0, f1, f1] [f2, f2, 0, 0]
//
//   filter [rows, cols, in_depth, depth_multiplier]
//     [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
//     [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
//
//   First output register [in_depth, depth_multiplier]
//     [q0, q1, q2, q3] = ([a0, a0, a1, a1] x [u0, v0, w0, x0]) +
//                        ([b0, b0, b1, b1] x [u1, v1, w1, x1]) +
//                        ([e0, e0, e1, e1] x [u2, v2, w2, x2]) +
//                        ([f0, f0, f1, f1] x [u3, v3, w3, x3])
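//
// With these shapes, out_depth = in_depth * depth_multiplier = 6, so each
// output position is computed as one vectorized packet of 4 values plus a
// scalar remainder of out_depth % 4 = 2 values (see 'output_vectorized_size'
// and 'output_scalar_size' below).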
//
// TODO(andydavis) Experiment with processing multiple inputs per input buffer.
template <typename T>
struct DepthwiseConv2DKernel {
  static void Run(const DepthwiseArgs& args,
                  const int64_t padded_filter_inner_dim_size,
                  const int64_t out_r, const int64_t out_c, const T* filter,
                  const T* input_buffer, T* output, TensorFormat data_format) {
    typedef typename Eigen::internal::packet_traits<T>::type Packet;
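    // kPacketSize is the number of elements of type T that fit in one SIMD
    // register (e.g. 4 floats for a 128-bit register).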
    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    const int64_t out_depth = args.out_depth;
    const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
    const int64_t output_scalar_size = out_depth % kPacketSize;
    const int64_t output_vectorized_size =
        (out_depth / kPacketSize) * kPacketSize;
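    // Flat offset of output position (out_r, out_c) within the current image;
    // the out_depth values for this position are stored contiguously (NHWC).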
    const int64_t base_output_index =
        (out_r * args.out_cols + out_c) * out_depth;

    for (int i = 0; i < output_vectorized_size; i += kPacketSize) {
      // Reset accumulator.
      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
      for (int j = 0; j < filter_spatial_size; ++j) {
        // Calculate index.
        const int64_t index = i + j * padded_filter_inner_dim_size;
        // Load filter.
        // TODO(andydavis) Unroll 'out_c' loop in caller so we can load
        // multiple inputs here to amortize the cost of each filter block load.
        const auto filter_block =
            Eigen::internal::ploadu<Packet>(filter + index);
        // Load input.
        const auto data_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        // Vector multiply-add.
        vaccum =
            Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
      }
      // Store vector accumulator to output.
      Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum);
    }

    if (output_scalar_size > 0) {
      auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0));
      for (int j = 0; j < filter_spatial_size; ++j) {
        const int64_t index =
            output_vectorized_size + j * padded_filter_inner_dim_size;
        const auto filter_block =
            Eigen::internal::ploadu<Packet>(filter + index);
        const auto data_block =
            Eigen::internal::ploadu<Packet>(input_buffer + index);
        vaccum =
            Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum);
      }
      // Load accumulator into an array and loop through output.
      T out_buf[kPacketSize];
      Eigen::internal::pstoreu<T>(out_buf, vaccum);
      const int64_t last_output_index =
          base_output_index + output_vectorized_size;
      for (int j = 0; j < output_scalar_size; ++j) {
        output[last_output_index + j] = out_buf[j];
      }
    }
  }
};

// Computes the depthwise conv2d of 'input' by 'depthwise_filter' and stores
// the result in 'output'. This implementation trades the cost of copying
// small patches of the input for better data alignment, which enables
// vectorized load/store and multiply-add operations (see comments at
// DepthwiseInputCopyOp and DepthwiseConv2DKernel for details).
//
// TODO(andydavis) Evaluate the performance of processing multiple input
// patches in the inner loop.
// TODO(andydavis) Consider a zero-copy implementation for the case when
// 'in_depth' is a multiple of register width, and 'depth_multiplier' is one.
// TODO(andydavis) Evaluate the performance of alternative implementations.
template <typename T>
struct LaunchDepthwiseConvOp<CPUDevice, T> {
  typedef typename Eigen::internal::packet_traits<T>::type Packet;

  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
                  const T* input, const T* depthwise_filter, T* output,
                  TensorFormat data_format) {
    OP_REQUIRES(
        ctx, data_format == FORMAT_NHWC,
        errors::Unimplemented(
            "Depthwise convolution on CPU is only supported for NHWC format"));
    static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));

    // Pad 'depthwise_filter' to vector register width (if needed).
    const bool pad_filter = (args.out_depth % kPacketSize) != 0;
    Tensor padded_filter;
    if (pad_filter) {
      // Allocate space for padded filter.
      const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
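      // Round out_depth up to the next multiple of kPacketSize so vectorized
      // loads and stores never run past the end of a filter row.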
      const int64_t padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &padded_filter));
      // Write out padded filter.
      functor::DepthwiseFilterPadOp<T>()(
          args, depthwise_filter, padded_filter.template flat<T>().data());
    }
    const T* filter_data =
        pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter;

    // Computes one shard of depthwise conv2d output.
    auto shard = [&ctx, &args, &input, &filter_data, &output, data_format](
                     int64_t start, int64_t limit) {
      static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
      const int64_t input_image_size =
          args.in_rows * args.in_cols * args.in_depth;
      const int64_t output_image_size =
          args.out_rows * args.out_cols * args.out_depth;
      const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
      const int64_t padded_filter_inner_dim_size =
          ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize;

      // Allocate buffer for local input regions.
      Tensor input_buffer;
      OP_REQUIRES_OK(
          ctx, ctx->allocate_temp(DataTypeToEnum<T>::value,
                                  TensorShape({filter_spatial_size,
                                               padded_filter_inner_dim_size}),
                                  &input_buffer));
      T* input_buffer_data = input_buffer.template flat<T>().data();

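      // Each unit of work in [start, limit) is one output row of one image:
      // i = b * args.out_rows + out_r.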
      for (int64_t i = start; i < limit; ++i) {
        const int64_t b = i / args.out_rows;
        const int64_t in_base = b * input_image_size;
        const int64_t out_base = b * output_image_size;

        const int64_t out_r = i % args.out_rows;

        for (int64_t out_c = 0; out_c < args.out_cols; ++out_c) {
          // Populate 'input_buffer_data' with data from local input region.
          functor::DepthwiseInputCopyOp<T>()(args, padded_filter_inner_dim_size,
                                             out_r, out_c, input + in_base,
                                             input_buffer_data);

          // Process buffered input across all filters and store to output.
          DepthwiseConv2DKernel<T>::Run(
              args, padded_filter_inner_dim_size, out_r, out_c, filter_data,
              input_buffer_data, output + out_base, data_format);
        }
      }
    };

    const int64_t total_shards = args.batch * args.out_rows;

    // Empirically tested to give reasonable performance boosts at batch size 1
    // without reducing throughput at batch size 32.
    const float kCostMultiplier = 2.5f;

    // TODO(andydavis): Estimate shard cost (in cycles) based on the number of
    // flops/loads/stores required to compute one shard.
    const int64_t shard_cost = kCostMultiplier * args.out_cols * args.out_depth;
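    // For example, an output row with out_cols = 112 and out_depth = 64 has an
    // estimated cost of 2.5 * 112 * 64 = 17920 cycle units. Shard() below
    // splits [0, total_shards) across the intra-op thread pool, using this
    // per-unit cost to decide how finely to partition the work.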

    auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads());
    Shard(worker_threads.num_threads, worker_threads.workers, total_shards,
          shard_cost, shard);
  }
};

// Extern template instantiated in conv_ops.cc.
extern template struct LaunchConv2DOp<CPUDevice, bfloat16>;
extern template struct LaunchConv2DOp<CPUDevice, Eigen::half>;
extern template struct LaunchConv2DOp<CPUDevice, float>;
extern template struct LaunchConv2DOp<CPUDevice, double>;

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

// Extern template instantiated in conv_ops.cc.
extern template struct LaunchConv2DOp<GPUDevice, Eigen::half>;
extern template struct LaunchConv2DOp<GPUDevice, float>;
extern template struct LaunchConv2DOp<GPUDevice, double>;

// Extern template instantiated in depthwise_conv_op_gpu.cc.
extern template struct LaunchDepthwiseConvOp<GPUDevice, Eigen::half>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, float>;
extern template struct LaunchDepthwiseConvOp<GPUDevice, double>;

#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

template <typename Device, typename T>
class DepthwiseConv2dNativeOp : public BinaryOp<T> {
 public:
  explicit DepthwiseConv2dNativeOp(OpKernelConstruction* context)
      : BinaryOp<T>(context) {
    OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_));
    string data_format;
    OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format));
    OP_REQUIRES(context, FormatFromString(data_format, &data_format_),
                errors::InvalidArgument("Invalid data format"));

    OP_REQUIRES(context, strides_.size() == 4,
                errors::InvalidArgument("Sliding window strides field must "
                                        "specify 4 dimensions"));
    stride_ = GetTensorDim(strides_, data_format_, 'H');
    const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W');
    const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N');
    const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C');

    OP_REQUIRES(context, stride_ == stride_w,
                errors::InvalidArgument(
                    "Current implementation only supports equal length "
                    "strides in the row and column dimensions."));
    OP_REQUIRES(
        context, (stride_n == 1 && stride_c == 1),
        errors::InvalidArgument("Current implementation does not yet support "
                                "strides in the batch and depth dimensions."));
    OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_));
    OP_REQUIRES_OK(context,
                   context->GetAttr("explicit_paddings", &explicit_paddings_));
    OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_,
                                              /*num_dims=*/4, data_format_));

    // CPU/GPU kernel currently ignores dilations, so all must be 1.
    std::vector<int32_t> dilations;
    OP_REQUIRES_OK(context, context->GetAttr("dilations", &dilations));
    bool unit_dilations = true;
    for (int32_t dilation : dilations) {
      if (dilation != 1) {
        unit_dilations = false;
      }
    }
    OP_REQUIRES(context, unit_dilations,
                errors::Unimplemented(
                    "Current kernel implementation does not support "
                    "dilations, received [",
                    Eigen::Map<Eigen::Matrix<int32_t, 1, Eigen::Dynamic>>(
                        dilations.data(), dilations.size()),
                    "]"));

    cudnn_use_autotune_ = CudnnUseAutotune();
    dtype_ = DataTypeToEnum<T>::value;
#if CUDNN_VERSION >= 8000
    // From the cuDNN 8.0 release notes: We've extended the fprop and dgrad
    // NHWC depthwise kernels to support more combinations (filter
    // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition
    // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides
    // good performance.
    // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_8.html#rel_8
    use_cudnn_grouped_conv_ =
        dtype_ == DT_HALF &&
        (data_format_ == FORMAT_NCHW ||
         (data_format_ == FORMAT_NHWC && stride_ == stride_w &&
          (stride_ == 1 || stride_ == 2)));
#elif CUDNN_VERSION >= 7603
    // Use cuDNN grouped conv only when input/output is NCHW and float16
    // (half). See the cuDNN 7.6.3 release notes:
    // https://docs.nvidia.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763
    use_cudnn_grouped_conv_ = dtype_ == DT_HALF && data_format_ == FORMAT_NCHW;
#else
    use_cudnn_grouped_conv_ = false;
#endif
  }

  void Compute(OpKernelContext* context) override {
    // Input tensor is of the following dimensions:
    // [ batch, in_rows, in_cols, in_depth ]
    const Tensor& input = context->input(0);

    // Input filter is of the following dimensions:
    // [ filter_rows, filter_cols, in_depth, depth_multiplier]
    const Tensor& filter = context->input(1);

    // For 2D convolution, there should be 4 dimensions.
    OP_REQUIRES(context, input.dims() == 4,
                errors::InvalidArgument("input must be 4-dimensional: ",
                                        input.shape().DebugString()));
    OP_REQUIRES(context, filter.dims() == 4,
                errors::InvalidArgument("filter must be 4-dimensional: ",
                                        filter.shape().DebugString()));

    // in_depth for input and filter must match.
    const int64_t in_depth = GetTensorDim(input, data_format_, 'C');
    OP_REQUIRES(context, in_depth == filter.dim_size(2),
                errors::InvalidArgument(
                    "input and filter must have the same depth: ", in_depth,
                    " vs ", filter.dim_size(2)));

    // The last dimension for filter is depth multiplier.
    const int32_t depth_multiplier = filter.dim_size(3);

    // The output depth is input depth * depth multiplier.
    const int32_t out_depth = in_depth * depth_multiplier;

    const int64_t input_rows_raw = GetTensorDim(input, data_format_, 'H');
    OP_REQUIRES(
        context,
        FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Input rows too large"));
    const int32_t input_rows = static_cast<int32>(input_rows_raw);
    const int32_t filter_rows = filter.dim_size(0);

    const int64_t input_cols_raw = GetTensorDim(input, data_format_, 'W');
    OP_REQUIRES(
        context,
        FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()),
        errors::InvalidArgument("Input cols too large"));
    const int32_t input_cols = static_cast<int32>(input_cols_raw);
    const int32_t filter_cols = filter.dim_size(1);

    // The first dimension for input is batch.
    const int32_t batch = input.dim_size(0);

    int64_t out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0,
            pad_left = 0, pad_right = 0;
    if (padding_ == Padding::EXPLICIT) {
      GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top,
                               &pad_bottom);
      GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left,
                               &pad_right);
    }
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                input_rows, filter_rows, stride_, padding_,
                                &out_rows, &pad_top, &pad_bottom));
    OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose(
                                input_cols, filter_cols, stride_, padding_,
                                &out_cols, &pad_left, &pad_right));
    TensorShape out_shape =
        ShapeFromFormat(data_format_, batch, out_rows, out_cols, out_depth);
    OP_REQUIRES(
        context,
        (!std::is_same<Device, GPUDevice>::value ||
         FastBoundsCheck(out_shape.num_elements(),
                         std::numeric_limits<int32>::max())),
        errors::InvalidArgument("Output elements too large for GPU kernel"));

    Tensor* output = nullptr;
    OP_REQUIRES_OK(context, context->allocate_output(0, out_shape, &output));

    // If there is nothing to compute, return.
    if (out_shape.num_elements() == 0) {
      return;
    }

    // TODO(csigg): Have autotune decide if native is faster than cuDNN.
    // If in_depth == 1, this operation is just a standard convolution.
    // Depthwise convolution is a special case of cuDNN's grouped convolution.
    bool use_cudnn =
        std::is_same<Device, GPUDevice>::value &&
        (in_depth == 1 || (use_cudnn_grouped_conv_ &&
                           ShouldCudnnGroupedConvolutionBeUsed(
                               filter_rows, filter_cols, in_depth, out_depth)));

    VLOG(2) << "DepthwiseConv2dNative: "
            << " Input: [" << batch << ", " << input_rows << ", " << input_cols
            << ", " << in_depth << "]; Filter: [" << filter_rows << ", "
            << filter_cols << ", " << in_depth << ", " << depth_multiplier
            << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols
            << ", " << out_depth << "], stride = " << stride_
            << ", pad_top = " << pad_top << ", pad_left = " << pad_left
            << ", Use cuDNN: " << use_cudnn;

    if (use_cudnn) {
      // Reshape from TF depthwise filter to cuDNN grouped convolution filter:
      //
      //                  | TensorFlow       | cuDNN
      // --------------------------------------------------------------------
      // filter_out_depth | depth_multiplier | depth_multiplier * group_count
      // filter_in_depth  | in_depth         | in_depth / group_count
      //
      // For depthwise convolution, we have group_count == in_depth.
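      // For example, a TF depthwise filter of shape [3, 3, 8, 2] is handed to
      // cuDNN as a grouped-convolution filter of shape [3, 3, 1, 16] with
      // group_count = 8.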
      int32_t filter_in_depth = 1;
      TensorShape shape =
          TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth};
      Tensor reshaped_filter(/*type=*/dtype_);
      OP_REQUIRES(
          context, reshaped_filter.CopyFrom(filter, shape),
          errors::Internal(
              "Failed to reshape filter tensor for grouped convolution."));
      // TODO(yangzihao): Send in arbitrary dilation rates after the dilated
      // conv is supported.
      launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, input,
                reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1,
                stride_, stride_, padding_, explicit_paddings_, output,
                data_format_);
      return;
    }

    DepthwiseArgs args;
    args.batch = batch;
    args.in_rows = input_rows;
    args.in_cols = input_cols;
    args.in_depth = in_depth;
    args.filter_rows = filter_rows;
    args.filter_cols = filter_cols;
    args.depth_multiplier = depth_multiplier;
    args.stride = stride_;
    args.pad_rows = pad_top;
    args.pad_cols = pad_left;
    args.out_rows = out_rows;
    args.out_cols = out_cols;
    args.out_depth = out_depth;

    auto input_ptr = input.template flat<T>().data();
    auto filter_ptr = filter.template flat<T>().data();
    auto output_ptr = output->template flat<T>().data();
    LaunchDepthwiseConvOp<Device, T>()(context, args, input_ptr, filter_ptr,
                                       output_ptr, data_format_);
  }

 protected:
  bool use_cudnn_grouped_conv_;

 private:
  std::vector<int32_t> strides_;
  Padding padding_;
  std::vector<int64_t> explicit_paddings_;
  TensorFormat data_format_;

  int64_t stride_;  // in height/width dimension.

  // For in_depth == 1 and grouped convolutions.
  LaunchConv2DOp<Device, T> launcher_;
  bool cudnn_use_autotune_;
  DataType dtype_;

  TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeOp);
};

#define REGISTER_CPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNative").Device(DEVICE_CPU).TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeOp<CPUDevice, T>)

TF_CALL_bfloat16(REGISTER_CPU_KERNEL);
TF_CALL_half(REGISTER_CPU_KERNEL);
TF_CALL_float(REGISTER_CPU_KERNEL);
#if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG)
TF_CALL_double(REGISTER_CPU_KERNEL);
#endif

#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM

#define REGISTER_GPU_KERNEL(T)                                                 \
  REGISTER_KERNEL_BUILDER(                                                     \
      Name("DepthwiseConv2dNative").Device(DEVICE_GPU).TypeConstraint<T>("T"), \
      DepthwiseConv2dNativeOp<GPUDevice, T>)

TF_CALL_half(REGISTER_GPU_KERNEL);
TF_CALL_float(REGISTER_GPU_KERNEL);
TF_CALL_double(REGISTER_GPU_KERNEL);

#if CUDNN_VERSION >= 7000
template <typename T>
class DepthwiseConv2dGroupedConvOp
    : public DepthwiseConv2dNativeOp<GPUDevice, T> {
 public:
  explicit DepthwiseConv2dGroupedConvOp(OpKernelConstruction* context)
      : DepthwiseConv2dNativeOp<GPUDevice, T>(context) {
    this->use_cudnn_grouped_conv_ = true;
  }
};

#define REGISTER_GROUPED_CONV_KERNEL(T)                            \
  REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNative")            \
                              .Device(DEVICE_GPU)                  \
                              .TypeConstraint<T>("T")              \
                              .Label("cudnn_grouped_convolution"), \
                          DepthwiseConv2dGroupedConvOp<T>)

TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL);
TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL);
#endif  // CUDNN_VERSION
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM

}  // namespace tensorflow