1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #define EIGEN_USE_THREADS |
17 | |
18 | #include <algorithm> |
19 | #include <cmath> |
20 | |
21 | #include "tensorflow/core/framework/bounds_check.h" |
22 | #include "tensorflow/core/framework/kernel_shape_util.h" |
23 | #include "tensorflow/core/framework/numeric_op.h" |
24 | #include "tensorflow/core/framework/op_kernel.h" |
25 | #include "tensorflow/core/framework/register_types.h" |
26 | #include "tensorflow/core/framework/tensor.h" |
27 | #include "tensorflow/core/framework/tensor_shape.h" |
28 | #include "tensorflow/core/framework/tensor_types.h" |
29 | #include "tensorflow/core/framework/types.h" |
30 | #include "tensorflow/core/kernels/cast_op.h" |
31 | #include "tensorflow/core/kernels/conv_grad_ops.h" |
32 | #include "tensorflow/core/kernels/depthwise_conv_op.h" |
33 | #include "tensorflow/core/lib/core/status.h" |
34 | #include "tensorflow/core/platform/logging.h" |
35 | #include "tensorflow/core/platform/types.h" |
36 | #include "tensorflow/core/util/determinism.h" |
37 | #include "tensorflow/core/util/padding.h" |
38 | #include "tensorflow/core/util/tensor_format.h" |
39 | #include "tensorflow/core/util/use_cudnn.h" |
40 | #include "tensorflow/core/util/work_sharder.h" |
41 | |
42 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
43 | |
44 | #if GOOGLE_CUDA |
45 | #include "third_party/gpus/cudnn/cudnn.h" |
46 | #endif |
47 | |
48 | #include "tensorflow/core/platform/stream_executor.h" |
49 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
50 | |
51 | namespace tensorflow { |
52 | |
53 | // Gradient operations for depthwise convolution. |
54 | |
55 | typedef Eigen::ThreadPoolDevice CPUDevice; |
56 | typedef Eigen::GpuDevice GPUDevice; |
57 | |
58 | // Common code between the two backward pass kernels: verifies that the |
59 | // dimensions all match and extracts the padded rows and columns. |
60 | #define EXTRACT_AND_VERIFY_DIMENSIONS(label) \ |
61 | const Tensor& out_backprop = context->input(2); \ |
62 | OP_REQUIRES( \ |
63 | context, input_shape.dims() == 4, \ |
64 | errors::InvalidArgument(label, ": input must be 4-dimensional")); \ |
65 | OP_REQUIRES( \ |
66 | context, filter_shape.dims() == 4, \ |
67 | errors::InvalidArgument(label, ": filter must be 4-dimensional")); \ |
68 | OP_REQUIRES( \ |
69 | context, out_backprop.dims() == 4, \ |
70 | errors::InvalidArgument(label, ": out_backprop must be 4-dimensional")); \ |
71 | const int64_t batch = input_shape.dim_size(0); \ |
72 | OP_REQUIRES( \ |
73 | context, batch == out_backprop.dim_size(0), \ |
74 | errors::InvalidArgument( \ |
75 | label, ": input and out_backprop must have the same batch size")); \ |
76 | const int64_t input_rows_raw = GetTensorDim(input_shape, data_format_, 'H'); \ |
77 | OP_REQUIRES( \ |
78 | context, \ |
79 | FastBoundsCheck(input_rows_raw, std::numeric_limits<int32>::max()), \ |
80 | errors::InvalidArgument("Input rows too large")); \ |
81 | const int32 input_rows = static_cast<int32>(input_rows_raw); \ |
82 | const int64_t input_cols_raw = GetTensorDim(input_shape, data_format_, 'W'); \ |
83 | OP_REQUIRES( \ |
84 | context, \ |
85 | FastBoundsCheck(input_cols_raw, std::numeric_limits<int32>::max()), \ |
86 | errors::InvalidArgument("Input cols too large")); \ |
87 | const int32 input_cols = static_cast<int32>(input_cols_raw); \ |
88 | const int64_t filter_rows = filter_shape.dim_size(0); \ |
89 | const int64_t filter_cols = filter_shape.dim_size(1); \ |
90 | const int64_t output_rows_raw = \ |
91 | GetTensorDim(out_backprop.shape(), data_format_, 'H'); \ |
92 | OP_REQUIRES( \ |
93 | context, \ |
94 | FastBoundsCheck(output_rows_raw, std::numeric_limits<int32>::max()), \ |
95 | errors::InvalidArgument("Output rows too large")); \ |
96 | const int32 output_rows = static_cast<int32>(output_rows_raw); \ |
97 | const int64_t output_cols_raw = \ |
98 | GetTensorDim(out_backprop.shape(), data_format_, 'W'); \ |
99 | OP_REQUIRES( \ |
100 | context, \ |
101 | FastBoundsCheck(output_cols_raw, std::numeric_limits<int32>::max()), \ |
102 | errors::InvalidArgument("Output cols too large")); \ |
103 | const int32 output_cols = static_cast<int32>(output_cols_raw); \ |
104 | const int64_t in_depth = GetTensorDim(input_shape, data_format_, 'C'); \ |
105 | OP_REQUIRES(context, in_depth == filter_shape.dim_size(2), \ |
106 | errors::InvalidArgument( \ |
107 | label, ": input and filter must have the same in_depth")); \ |
108 | const int64_t depth_multiplier = filter_shape.dim_size(3); \ |
109 | const int64_t out_depth_raw = \ |
110 | GetTensorDim(out_backprop.shape(), data_format_, 'C'); \ |
111 | OP_REQUIRES( \ |
112 | context, \ |
113 | FastBoundsCheck(out_depth_raw, std::numeric_limits<int32>::max()), \ |
114 | errors::InvalidArgument("Output depth too large")); \ |
115 | const int32 out_depth = static_cast<int32>(out_depth_raw); \ |
116 | OP_REQUIRES( \ |
117 | context, (depth_multiplier * in_depth) == out_depth, \ |
118 | errors::InvalidArgument( \ |
119 | label, ": depth_multiplier * in_depth not equal to out_depth")); \ |
120 | const auto stride = stride_; \ |
121 | int64_t out_rows = 0, out_cols = 0, pad_top = 0, pad_bottom = 0, \ |
122 | pad_left = 0, pad_right = 0; \ |
123 | if (padding_ == Padding::EXPLICIT) { \ |
124 | GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'H', &pad_top, \ |
125 | &pad_bottom); \ |
126 | GetExplicitPaddingForDim(explicit_paddings_, data_format_, 'W', &pad_left, \ |
127 | &pad_right); \ |
128 | } \ |
129 | OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( \ |
130 | input_rows, filter_rows, stride_, padding_, \ |
131 | &out_rows, &pad_top, &pad_bottom)); \ |
132 | OP_REQUIRES_OK(context, GetWindowedOutputSizeVerbose( \ |
133 | input_cols, filter_cols, stride_, padding_, \ |
134 | &out_cols, &pad_left, &pad_right)); \ |
135 | OP_REQUIRES( \ |
136 | context, output_rows == out_rows, \ |
137 | errors::InvalidArgument( \ |
138 | label, ": Number of rows of out_backprop doesn't match computed: ", \ |
139 | "actual = ", output_rows, ", computed = ", out_rows)); \ |
140 | OP_REQUIRES( \ |
141 | context, output_cols == out_cols, \ |
142 | errors::InvalidArgument( \ |
143 | label, ": Number of cols of out_backprop doesn't match computed: ", \ |
144 | "actual = ", output_cols, ", computed = ", out_cols)); \ |
145 | DepthwiseArgs args; \ |
146 | args.batch = batch; \ |
147 | args.in_rows = input_rows; \ |
148 | args.in_cols = input_cols; \ |
149 | args.in_depth = in_depth; \ |
150 | args.filter_rows = filter_rows; \ |
151 | args.filter_cols = filter_cols; \ |
152 | args.depth_multiplier = depth_multiplier; \ |
153 | args.stride = stride; \ |
154 | args.pad_rows = pad_top; \ |
155 | args.pad_cols = pad_left; \ |
156 | args.out_rows = out_rows; \ |
157 | args.out_cols = out_cols; \ |
158 | args.out_depth = out_depth; \ |
159 | VLOG(2) << "DepthwiseConv2d: " << label << " Input: [" << batch << ", " \ |
160 | << input_rows << ", " << input_cols << ", " << in_depth \ |
161 | << "]; Filter: [" << filter_rows << ", " << filter_cols << ", " \ |
162 | << in_depth << ", " << depth_multiplier << "]; stride = " << stride \ |
163 | << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left \ |
164 | << ", output: [" << batch << ", " << out_rows << ", " << out_cols \ |
165 | << ", " << out_depth << "]"; |
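// EX (illustrative): for an NHWC input of shape [2, 5, 5, 3] with filter
// [3, 3, 3, 2], stride 2 and VALID padding, GetWindowedOutputSizeVerbose
// yields out_rows = out_cols = 2 with zero padding, so out_backprop must
// have shape [2, 2, 2, 6] (out_depth = in_depth * depth_multiplier = 6).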
166 | |
167 | // Copies data from local region in 'out_backprop' into 'buffer'. |
168 | // The local region coordinates are calculated as the set of output points which |
169 | // used the input point ('in_r', 'in_c') as input during the forward pass. |
170 | // Rather than spatially reversing the filter, the input is reversed during |
171 | // the copy. The copied data is padded to vector register-width boundaries so |
172 | // that it is aligned for efficient traversal and vector multiply-add by the |
173 | // depthwise input kernel. |
174 | // |
175 | // EX: |
176 | // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 |
177 | // |
178 | // 'out_backprop': [batch, out_rows, out_cols, out_depth] |
179 | // |
180 | // [a00, a01, a10, a11] [a20, a21, b00, b01] |
181 | // [b10, b11, b20, b21] [...] |
182 | // [e00, e01, e10, e11] [e20, e21, f00, f01] |
183 | // [f10, f11, f20, f21] [...] |
184 | // |
185 | // 'buffer' (register boundaries shown): |
186 | // |
187 | // [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0 |
188 | // [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1 |
189 | // [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0 |
190 | // [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1 |
191 | // |
192 | template <typename T> |
193 | static void CopyOutputBackpropRegion(const DepthwiseArgs& args, |
194 | const int64_t padded_filter_inner_dim_size, |
195 | const int64_t in_r, const int64_t in_c, |
196 | const T* out_backprop, T* buffer) { |
197 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
198 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
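// Note: the packet width depends on the SIMD target Eigen was compiled for;
// e.g. with AVX, packet_traits<float>::type holds 8 floats, giving
// kPacketSize == 8 for T = float (illustrative, not a guarantee).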
199 | |
200 | const int64_t stride = args.stride; |
201 | const int64_t filter_rows = args.filter_rows; |
202 | const int64_t filter_cols = args.filter_cols; |
203 | const int64_t pad_rows = args.pad_rows; |
204 | const int64_t pad_cols = args.pad_cols; |
205 | const int64_t out_rows = args.out_rows; |
206 | const int64_t out_cols = args.out_cols; |
207 | |
208 | // Calculate the output spatial region which used point (in_r, in_c) as input. |
209 | const int64_t out_r_start = |
210 | std::max(static_cast<int64_t>(0), |
211 | (in_r - filter_rows + pad_rows + stride) / stride); |
212 | const int64_t out_r_end = std::min(out_rows - 1, (in_r + pad_rows) / stride); |
213 | const int64_t out_c_start = |
214 | std::max(static_cast<int64_t>(0), |
215 | (in_c - filter_cols + pad_cols + stride) / stride); |
216 | const int64_t out_c_end = std::min(out_cols - 1, (in_c + pad_cols) / stride); |
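// EX (illustrative): with stride = 2, pad_rows = 1, filter_rows = 3 and
// in_r = 4, out_r_start = (4 - 3 + 1 + 2) / 2 = 2 and
// out_r_end = (4 + 1) / 2 = 2, i.e. only output row 2 read input row 4
// during the forward pass (it covers input rows 3..5).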
217 | |
218 | // Zero-pad 'buffer' if output region is smaller than filter spatial size. |
219 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
220 | if ((out_r_end - out_r_start + 1) < args.filter_rows || |
221 | (out_c_end - out_c_start + 1) < args.filter_cols) { |
222 | memset(buffer, 0, |
223 | filter_spatial_size * padded_filter_inner_dim_size * sizeof(T)); |
224 | } |
225 | |
226 | // Calculate vectorized and scalar (residual) lengths for 'out_depth'. |
227 | const int64_t vectorized_size = (args.out_depth / kPacketSize) * kPacketSize; |
228 | const int64_t scalar_size = args.out_depth % kPacketSize; |
229 | const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0; |
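// EX (illustrative): out_depth = 6 with kPacketSize = 4 (as in the example
// above) gives vectorized_size = 4, scalar_size = 2 and pad_size = 2, which
// matches the two zero-padded slots per register shown for 'buffer'.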
230 | |
231 | for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) { |
232 | const int64_t f_r = in_r + pad_rows - out_r * stride; |
233 | for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) { |
234 | const int64_t f_c = in_c + pad_cols - out_c * stride; |
235 | const int64_t buf_base = |
236 | (f_r * filter_cols + f_c) * padded_filter_inner_dim_size; |
237 | // Calculate index into 'out_backprop' for coordinate (out_r, out_c). |
238 | auto* out_bprop = |
239 | out_backprop + (out_r * args.out_cols + out_c) * args.out_depth; |
240 | |
241 | // Copy vectorized portion of inner dimension into 'buffer'. |
242 | for (int64_t d = 0; d < vectorized_size; d += kPacketSize) { |
243 | auto v = Eigen::internal::ploadu<Packet>(out_bprop + d); |
244 | Eigen::internal::pstoreu<T>(buffer + buf_base + d, v); |
245 | } |
246 | // Copy scalar portion of out_bprop to 'buffer' |
247 | for (int64_t d = 0; d < scalar_size; ++d) { |
248 | buffer[buf_base + vectorized_size + d] = out_bprop[vectorized_size + d]; |
249 | } |
250 | // Pad to vector-register width (if needed). |
251 | for (int64_t d = 0; d < pad_size; ++d) { |
252 | buffer[buf_base + vectorized_size + scalar_size + d] = |
253 | static_cast<T>(0); |
254 | } |
255 | } |
256 | } |
257 | } |
258 | |
259 | // Computes the vectorized product of 'buffer' and 'filter' and stores |
260 | // result in 'output' at location computed from 'in_r' and 'in_c'. |
261 | // If depth_multiplier is > 1, the intermediate output is reduced along |
262 | // the depth_multiplier dimension. |
263 | // |
264 | // EX: |
265 | // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 |
266 | // Both 'input_buffer' and 'filter' are padded to register-width boundaries. |
267 | // |
268 | // 'buffer' [rows, cols, in_depth, depth_multiplier] |
269 | // |
270 | // [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0 |
271 | // [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1 |
272 | // [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0 |
273 | // [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1 |
274 | // |
275 | // filter [rows, cols, in_depth, depth_multiplier] |
276 | // [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0] |
277 | // [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0] |
278 | // |
279 | // First output register [in_depth, depth_multiplier] |
280 | // [q00, q01, q10, q11] = ([f00, f01, f10, f11] x [u0, v0, w0, x0]) + |
281 | // ([e00, e01, e10, e11] x [u1, v1, w1, x1]) + |
282 | // ([b00, b01, b10, b11] x [u2, v2, w2, x2]) + |
283 | // ([a00, a01, a10, a11] x [u3, v3, w3, x3]) |
284 | // |
285 | // Reduction step along depth-multiplier dimension: |
286 | // |
287 | // [q00, q01, q10, q11] [q20, q21, 0, 0] -> [r0, r1, r2, 0] |
288 | // |
289 | |
290 | template <typename T> |
291 | static void ComputeBackpropInput(const DepthwiseArgs& args, |
292 | const int64_t padded_filter_inner_dim_size, |
293 | const int64_t in_r, const int64_t in_c, |
294 | const T* filter, const T* buffer, |
295 | T* out_buffer, T* output) { |
296 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
297 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
298 | |
299 | const int64_t in_depth = args.in_depth; |
300 | const int64_t depth_multiplier = args.depth_multiplier; |
301 | const int64_t out_depth = args.out_depth; |
302 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
303 | |
304 | // Calculate vectorized and scalar lengths of 'out_depth'. |
305 | const int64_t output_vectorized_size = |
306 | (out_depth / kPacketSize) * kPacketSize; |
307 | const int64_t output_scalar_size = out_depth % kPacketSize; |
308 | |
309 | // Calculate base index at which to begin writing output. |
310 | const int64_t base_output_index = (in_r * args.in_cols + in_c) * in_depth; |
311 | |
312 | // Calculate vectorized and scalar lengths for 'depth_multiplier'. This is |
313 | // used to efficiently reduce output when 'depth_multiplier' > kPacketSize. |
314 | const int64_t dm_vectorized_size = |
315 | (depth_multiplier / kPacketSize) * kPacketSize; |
316 | const int64_t dm_scalar_size = depth_multiplier % kPacketSize; |
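// EX (illustrative): depth_multiplier = 2 with kPacketSize = 4 gives
// dm_vectorized_size = 0 and dm_scalar_size = 2, so the reduction below is
// handled entirely by the scalar accumulation loop.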
317 | |
318 | for (int i = 0; i < output_vectorized_size; i += kPacketSize) { |
319 | // Reset accumulator. |
320 | auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0)); |
321 | for (int j = 0; j < filter_spatial_size; ++j) { |
322 | // Calculate index. |
323 | const int64_t index = i + j * padded_filter_inner_dim_size; |
324 | // Load filter. |
325 | const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index); |
326 | // Load input. |
327 | const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index); |
328 | // Vector multiply-add. |
329 | vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum); |
330 | } |
331 | if (depth_multiplier == 1) { |
332 | // Write directly to the output. |
333 | Eigen::internal::pstoreu<T>(output + base_output_index + i, vaccum); |
334 | } else { |
335 | // Buffer output for subsequent reduction step. |
336 | Eigen::internal::pstoreu<T>(out_buffer + i, vaccum); |
337 | } |
338 | } |
339 | |
340 | if (output_scalar_size > 0) { |
341 | auto vaccum = Eigen::internal::pset1<Packet>(static_cast<T>(0)); |
342 | for (int j = 0; j < filter_spatial_size; ++j) { |
343 | const int64_t index = |
344 | output_vectorized_size + j * padded_filter_inner_dim_size; |
345 | const auto filter_block = Eigen::internal::ploadu<Packet>(filter + index); |
346 | const auto data_block = Eigen::internal::ploadu<Packet>(buffer + index); |
347 | vaccum = Eigen::internal::pmadd<Packet>(filter_block, data_block, vaccum); |
348 | } |
349 | // Load accumulator into an array and loop through output. |
350 | T out_buf[kPacketSize]; |
351 | Eigen::internal::pstoreu<T>(out_buf, vaccum); |
352 | if (depth_multiplier == 1) { |
353 | // Write directly to the output. |
354 | for (int j = 0; j < output_scalar_size; ++j) { |
355 | output[base_output_index + output_vectorized_size + j] = out_buf[j]; |
356 | } |
357 | } else { |
358 | // Buffer output for subsequent reduction step. |
359 | for (int j = 0; j < output_scalar_size; ++j) { |
360 | out_buffer[output_vectorized_size + j] = out_buf[j]; |
361 | } |
362 | } |
363 | } |
364 | |
365 | // Iterate over 'in_depth', reduce over 'depth_multiplier', write 'output'. |
366 | if (depth_multiplier > 1) { |
367 | for (int64_t d = 0; d < in_depth; ++d) { |
368 | const int64_t index = d * args.depth_multiplier; |
369 | T accum = static_cast<T>(0); |
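// predux() horizontally sums the packet lanes, reducing kPacketSize
// depth-multiplier entries of channel 'd' into one partial sum.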
370 | for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize) { |
371 | const auto v = Eigen::internal::ploadu<Packet>(out_buffer + index + dm); |
372 | accum += Eigen::internal::predux(v); |
373 | } |
374 | // Accumulate the scalar portion of the replicated output. |
375 | for (int64_t dm = 0; dm < dm_scalar_size; ++dm) { |
376 | accum += out_buffer[index + dm_vectorized_size + dm]; |
377 | } |
378 | // Copy to output. |
379 | output[base_output_index + d] = accum; |
380 | } |
381 | } |
382 | } |
383 | |
384 | // Computes the depthwise conv2d backprop input of 'out_backprop' by |
385 | // 'depthwise_filter' and stores the result in 'in_backprop'. |
386 | template <typename T> |
387 | struct LaunchDepthwiseConvBackpropInputOp<CPUDevice, T> { |
388 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
389 | |
390 | void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, |
391 | const T* out_backprop, const T* depthwise_filter, |
392 | T* in_backprop, TensorFormat data_format) { |
393 | OP_REQUIRES( |
394 | ctx, data_format == FORMAT_NHWC, |
395 | errors::Unimplemented( |
396 | "Depthwise convolution on CPU is only supported for NHWC format")); |
397 | |
398 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
399 | |
400 | // Pad 'depthwise_filter' to vector register width (if needed). |
401 | const bool pad_filter = (args.out_depth % kPacketSize) != 0; |
402 | Tensor padded_filter; |
403 | if (pad_filter) { |
404 | // Allocate space for padded filter. |
405 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
406 | const int64_t padded_filter_inner_dim_size = |
407 | ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize; |
408 | OP_REQUIRES_OK( |
409 | ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
410 | TensorShape({filter_spatial_size, |
411 | padded_filter_inner_dim_size}), |
412 | &padded_filter)); |
413 | // Write out padded filter. |
414 | functor::DepthwiseFilterPadOp<T>()( |
415 | args, depthwise_filter, padded_filter.template flat<T>().data()); |
416 | } |
417 | const T* filter_data = |
418 | pad_filter ? padded_filter.template flat<T>().data() : depthwise_filter; |
419 | |
420 | // Computes one shard of depthwise conv2d backprop input. |
421 | auto shard = [&ctx, &args, &out_backprop, &filter_data, &in_backprop]( |
422 | int64_t start, int64_t limit) { |
423 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
424 | |
425 | const int64_t input_image_size = |
426 | args.in_rows * args.in_cols * args.in_depth; |
427 | const int64_t output_image_size = |
428 | args.out_rows * args.out_cols * args.out_depth; |
429 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
430 | const int64_t padded_filter_inner_dim_size = |
431 | ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize; |
432 | |
433 | // Allocate buffer to copy regions from 'out_backprop'. |
434 | Tensor out_bprop_buffer; |
435 | OP_REQUIRES_OK( |
436 | ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
437 | TensorShape({filter_spatial_size, |
438 | padded_filter_inner_dim_size}), |
439 | &out_bprop_buffer)); |
440 | T* out_bprop_buf = out_bprop_buffer.template flat<T>().data(); |
441 | |
442 | // Allocate buffer for intermediate results. |
443 | Tensor in_bprop_buffer; |
444 | OP_REQUIRES_OK( |
445 | ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
446 | TensorShape({padded_filter_inner_dim_size}), |
447 | &in_bprop_buffer)); |
448 | T* in_bprop_buf = in_bprop_buffer.template flat<T>().data(); |
449 | |
450 | for (int64_t b = start; b < limit; ++b) { |
451 | for (int64_t in_r = 0; in_r < args.in_rows; ++in_r) { |
452 | for (int64_t in_c = 0; in_c < args.in_cols; ++in_c) { |
453 | // Populate 'out_bprop_buf' from local 'out_backprop' region. |
454 | CopyOutputBackpropRegion<T>( |
455 | args, padded_filter_inner_dim_size, in_r, in_c, |
456 | out_backprop + b * output_image_size, out_bprop_buf); |
457 | |
458 | // Compute depthwise backprop input. |
459 | ComputeBackpropInput<T>(args, padded_filter_inner_dim_size, in_r, |
460 | in_c, filter_data, out_bprop_buf, |
461 | in_bprop_buf, |
462 | in_backprop + b * input_image_size); |
463 | } |
464 | } |
465 | } |
466 | }; |
467 | |
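// Shard the work by batch image; 'shard_cost' is a rough per-image cost
// estimate (input spatial size * out_depth) used by the work sharder to
// split the batch across the available worker threads.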
468 | const int64_t shard_cost = args.in_rows * args.in_cols * args.out_depth; |
469 | auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); |
470 | Shard(worker_threads.num_threads, worker_threads.workers, args.batch, |
471 | shard_cost, shard); |
472 | } |
473 | }; |
474 | |
475 | template <typename T> |
476 | static void DepthwiseConvBackpropInputReference(const DepthwiseArgs& args, |
477 | const T* out_backprop, |
478 | const T* filter, |
479 | T* in_backprop) { |
480 | // Naive for loop as a reference point without concerns about performance. |
481 | for (int b = 0; b < args.batch; ++b) { |
482 | for (int in_r = 0; in_r < args.in_rows; ++in_r) { |
483 | for (int in_c = 0; in_c < args.in_cols; ++in_c) { |
484 | for (int in_d = 0; in_d < args.in_depth; ++in_d) { |
485 | T sum = 0; |
486 | const int stride = args.stride; |
487 | const int out_d_start = in_d * args.depth_multiplier; |
488 | const int out_d_end = out_d_start + args.depth_multiplier; |
489 | |
490 | for (int out_d = out_d_start; out_d < out_d_end; ++out_d) { |
491 | const int out_r_start = std::max( |
492 | 0, (in_r - args.filter_rows + args.pad_rows + stride) / stride); |
493 | const int out_r_end = |
494 | std::min(args.out_rows - 1, (in_r + args.pad_rows) / stride); |
495 | |
496 | for (int out_r = out_r_start; out_r <= out_r_end; ++out_r) { |
497 | const int out_c_start = std::max( |
498 | 0, |
499 | (in_c - args.filter_cols + args.pad_cols + stride) / stride); |
500 | const int out_c_end = |
501 | std::min(args.out_cols - 1, (in_c + args.pad_cols) / stride); |
502 | |
503 | for (int out_c = out_c_start; out_c <= out_c_end; ++out_c) { |
504 | int f_r = in_r + args.pad_rows - out_r * stride; |
505 | int f_c = in_c + args.pad_cols - out_c * stride; |
506 | int filter_dm = out_d - out_d_start; |
507 | int out_backprop_offset = |
508 | out_d + |
509 | args.out_depth * |
510 | (out_c + args.out_cols * (out_r + args.out_rows * b)); |
511 | int filter_offset = |
512 | filter_dm + |
513 | args.depth_multiplier * |
514 | (in_d + args.in_depth * (f_c + args.filter_cols * f_r)); |
515 | sum += |
516 | out_backprop[out_backprop_offset] * filter[filter_offset]; |
517 | } |
518 | } |
519 | } |
520 | |
521 | int in_backprop_offset = |
522 | in_d + |
523 | args.in_depth * (in_c + args.in_cols * (in_r + args.in_rows * b)); |
524 | in_backprop[in_backprop_offset] = sum; |
525 | } |
526 | } |
527 | } |
528 | } |
529 | } |
530 | |
531 | // Extern template instantiated in conv_grad_input_ops.cc. |
532 | extern template struct LaunchConv2DBackpropInputOp<CPUDevice, bfloat16>; |
533 | extern template struct LaunchConv2DBackpropInputOp<CPUDevice, Eigen::half>; |
534 | extern template struct LaunchConv2DBackpropInputOp<CPUDevice, float>; |
535 | extern template struct LaunchConv2DBackpropInputOp<CPUDevice, double>; |
536 | |
537 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
538 | |
539 | // Extern template instantiated in conv_grad_input_ops.cc. |
540 | extern template struct LaunchConv2DBackpropInputOp<GPUDevice, Eigen::half>; |
541 | extern template struct LaunchConv2DBackpropInputOp<GPUDevice, float>; |
542 | extern template struct LaunchConv2DBackpropInputOp<GPUDevice, double>; |
543 | |
544 | // Extern template instantiated in depthwise_conv_op_gpu.cu.cc. |
545 | extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, |
546 | Eigen::half>; |
547 | extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, float>; |
548 | extern template struct LaunchDepthwiseConvBackpropInputOp<GPUDevice, double>; |
549 | |
550 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
551 | |
552 | // Kernel to compute the input backprop for depthwise convolution. |
553 | template <typename Device, class T> |
554 | class DepthwiseConv2dNativeBackpropInputOp : public OpKernel { |
555 | public: |
556 | explicit DepthwiseConv2dNativeBackpropInputOp(OpKernelConstruction* context) |
557 | : OpKernel(context) { |
558 | OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); |
559 | OP_REQUIRES(context, strides_.size() == 4, |
560 | errors::InvalidArgument("Sliding window strides field must " |
561 | "specify 4 dimensions")); |
562 | |
563 | string data_format; |
564 | OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); |
565 | OP_REQUIRES(context, FormatFromString(data_format, &data_format_), |
566 | errors::InvalidArgument("Invalid data format")); |
567 | |
568 | stride_ = GetTensorDim(strides_, data_format_, 'H'); |
569 | const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W'); |
570 | const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N'); |
571 | const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C'); |
572 | |
573 | OP_REQUIRES(context, stride_ == stride_w, |
574 | errors::InvalidArgument( |
575 | "Current implementation only supports equal length " |
576 | "strides in the row and column dimensions.")); |
577 | OP_REQUIRES( |
578 | context, (stride_n == 1 && stride_c == 1), |
579 | errors::InvalidArgument("Current implementation does not yet support " |
580 | "strides in the batch and depth dimensions.")); |
581 | OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); |
582 | OP_REQUIRES_OK(context, |
583 | context->GetAttr("explicit_paddings", &explicit_paddings_)); |
584 | OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, |
585 | /*num_dims=*/4, data_format_)); |
586 | |
587 | cudnn_use_autotune_ = CudnnUseAutotune(); |
588 | dtype_ = DataTypeToEnum<T>::value; |
589 | #if CUDNN_VERSION >= 8000 |
590 | // From the cuDNN release note 8.0: We’ve extended the fprop and dgrad |
591 | // NHWC depthwise kernels to support more combinations (filter |
592 | // sizes/strides) such as 5x5/1x1, 5x5/2x2, 7x7/1x1, 7x7/2x2 (in addition |
593 | // to what we already have, 1x1/1x1, 3x3/1x1, 3x3/2x2), which provides |
594 | // good performance. (https://docs.nvidia.com/deeplearning/sdk/cudnn- |
595 | // release-notes/rel_8.html#rel_8) |
596 | use_cudnn_grouped_conv_ = |
597 | dtype_ == DT_HALF && |
598 | ((data_format_ == FORMAT_NCHW && stride_ == 1 && stride_w == 1) || |
599 | (data_format_ == FORMAT_NHWC && stride_ == stride_w && |
600 | (stride_ == 1 || stride_ == 2))); |
601 | #elif CUDNN_VERSION >= 7603 |
602 | // Use CuDNN grouped conv (input gradient) when stride = 1, input/output is |
603 | // NCHW and float16(half). See cudnn release note 7.6.3 (https://docs.nvidi |
604 | // a.com/deeplearning/sdk/cudnn-release-notes/rel_763.html#rel_763). |
605 | use_cudnn_grouped_conv_ = dtype_ == DT_HALF && |
606 | data_format_ == FORMAT_NCHW && stride_ == 1 && |
607 | stride_w == 1; |
608 | #else |
609 | use_cudnn_grouped_conv_ = false; |
610 | #endif |
611 | } |
612 | |
613 | void Compute(OpKernelContext* context) override { |
614 | const Tensor& input_sizes = context->input(0); |
615 | const Tensor& filter = context->input(1); |
616 | OP_REQUIRES( |
617 | context, TensorShapeUtils::IsVector(input_sizes.shape()), |
618 | errors::InvalidArgument( |
619 | "Conv2DBackpropInput: input_sizes input must be 1-dim, not ", |
620 | input_sizes.dims())); |
621 | TensorShape input_shape; |
622 | const int32* in_sizes_data = input_sizes.template flat<int32>().data(); |
623 | |
624 | for (int i = 0; i < input_sizes.NumElements(); ++i) { |
625 | OP_REQUIRES(context, in_sizes_data[i] >= 0, |
626 | errors::InvalidArgument("Dimension ", i, |
627 | " of input_sizes must be >= 0")); |
628 | OP_REQUIRES_OK(context, input_shape.AddDimWithStatus(in_sizes_data[i])); |
629 | } |
630 | const TensorShape& filter_shape = filter.shape(); |
631 | EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropInput"); |
632 | |
633 | Tensor* in_backprop = nullptr; |
634 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
635 | {0}, 0, input_shape, &in_backprop)); |
636 | |
637 | // If there is nothing to compute, return. |
638 | if (input_shape.num_elements() == 0) { |
639 | return; |
640 | } |
641 | |
642 | // If in_depth==1, this operation is just a standard convolution. |
643 | // Depthwise convolution is a special case of cuDNN's grouped convolution. |
644 | bool use_cudnn = |
645 | std::is_same<Device, GPUDevice>::value && |
646 | (in_depth == 1 || (use_cudnn_grouped_conv_ && |
647 | ShouldCudnnGroupedConvolutionBeUsed( |
648 | filter_rows, filter_cols, in_depth, out_depth))); |
649 | |
650 | VLOG(2) << "DepthwiseConv2dNativeBackpropInput: " |
651 | << " Input: [" << batch << ", " << input_rows << ", " << input_cols |
652 | << ", " << in_depth << "]; Filter: [" << filter_rows << ", " |
653 | << filter_cols << ", " << in_depth << ", " << depth_multiplier |
654 | << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols |
655 | << ", " << out_depth << "], stride = " << stride_ |
656 | << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left |
657 | << ", Use cuDNN: " << use_cudnn; |
658 | |
659 | if (use_cudnn) { |
660 | // Reshape from TF depthwise filter to cuDNN grouped convolution filter: |
661 | // |
662 | // | TensorFlow | cuDNN |
663 | // -------------------------------------------------------------------- |
664 | // filter_out_depth | depth_multiplier | depth_multiplier * group_count |
665 | // filter_in_depth | in_depth | in_depth / group_count |
666 | // |
667 | // For depthwise convolution, we have group_count == in_depth. |
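// EX (illustrative): a TF depthwise filter of shape [3, 3, 8, 2]
// (in_depth = 8, depth_multiplier = 2) is viewed below as a cuDNN
// grouped-convolution filter of shape [3, 3, 1, 16] with group_count = 8.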
668 | int32_t filter_in_depth = 1; |
669 | TensorShape shape = |
670 | TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth}; |
671 | Tensor reshaped_filter(/*type=*/dtype_); |
672 | OP_REQUIRES( |
673 | context, reshaped_filter.CopyFrom(filter, shape), |
674 | errors::Internal( |
675 | "Failed to reshape filter tensor for grouped convolution.")); |
676 | // TODO(yangzihao): Send in arbitrary dilation rates after the dilated |
677 | // conv is supported. |
678 | launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop, |
679 | reshaped_filter, /*row_dilation=*/1, /*col_dilation=*/1, |
680 | stride_, stride_, padding_, explicit_paddings_, in_backprop, |
681 | data_format_); |
682 | return; |
683 | } |
684 | |
685 | auto out_backprop_ptr = out_backprop.template flat<T>().data(); |
686 | auto filter_ptr = filter.template flat<T>().data(); |
687 | auto in_backprop_ptr = in_backprop->template flat<T>().data(); |
688 | LaunchDepthwiseConvBackpropInputOp<Device, T>()( |
689 | context, args, out_backprop_ptr, filter_ptr, in_backprop_ptr, |
690 | data_format_); |
691 | } |
692 | |
693 | protected: |
694 | bool use_cudnn_grouped_conv_; |
695 | |
696 | private: |
697 | std::vector<int32> strides_; |
698 | Padding padding_; |
699 | std::vector<int64_t> explicit_paddings_; |
700 | TensorFormat data_format_; |
701 | int64_t stride_; |
702 | |
703 | // For in_depth == 1 and grouped convolutions. |
704 | LaunchConv2DBackpropInputOp<Device, T> launcher_; |
705 | bool cudnn_use_autotune_; |
706 | DataType dtype_; |
707 | |
708 | TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropInputOp); |
709 | }; |
710 | |
711 | #define REGISTER_CPU_KERNEL(T) \ |
712 | REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \ |
713 | .Device(DEVICE_CPU) \ |
714 | .TypeConstraint<T>("T"), \ |
715 | DepthwiseConv2dNativeBackpropInputOp<CPUDevice, T>); |
716 | |
717 | TF_CALL_bfloat16(REGISTER_CPU_KERNEL); |
718 | TF_CALL_half(REGISTER_CPU_KERNEL); |
719 | TF_CALL_float(REGISTER_CPU_KERNEL); |
720 | #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG) |
721 | TF_CALL_double(REGISTER_CPU_KERNEL); |
722 | #endif |
723 | #undef REGISTER_CPU_KERNEL |
724 | |
725 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
726 | |
727 | #define REGISTER_GPU_KERNEL(T) \ |
728 | REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \ |
729 | .Device(DEVICE_GPU) \ |
730 | .TypeConstraint<T>("T") \ |
731 | .HostMemory("input_sizes"), \ |
732 | DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>) |
733 | |
734 | TF_CALL_half(REGISTER_GPU_KERNEL); |
735 | TF_CALL_float(REGISTER_GPU_KERNEL); |
736 | TF_CALL_double(REGISTER_GPU_KERNEL); |
737 | #undef REGISTER_GPU_KERNEL |
738 | |
739 | #if CUDNN_VERSION >= 7000 |
740 | template <typename T> |
741 | class DepthwiseConv2dGroupedConvBackpropInputOp |
742 | : public DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T> { |
743 | public: |
744 | DepthwiseConv2dGroupedConvBackpropInputOp(OpKernelConstruction* context) |
745 | : DepthwiseConv2dNativeBackpropInputOp<GPUDevice, T>(context) { |
746 | this->use_cudnn_grouped_conv_ = true; |
747 | } |
748 | }; |
749 | |
750 | #define REGISTER_GROUPED_CONV_KERNEL(T) \ |
751 | REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropInput") \ |
752 | .Device(DEVICE_GPU) \ |
753 | .TypeConstraint<T>("T") \ |
754 | .HostMemory("input_sizes") \ |
755 | .Label("cudnn_grouped_convolution"), \ |
756 | DepthwiseConv2dGroupedConvBackpropInputOp<T>) |
757 | |
758 | TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL); |
759 | TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL); |
760 | TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL); |
761 | #undef REGISTER_GROUPED_CONV_KERNEL |
762 | #endif // CUDNN_VERSION |
763 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
764 | |
765 | // Kernels to compute the gradients of the filters for depthwise convolution. |
766 | |
767 | // Computes filter backprop using 'out_backprop' and 'input_buffer', storing the |
768 | // result in 'output_buffer' at an index computed from 'out_r' and 'out_c'. |
769 | // |
770 | // EX: |
771 | // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 |
772 | // Both 'input_buffer' and 'filter' are padded to register-width boundaries. |
773 | // |
774 | // 'input_buffer' [rows, cols, in_depth, depth_multiplier] |
775 | // |
776 | // [f00, f01, f10, f11] [f20, f21, 0, 0] in_row = 0, in_col = 0 |
777 | // [e00, e01, e10, e11] [e20, e21, 0, 0] in_row = 0, in_col = 1 |
778 | // [b00, b01, b10, b11] [b20, b21, 0, 0] in_row = 1, in_col = 0 |
779 | // [a00, a01, a10, a11] [a20, a21, 0, 0] in_row = 1, in_col = 1 |
780 | // |
781 | // 'out_backprop' [out_rows, out_cols, in_depth, depth_multiplier] |
782 | // |
783 | // [q00, q01, q10, q11] [q20, q21, r00, r01] |
784 | // [r10, r11, r20, r21] [s00, s01, s10, s11] |
785 | // [s20, s21, t00, t01] [t10, t11, t20, t21] |
786 | // |
787 | // First output register of 'filter_backprop' |
788 | // [u0, v0, w0, x0] += ([f00, f01, f10, f11] x [q00, q01, q10, q11]) |
789 | // |
790 | template <typename T> |
791 | static void ComputeBackpropFilter(const DepthwiseArgs& args, |
792 | const int64_t padded_out_depth_size, |
793 | const int64_t out_r, const int64_t out_c, |
794 | const T* out_backprop, const T* input_buffer, |
795 | T* output_buffer) { |
796 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
797 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
798 | // Calculate vectorized size of 'padded_out_depth_size'. |
799 | const int64_t out_depth = args.out_depth; |
800 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
801 | const int64_t output_vectorized_size = |
802 | (padded_out_depth_size / kPacketSize) * kPacketSize; |
803 | const int64_t base_output_index = (out_r * args.out_cols + out_c) * out_depth; |
804 | // Determine whether we can execute fast or slow code path. |
805 | const int64_t output_image_size = |
806 | args.out_rows * args.out_cols * args.out_depth; |
807 | const int64_t output_last_vector_index = |
808 | output_image_size - (filter_spatial_size * padded_out_depth_size); |
809 | const bool fast_path = base_output_index <= output_last_vector_index; |
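// Under this condition every packet load from 'out_backprop' below stays in
// bounds: the last read ends at base_output_index + padded_out_depth_size,
// which cannot exceed output_image_size since filter_spatial_size >= 1.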
810 | |
811 | if (fast_path) { |
812 | // TODO(andydavis) Process multiple inputs in 'input_buffer' so we can |
813 | // amortize the cost of 'output_buffer' load store in the loop below. |
814 | for (int i = 0; i < output_vectorized_size; i += kPacketSize) { |
815 | // Load vector register from 'out_backprop'. |
816 | const auto out_bprop_block = |
817 | Eigen::internal::ploadu<Packet>(out_backprop + base_output_index + i); |
818 | for (int j = 0; j < filter_spatial_size; ++j) { |
819 | const int64_t index = i + j * padded_out_depth_size; |
820 | // Load vector register from 'input_buffer'. |
821 | const auto input_block = |
822 | Eigen::internal::ploadu<Packet>(input_buffer + index); |
823 | // Load output block into vector register. |
824 | auto out_block_data = output_buffer + index; |
825 | auto out_block = Eigen::internal::ploadu<Packet>(out_block_data); |
826 | // Vector multiply-add. |
827 | out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, |
828 | out_block); |
829 | // Store 'out_block' back to memory. |
830 | Eigen::internal::pstoreu<T>(out_block_data, out_block); |
831 | } |
832 | } |
833 | } else { |
834 | // Slow path (can't do vector reads from non-padded 'out_backprop'). |
835 | for (int i = 0; i < output_vectorized_size; i += kPacketSize) { |
836 | // Calculate safe read size from 'out_backprop'. |
837 | const int64_t out_bprop_index = base_output_index + i; |
838 | const int64_t out_bprop_limit = |
839 | std::min(output_image_size, out_bprop_index + kPacketSize); |
840 | T out_buf[kPacketSize]; |
841 | memset(&out_buf, 0, kPacketSize * sizeof(T)); |
842 | const int64_t scalar_size = out_bprop_limit - out_bprop_index; |
843 | for (int64_t j = 0; j < scalar_size; ++j) { |
844 | out_buf[j] = out_backprop[out_bprop_index + j]; |
845 | } |
846 | // Load vector register from 'out_buf'. |
847 | const auto out_bprop_block = Eigen::internal::ploadu<Packet>(out_buf); |
848 | for (int j = 0; j < filter_spatial_size; ++j) { |
849 | const int64_t index = i + j * padded_out_depth_size; |
850 | // Load vector register from 'input_buffer'. |
851 | const auto input_block = |
852 | Eigen::internal::ploadu<Packet>(input_buffer + index); |
853 | // Load output block into vector register. |
854 | auto out_block_data = output_buffer + index; |
855 | auto out_block = Eigen::internal::ploadu<Packet>(out_block_data); |
856 | // Vector multiply-add. |
857 | out_block = Eigen::internal::pmadd<Packet>(out_bprop_block, input_block, |
858 | out_block); |
859 | // Store 'out_block' back to memory. |
860 | Eigen::internal::pstoreu<T>(out_block_data, out_block); |
861 | } |
862 | } |
863 | } |
864 | } |
865 | |
866 | template <typename Device, typename T> |
867 | struct LaunchDepthwiseConvBackpropFilterOp; |
868 | |
869 | template <typename T> |
870 | struct LaunchDepthwiseConvBackpropFilterOp<CPUDevice, T> { |
871 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
872 | |
873 | void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, |
874 | const T* out_backprop, const T* input, T* filter_backprop, |
875 | TensorFormat data_format) { |
876 | OP_REQUIRES( |
877 | ctx, data_format == FORMAT_NHWC, |
878 | errors::Unimplemented( |
879 | "Depthwise convolution on CPU is only supported for NHWC format")); |
880 | |
881 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
882 | |
883 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
884 | const int64_t padded_out_depth_size = |
885 | ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize; |
886 | |
887 | // Allocate output buffers for each image in 'batch' (padded to vector |
888 | // register boundaries). |
889 | Tensor output_buffer; |
890 | OP_REQUIRES_OK( |
891 | ctx, ctx->allocate_temp(DataTypeToEnum<T>::value, |
892 | TensorShape({args.batch, filter_spatial_size, |
893 | padded_out_depth_size}), |
894 | &output_buffer)); |
895 | T* output_buffer_data = output_buffer.template flat<T>().data(); |
896 | |
897 | // Computes one shard of depthwise conv2d backprop filter. |
898 | auto shard = [&ctx, &args, &out_backprop, &input, &output_buffer_data]( |
899 | int64_t start, int64_t limit) { |
900 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
901 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
902 | const int64_t padded_out_depth_size = |
903 | ((args.out_depth + kPacketSize - 1) / kPacketSize) * kPacketSize; |
904 | |
905 | // Allocate buffer for local input regions. |
906 | Tensor input_buffer; |
907 | OP_REQUIRES_OK( |
908 | ctx, ctx->allocate_temp( |
909 | DataTypeToEnum<T>::value, |
910 | TensorShape({filter_spatial_size, padded_out_depth_size}), |
911 | &input_buffer)); |
912 | T* input_buffer_data = input_buffer.template flat<T>().data(); |
913 | |
914 | const int64_t input_image_size = |
915 | args.in_rows * args.in_cols * args.in_depth; |
916 | const int64_t output_image_size = |
917 | args.out_rows * args.out_cols * args.out_depth; |
918 | const int64_t padded_filter_size = |
919 | filter_spatial_size * padded_out_depth_size; |
920 | |
921 | for (int b = start; b < limit; ++b) { |
922 | // Initialize 'output_buffer' for 'b'. |
923 | auto* output_buffer = output_buffer_data + b * padded_filter_size; |
924 | memset(output_buffer, 0, padded_filter_size * sizeof(T)); |
925 | |
926 | for (int out_r = 0; out_r < args.out_rows; ++out_r) { |
927 | for (int out_c = 0; out_c < args.out_cols; ++out_c) { |
928 | // Populate 'input_buffer_data' with data from local input region. |
929 | functor::DepthwiseInputCopyOp<T>()( |
930 | args, padded_out_depth_size, out_r, out_c, |
931 | input + b * input_image_size, input_buffer_data); |
932 | // Compute depthwise backprop filter. |
933 | ComputeBackpropFilter(args, padded_out_depth_size, out_r, out_c, |
934 | out_backprop + b * output_image_size, |
935 | input_buffer_data, output_buffer); |
936 | } |
937 | } |
938 | } |
939 | }; |
940 | const int64_t shard_cost = args.out_rows * args.out_cols * args.out_depth; |
941 | auto worker_threads = *(ctx->device()->tensorflow_cpu_worker_threads()); |
942 | Shard(worker_threads.num_threads, worker_threads.workers, args.batch, |
943 | shard_cost, shard); |
944 | |
945 | // Accumulate 'output_buffer' from each shard into 'output'. |
946 | const int64_t out_depth = args.out_depth; |
947 | const int64_t vectorized_size = (out_depth / kPacketSize) * kPacketSize; |
948 | const int64_t scalar_size = out_depth - vectorized_size; |
949 | const int64_t padded_filter_size = |
950 | filter_spatial_size * padded_out_depth_size; |
951 | memset(filter_backprop, 0, filter_spatial_size * out_depth * sizeof(T)); |
952 | |
953 | for (int64_t i = 0; i < filter_spatial_size; ++i) { |
954 | const int64_t buffer_base = i * padded_out_depth_size; |
955 | const int64_t output_base = i * out_depth; |
956 | // Write vectorized length of filter's inner dimension to output. |
957 | for (int64_t j = 0; j < vectorized_size; j += kPacketSize) { |
958 | // Load data from 'filter_backprop' into vector register. |
959 | auto out_block_data = filter_backprop + output_base + j; |
960 | auto out_block = Eigen::internal::ploadu<Packet>(out_block_data); |
961 | for (int b = 0; b < args.batch; ++b) { |
962 | // Load data from 'output_buffer' for 'b'. |
963 | const auto* output_buffer = |
964 | output_buffer_data + b * padded_filter_size; |
965 | const auto v = |
966 | Eigen::internal::ploadu<Packet>(output_buffer + buffer_base + j); |
967 | // Add 'v' to 'out_block'. |
968 | out_block = Eigen::internal::padd<Packet>(out_block, v); |
969 | } |
970 | // Store 'out_block' back to memory. |
971 | Eigen::internal::pstoreu<T>(out_block_data, out_block); |
972 | } |
973 | // Write scalar length of filter's inner dimension to output. |
974 | for (int64_t j = 0; j < scalar_size; ++j) { |
975 | for (int b = 0; b < args.batch; ++b) { |
976 | const auto* output_buffer = |
977 | output_buffer_data + b * padded_filter_size; |
978 | filter_backprop[output_base + vectorized_size + j] += |
979 | output_buffer[buffer_base + vectorized_size + j]; |
980 | } |
981 | } |
982 | } |
983 | } |
984 | }; |
985 | |
986 | template <typename T> |
987 | static void DepthwiseConvBackpropFilterReference(const DepthwiseArgs& args, |
988 | const T* out_backprop, |
989 | const T* input, |
990 | T* filter_backprop) { |
991 | int num_filter_backprop = args.filter_rows * args.filter_cols * |
992 | args.in_depth * args.depth_multiplier; |
993 | memset(filter_backprop, 0, num_filter_backprop * sizeof(T)); |
994 | // Naive for loop as a reference point without concerns about performance. |
995 | for (int b = 0; b < args.batch; ++b) { |
996 | for (int out_r = 0; out_r < args.out_rows; ++out_r) { |
997 | for (int out_c = 0; out_c < args.out_cols; ++out_c) { |
998 | for (int out_d = 0; out_d < args.out_depth; ++out_d) { |
999 | const int in_d = out_d / args.depth_multiplier; |
1000 | const int dm = out_d % args.depth_multiplier; |
1001 | const int in_r_start = out_r * args.stride - args.pad_rows; |
1002 | const int in_c_start = out_c * args.stride - args.pad_cols; |
1003 | |
1004 | for (int f_r = 0; f_r < args.filter_rows; ++f_r) { |
1005 | for (int f_c = 0; f_c < args.filter_cols; ++f_c) { |
1006 | const int in_r = in_r_start + f_r; |
1007 | const int in_c = in_c_start + f_c; |
1008 | |
1009 | if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && |
1010 | in_c < args.in_cols) { |
1011 | int out_backprop_offset = |
1012 | out_d + |
1013 | args.out_depth * |
1014 | (out_c + args.out_cols * (out_r + args.out_rows * b)); |
1015 | int input_offset = |
1016 | in_d + |
1017 | args.in_depth * |
1018 | (in_c + args.in_cols * (in_r + args.in_rows * b)); |
1019 | int filter_backprop_offset = |
1020 | dm + |
1021 | args.depth_multiplier * |
1022 | (in_d + args.in_depth * (f_c + args.filter_cols * f_r)); |
1023 | filter_backprop[filter_backprop_offset] += |
1024 | input[input_offset] * out_backprop[out_backprop_offset]; |
1025 | } |
1026 | } |
1027 | } |
1028 | } |
1029 | } |
1030 | } |
1031 | } |
1032 | } |
1033 | |
1034 | // Extern template instantiated in conv_grad_filter_ops.cc. |
1035 | extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, bfloat16>; |
1036 | extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, Eigen::half>; |
1037 | extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, float>; |
1038 | extern template struct LaunchConv2DBackpropFilterOp<CPUDevice, double>; |
1039 | |
1040 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1041 | |
1042 | // Extern template instantiated in conv_grad_filter_ops.cc. |
1043 | extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, Eigen::half>; |
1044 | extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, float>; |
1045 | extern template struct LaunchConv2DBackpropFilterOp<GPUDevice, double>; |
1046 | |
1047 | // Extern template instantiated in depthwise_conv_op_gpu.cu.cc. |
1048 | extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, |
1049 | Eigen::half>; |
1050 | extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, float>; |
1051 | extern template struct LaunchDepthwiseConvBackpropFilterOp<GPUDevice, double>; |
1052 | |
1053 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1054 | |
1055 | // Kernel to compute the filter backprop for depthwise convolution. |
1056 | template <typename Device, class T> |
1057 | class DepthwiseConv2dNativeBackpropFilterOp : public OpKernel { |
1058 | public: |
1059 | explicit DepthwiseConv2dNativeBackpropFilterOp(OpKernelConstruction* context) |
1060 | : OpKernel(context) { |
1061 | OP_REQUIRES_OK(context, context->GetAttr("strides", &strides_)); |
1062 | OP_REQUIRES(context, strides_.size() == 4, |
1063 | errors::InvalidArgument("Sliding window strides field must " |
1064 | "specify 4 dimensions")); |
1065 | |
1066 | string data_format; |
1067 | OP_REQUIRES_OK(context, context->GetAttr("data_format", &data_format)); |
1068 | OP_REQUIRES(context, FormatFromString(data_format, &data_format_), |
1069 | errors::InvalidArgument("Invalid data format")); |
1070 | |
1071 | stride_ = GetTensorDim(strides_, data_format_, 'H'); |
1072 | const int64_t stride_w = GetTensorDim(strides_, data_format_, 'W'); |
1073 | const int64_t stride_n = GetTensorDim(strides_, data_format_, 'N'); |
1074 | const int64_t stride_c = GetTensorDim(strides_, data_format_, 'C'); |
1075 | |
1076 | OP_REQUIRES(context, stride_ == stride_w, |
1077 | errors::InvalidArgument( |
1078 | "Current implementation only supports equal length " |
1079 | "strides in the row and column dimensions.")); |
1080 | OP_REQUIRES( |
1081 | context, (stride_n == 1 && stride_c == 1), |
1082 | errors::InvalidArgument("Current implementation does not yet support " |
1083 | "strides in the batch and depth dimensions.")); |
1084 | OP_REQUIRES_OK(context, context->GetAttr("padding", &padding_)); |
1085 | OP_REQUIRES_OK(context, |
1086 | context->GetAttr("explicit_paddings", &explicit_paddings_)); |
1087 | OP_REQUIRES_OK(context, CheckValidPadding(padding_, explicit_paddings_, |
1088 | /*num_dims=*/4, data_format_)); |
1089 | |
1090 | cudnn_use_autotune_ = CudnnUseAutotune(); |
1091 | |
1092 | if (std::is_same<T, bfloat16>::value) { |
1093 | dtype_ = DT_BFLOAT16; |
1094 | } else if (std::is_same<T, Eigen::half>::value) { |
1095 | dtype_ = DT_HALF; |
1096 | } else if (std::is_same<T, float>::value) { |
1097 | dtype_ = DT_FLOAT; |
1098 | } else if (std::is_same<T, double>::value) { |
1099 | dtype_ = DT_DOUBLE; |
1100 | } else { |
1101 | LOG(ERROR) << "Only bfloat16, half, float, and double are supported."; |
1102 | } |
1103 | #if CUDNN_VERSION >= 7603 |
1104 | // Use CuDNN grouped conv (filter gradients) when input/output is |
1105 | // float16(half). See cudnn release note 7.6.3. (https://docs.nvidia.com/dee |
1106 | // plearning/sdk/cudnn-release-notes/rel_763.html#rel_763) |
1107 | // |
1108 | // Grouped convolution was added to cuDNN in version 7.0.1, but TensorFlow |
1109 | // intentionally enables op-determinism for it only with cuDNN versions |
1110 | // 7.6.3 and later, to avoid potential issues with earlier versions of |
1111 | // cuDNN. |
1112 | use_cudnn_grouped_conv_ = OpDeterminismRequired() || dtype_ == DT_HALF; |
1113 | #else |
1114 | use_cudnn_grouped_conv_ = false; |
1115 | #endif |
1116 | } |
1117 | |
1118 | void Compute(OpKernelContext* context) override { |
1119 | const Tensor& input = context->input(0); |
1120 | const Tensor& filter_sizes = context->input(1); |
1121 | OP_REQUIRES( |
1122 | context, TensorShapeUtils::IsVector(filter_sizes.shape()), |
1123 | errors::InvalidArgument( |
1124 | "Conv2DBackpropFilter: filter_sizes input must be 1-dim, not ", |
1125 | filter_sizes.dims())); |
1126 | TensorShape filter_shape; |
1127 | const int32* filter_sizes_data = filter_sizes.template flat<int32>().data(); |
1128 | for (int i = 0; i < filter_sizes.NumElements(); ++i) { |
1129 | OP_REQUIRES(context, filter_sizes_data[i] >= 0, |
1130 | errors::InvalidArgument("Dimension ", i, |
1131 | " of filter_sizes must be >= 0")); |
1132 | OP_REQUIRES_OK(context, |
1133 | filter_shape.AddDimWithStatus(filter_sizes_data[i])); |
1134 | } |
1135 | const TensorShape& input_shape = input.shape(); |
1136 | |
1137 | EXTRACT_AND_VERIFY_DIMENSIONS("DepthwiseConv2DBackpropFilter"); |
1138 | Tensor* filter_backprop = nullptr; |
1139 | OP_REQUIRES_OK(context, context->forward_input_or_allocate_output( |
1140 | {1}, 0, filter_shape, &filter_backprop)); |
1141 | |
1142 | // If there is nothing to compute, return. |
1143 | if (out_backprop.shape().num_elements() == 0) { |
1144 | return; |
1145 | } |
1146 | |
1147 | // If in_depth==1, this operation is just a standard convolution. |
1148 | // Depthwise convolution is a special case of cuDNN's grouped convolution. |
1149 | bool use_cudnn = std::is_same<Device, GPUDevice>::value && |
1150 | (in_depth == 1 || |
1151 | (use_cudnn_grouped_conv_ && |
1152 | (ShouldCudnnGroupedConvolutionBeUsed( |
1153 | filter_rows, filter_cols, in_depth, out_depth) || |
1154 | OpDeterminismRequired()))); |
1155 | |
1156 | VLOG(2) << "DepthwiseConv2dNativeBackpropFilter: " |
1157 | << " Input: [" << batch << ", " << input_rows << ", " << input_cols |
1158 | << ", " << in_depth << "]; Filter: [" << filter_rows << ", " |
1159 | << filter_cols << ", " << in_depth << ", " << depth_multiplier |
1160 | << "]; Output: [" << batch << ", " << out_rows << ", " << out_cols |
1161 | << ", " << out_depth << "], stride = " << stride_ |
1162 | << ", pad_rows = " << pad_top << ", pad_cols = " << pad_left |
1163 | << ", Use cuDNN: " << use_cudnn; |
1164 | |
1165 | if (use_cudnn) { |
1166 | // Reshape from TF depthwise filter to cuDNN grouped convolution filter: |
1167 | // |
1168 | // | TensorFlow | cuDNN |
1169 | // -------------------------------------------------------------------- |
1170 | // filter_out_depth | depth_multiplier | depth_multiplier * group_count |
1171 | // filter_in_depth | in_depth | in_depth / group_count |
1172 | // |
1173 | // For depthwise convolution, we have group_count == in_depth. |
1174 | int32_t filter_in_depth = 1; |
1175 | TensorShape shape = |
1176 | TensorShape{filter_rows, filter_cols, filter_in_depth, out_depth}; |
1177 | Tensor reshaped_filter(/*type=*/dtype_); |
1178 | OP_REQUIRES( |
1179 | context, reshaped_filter.CopyFrom(*filter_backprop, shape), |
1180 | errors::Internal( |
1181 | "Failed to reshape filter tensor for grouped convolution.")); |
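// Note: CopyFrom() shares the underlying buffer, so the launcher below
// writes the filter gradient directly into 'filter_backprop' through
// 'reshaped_filter'.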
1182 | |
1183 | // TODO(yangzihao): Send in arbitrary dilation rates after the dilated |
1184 | // conv is supported. |
1185 | launcher_(context, /*use_cudnn=*/true, cudnn_use_autotune_, out_backprop, |
1186 | input, |
1187 | /*row_dilation=*/1, /*col_dilation=*/1, stride_, stride_, |
1188 | padding_, explicit_paddings_, &reshaped_filter, data_format_); |
1189 | return; |
1190 | } |
1191 | |
1192 | // For GPU inputs with type half, we cast inputs to float and outputs back |
1193 | // to half, as half implementation is slow and does not use full precision |
1194 | // accumulation in some cases. |
1195 | constexpr bool cast_to_float = std::is_same<T, Eigen::half>::value && |
1196 | std::is_same<Device, GPUDevice>::value; |
1197 | using U = typename std::conditional<cast_to_float, float, T>::type; |
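// EX (illustrative): on GPU with T = Eigen::half, cast_to_float is true and
// U = float, so the kernel below accumulates in float and the result is
// cast back to half afterwards.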
1198 | Tensor casted_out_backprop = out_backprop; |
1199 | Tensor casted_input = input; |
1200 | Tensor casted_filter_backprop = *filter_backprop; |
1201 | const Device& device = context->template eigen_device<Device>(); |
1202 | if (cast_to_float) { |
1203 | functor::CastFunctor<Device, float, Eigen::half> cast; |
1204 | OP_REQUIRES_OK(context, |
1205 | context->allocate_temp(DT_FLOAT, out_backprop.shape(), |
1206 | &casted_out_backprop)); |
1207 | cast(device, casted_out_backprop.template flat<float>(), |
1208 | out_backprop.template flat<Eigen::half>()); |
1209 | OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, input.shape(), |
1210 | &casted_input)); |
1211 | cast(device, casted_input.template flat<float>(), |
1212 | input.template flat<Eigen::half>()); |
1213 | OP_REQUIRES_OK(context, |
1214 | context->allocate_temp(DT_FLOAT, filter_backprop->shape(), |
1215 | &casted_filter_backprop)); |
1216 | } |
1217 | |
1218 | auto out_backprop_ptr = casted_out_backprop.template flat<U>().data(); |
1219 | auto input_ptr = casted_input.template flat<U>().data(); |
1220 | auto filter_backprop_ptr = casted_filter_backprop.template flat<U>().data(); |
1221 | LaunchDepthwiseConvBackpropFilterOp<Device, U>()( |
1222 | context, args, out_backprop_ptr, input_ptr, filter_backprop_ptr, |
1223 | data_format_); |
1224 | |
1225 | if (cast_to_float) { |
1226 | functor::CastFunctor<Device, Eigen::half, float> cast; |
1227 | const Tensor& casted_filter_backprop_const = casted_filter_backprop; |
1228 | cast(device, filter_backprop->template flat<Eigen::half>(), |
1229 | casted_filter_backprop_const.template flat<float>()); |
1230 | } |
1231 | } |
1232 | |
1233 | protected: |
1234 | bool use_cudnn_grouped_conv_; |
1235 | |
1236 | private: |
1237 | std::vector<int32> strides_; |
1238 | Padding padding_; |
1239 | std::vector<int64_t> explicit_paddings_; |
1240 | TensorFormat data_format_; |
1241 | int64_t stride_; |
1242 | |
1243 | // For in_depth == 1 and grouped convolutions. |
1244 | LaunchConv2DBackpropFilterOp<Device, T> launcher_; |
1245 | bool cudnn_use_autotune_; |
1246 | DataType dtype_; |
1247 | |
1248 | TF_DISALLOW_COPY_AND_ASSIGN(DepthwiseConv2dNativeBackpropFilterOp); |
1249 | }; |
1250 | |
1251 | #define REGISTER_CPU_KERNEL(T) \ |
1252 | REGISTER_KERNEL_BUILDER( \ |
1253 | Name("DepthwiseConv2dNativeBackpropFilter") \ |
1254 | .Device(DEVICE_CPU) \ |
1255 | .TypeConstraint<T>("T"), \ |
1256 | DepthwiseConv2dNativeBackpropFilterOp<CPUDevice, T>); |
1257 | TF_CALL_bfloat16(REGISTER_CPU_KERNEL); |
1258 | TF_CALL_half(REGISTER_CPU_KERNEL); |
1259 | TF_CALL_float(REGISTER_CPU_KERNEL); |
1260 | #if !defined(PLATFORM_WINDOWS) || !defined(_DEBUG) |
1261 | TF_CALL_double(REGISTER_CPU_KERNEL); |
1262 | #endif |
1263 | #undef REGISTER_CPU_KERNEL |
1264 | |
1265 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1266 | #define REGISTER_GPU_KERNEL(T) \ |
1267 | REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \ |
1268 | .Device(DEVICE_GPU) \ |
1269 | .TypeConstraint<T>("T") \ |
1270 | .HostMemory("filter_sizes"), \ |
1271 | DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>) |
1272 | |
1273 | TF_CALL_half(REGISTER_GPU_KERNEL); |
1274 | TF_CALL_float(REGISTER_GPU_KERNEL); |
1275 | TF_CALL_double(REGISTER_GPU_KERNEL); |
1276 | #undef REGISTER_GPU_KERNEL |
1277 | |
1278 | #if CUDNN_VERSION >= 7000 |
1279 | template <typename T> |
1280 | class DepthwiseConv2dGroupedConvBackpropFilterOp |
1281 | : public DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T> { |
1282 | public: |
1283 | DepthwiseConv2dGroupedConvBackpropFilterOp(OpKernelConstruction* context) |
1284 | : DepthwiseConv2dNativeBackpropFilterOp<GPUDevice, T>(context) { |
1285 | this->use_cudnn_grouped_conv_ = true; |
1286 | } |
1287 | }; |
1288 | |
1289 | #define REGISTER_GROUPED_CONV_KERNEL(T) \ |
1290 | REGISTER_KERNEL_BUILDER(Name("DepthwiseConv2dNativeBackpropFilter") \ |
1291 | .Device(DEVICE_GPU) \ |
1292 | .TypeConstraint<T>("T") \ |
1293 | .HostMemory("filter_sizes") \ |
1294 | .Label("cudnn_grouped_convolution"), \ |
1295 | DepthwiseConv2dGroupedConvBackpropFilterOp<T>) |
1296 | |
1297 | TF_CALL_half(REGISTER_GROUPED_CONV_KERNEL); |
1298 | TF_CALL_float(REGISTER_GROUPED_CONV_KERNEL); |
1299 | TF_CALL_double(REGISTER_GROUPED_CONV_KERNEL); |
1300 | #undef REGISTER_GROUPED_CONV_KERNEL |
1301 | #endif // CUDNN_VERSION |
1302 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
1303 | |
1304 | } // namespace tensorflow |
1305 | |