depthwise_conv_op.h source code [tensorflow/tensorflow/core/kernels/depthwise_conv_op.h]

1	/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3	Licensed under the Apache License, Version 2.0 (the "License");
4	you may not use this file except in compliance with the License.
5	You may obtain a copy of the License at
6
7	http://www.apache.org/licenses/LICENSE-2.0
8
9	Unless required by applicable law or agreed to in writing, software
10	distributed under the License is distributed on an "AS IS" BASIS,
11	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12	See the License for the specific language governing permissions and
13	limitations under the License.
14	==============================================================================*/
15
16	#ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
17	#define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
18
19	#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
20	#include "tensorflow/core/framework/types.h"
21	#include "tensorflow/core/util/tensor_format.h"
22
23	namespace tensorflow {
24
// Bundle of integer dimensions describing one depthwise convolution. Every
// field starts out as zero; callers fill in the values they need.
struct DepthwiseArgs {
  // Input layer dimensions
  int batch = 0;
  int in_rows = 0;
  int in_cols = 0;
  int in_depth = 0;
  int filter_rows = 0;
  int filter_cols = 0;
  int depth_multiplier = 0;
  int stride = 0;
  int pad_rows = 0;  // Amount of padding to the top of the input
  int pad_cols = 0;  // Amount of padding to the left of the input

  // Output layer dimensions
  int out_rows = 0;
  int out_cols = 0;
  int out_depth = 0;

  // Kept user-provided (rather than defaulted) so the type's construction
  // rules stay the same; the member initializers above supply the zeros.
  DepthwiseArgs() {}
};
58
59	// Forward declaration.
60	class OpKernelContext;
61
62	template <typename Device, typename T>
63	struct LaunchDepthwiseConvOp {
64	void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
65	const T* input, const T* filter, T* output,
66	TensorFormat data_format);
67	};
68
69	template <typename Device, typename T>
70	struct LaunchDepthwiseConvBackpropInputOp {
71	void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
72	const T* out_backprop, const T* filter, T* in_backprop,
73	TensorFormat data_format);
74	};
75
76	template <typename Device, typename T>
77	struct LaunchDepthwiseConvBackpropFilterOp {
78	void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
79	const T* out_backprop, const T* input, T* filter_backprop,
80	TensorFormat data_format);
81	};
82
#if GOOGLE_CUDA || TENSORFLOW_USE_ROCM
// Partial specializations of the three launch functors for Eigen::GpuDevice.
// They are compiled only when GOOGLE_CUDA or TENSORFLOW_USE_ROCM is set, and
// keep the same call signatures as the primary templates.

template <typename T>
struct LaunchDepthwiseConvOp<Eigen::GpuDevice, T> {
  void operator()(OpKernelContext* ctx,
                  const DepthwiseArgs& args,
                  const T* input,
                  const T* filter,
                  T* output,
                  TensorFormat data_format);
};

template <typename T>
struct LaunchDepthwiseConvBackpropInputOp<Eigen::GpuDevice, T> {
  void operator()(class OpKernelContext* ctx,
                  const DepthwiseArgs& args,
                  const T* out_backprop,
                  const T* filter,
                  T* in_backprop,
                  TensorFormat data_format);
};

template <typename T>
struct LaunchDepthwiseConvBackpropFilterOp<Eigen::GpuDevice, T> {
  void operator()(class OpKernelContext* ctx,
                  const DepthwiseArgs& args,
                  const T* out_backprop,
                  const T* input,
                  T* filter_backprop,
                  TensorFormat data_format);
};
#endif  // GOOGLE_CUDA || TENSORFLOW_USE_ROCM
105
106	} // namespace tensorflow
107
108	namespace tensorflow {
109	namespace functor {
110
111	// Pads 'filter' to vector-register boundary along its inner dimension:
112	// filter_inner_dim_size = in_depth depth_multiplier*
113	// Requires 'filter' to have the following storage order:
114	// [filter_rows, filter_cols, in_depth, depth_multiplier]
115	// Returns zero-padded filter in 'padded_filter'.
116	//
117	// EX:
118	// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
119	// So we have a total of 3 2 = 6 filters, each of spatial size 2 x 2.*
120	//
121	// filter [rows, cols, in_depth, depth_multiplier]
122	// [u0, v0, w0, x0] [y0, z0, u1, v1] [w1, x1, y1, z1]
123	// [u2, v2, w2, x2] [y2, z2, u3, v3] [w3, x3, y3, z3]
124	//
125	// padded_filter [rows, cols, in_depth, depth_multiplier]
126	// [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0]
127	// [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0]
128
129	template <typename T>
130	struct DepthwiseFilterPadOp {
131	void operator()(const DepthwiseArgs& args, const T* filter,
132	T* padded_filter) {
133	typedef typename Eigen::internal::packet_traits<T>::type Packet;
134	static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T));
135
136	// Calculate vectorized and scalar lengths of filter's inner dimension.
137	const int64_t filter_inner_dim_size = args.out_depth;
138	const int64_t vectorized_size =
139	(filter_inner_dim_size / kPacketSize) * kPacketSize;
140	const int64_t scalar_size = filter_inner_dim_size - vectorized_size;
141	// Calculate required padding and padded output buffer stride.
142	const int64_t pad_size = scalar_size > `0` ? kPacketSize - scalar_size : `0`;
143	const int64_t padded_filter_stride = vectorized_size + kPacketSize;
144
145	const int64_t filter_spatial_size = args.filter_rows * args.filter_cols;
146	for (int64_t i = `0`; i < filter_spatial_size; ++i) {
147	const int64_t input_base = i * filter_inner_dim_size;
148	const int64_t output_base = i * padded_filter_stride;
149	// Write vectorized length of filter's inner dimension to output.
150	for (int64_t j = `0`; j < vectorized_size; j += kPacketSize) {
151	const auto v = Eigen::internal::ploadu<Packet>(filter + input_base + j);
152	Eigen::internal::pstoreu<T>(padded_filter + output_base + j, v);
153	}
154	// Write scalar length of filter's inner dimension to output.
155	for (int64_t j = `0`; j < scalar_size; ++j) {
156	padded_filter[output_base + vectorized_size + j] =
157	filter[input_base + vectorized_size + j];
158	}
159	// Pad the remainder of output to vector-register boundary.
160	for (int64_t j = `0`; j < pad_size; ++j) {
161	padded_filter[output_base + vectorized_size + scalar_size + j] =
162	static_cast<T>(`0`);
163	}
164	}
165	}
166	};
167
168	// Copies data from local region in 'input' specified by 'out_r' and 'out_'c'
169	// to 'input_buffer'. The copied data is replicated by factor
170	// 'args.depth_multiplier', and padded to vector register-width boundaries so
171	// that it is aligned for efficient traversal and vector multiply-add by the
172	// depthwise kernel.
173	//
174	// EX:
175	// in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4
176	//
177	// input: [batch, in_rows, in_cols, in_depth]
178	//
179	// [a0, a1, a2, b0, b1, b2, ..., e0, e1, e2, f0, f1, f2, ...]
180	//
181	// input_buffer (register boundaries shown):
182	// [a0, a0, a1, a1] [a2, a2, 0, 0] in_row = 0, in_col = 0
183	// [b0, b0, b1, b1] [b2, b2, 0, 0] in_row = 0, in_col = 1
184	// [e0, e0, e1, e1] [e2, e2, 0, 0] in_row = 1, in_col = 0
185	// [f0, f0, f1, f1] [f2, f2, 0, 0] in_row = 1, in_col = 1
186	//
187	// Returns replicated and padded data from specified input region in
188	// 'input_buffer'.
189
190	template <typename T>
191	struct DepthwiseInputCopyOp {
192	void operator()(const DepthwiseArgs& args,
193	const int64_t padded_filter_inner_dim_size,
194	const int64_t out_r, const int64_t out_c, const T* input,
195	T* input_buffer) {
196	typedef typename Eigen::internal::packet_traits<T>::type Packet;
197	static const int64_t kPacketSize = Eigen::internal::packet_traits<T>::size;
198
199	const int64_t kDepth = args.depth_multiplier;
200	// Calculate vectorized and scalar (residual) lengths for 'in_depth'.
201	const int64_t input_vectorized_size =
202	(args.in_depth / kPacketSize) * kPacketSize;
203	const int64_t input_scalar_size = args.in_depth - input_vectorized_size;
204
205	// Calculate output padding length.
206	const int64_t output_scalar_size = args.out_depth % kPacketSize;
207	const int64_t output_pad_size =
208	output_scalar_size > `0` ? kPacketSize - output_scalar_size : `0`;
209
210	// Iterate through all rows x cols reading 'in_depth' from 'input' and
211	// replicating by 'depth_multiplier' into 'input_buffer' (otherwise
212	// zero-padding input buffer as needed).
213	auto* in_buf = input_buffer;
214	const int64_t in_r_start = out_r * args.stride - args.pad_rows;
215	const int64_t in_c_start = out_c * args.stride - args.pad_cols;
216
217	// TODO: add a ploaddup variant for depth == 2 if needed.
218	if (kDepth > `1` && kDepth <= kPacketSize) {
219	for (int64_t f_r = `0`; f_r < args.filter_rows; ++f_r) {
220	const int64_t in_r = in_r_start + f_r;
221
222	for (int64_t f_c = `0`; f_c < args.filter_cols; ++f_c) {
223	const int64_t in_c = in_c_start + f_c;
224
225	if (in_r >= `0` && in_r < args.in_rows && in_c >= `0` &&
226	in_c < args.in_cols) {
227	const auto* in =
228	input + (in_r * args.in_cols + in_c) * args.in_depth;
229	int64_t limit = args.in_depth;
230	// This will overwrite up to kPacketSize next elements,
231	// this is ok on all iterations except the last one, since
232	// we will write correct values on a next iteration.
233	if (f_c == args.filter_cols - `1`) {
234	limit -= (kPacketSize - kDepth) / kDepth + `1`;
235	if (limit < `0`) {
236	limit = `0`;
237	}
238	}
239	// Copy vectorized portion of inner dimension.
240	for (int64_t d = `0`; d < limit; d++) {
241	const auto p = Eigen::internal::pset1<Packet>(in[d]);
242	Eigen::internal::pstoreu<T>(in_buf, p);
243	in_buf += kDepth;
244	}
245
246	// Copy the scalar portion.
247	for (int64_t d = limit; d < args.in_depth; d++) {
248	const auto value = in[d];
249	for (int64_t dm = `0`; dm < kDepth; dm++) {
250	in_buf[dm] = value;
251	}
252	in_buf += kDepth;
253	}
254
255	// Pad the remainder of the output to vector register boundary.
256	for (int64_t d = `0`; d < output_pad_size; ++d) {
257	in_buf[d] = static_cast<T>(`0`);
258	}
259	in_buf += output_pad_size;
260	} else {
261	// Zero pad.
262	memset(in_buf, `0`, sizeof(T) * padded_filter_inner_dim_size);
263	in_buf += padded_filter_inner_dim_size;
264	}
265	}
266	}
267	} else if (kDepth > kPacketSize) {
268	// Calculate vectorized and scalar (residual) lengths for
269	// 'depth_multiplier'. This is used to efficiently replicate data for
270	// when 'depth_multiplier' > kPacketSize.
271	const int64_t dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize;
272
273	for (int64_t f_r = `0`; f_r < args.filter_rows; ++f_r) {
274	const int64_t in_r = in_r_start + f_r;
275
276	for (int64_t f_c = `0`; f_c < args.filter_cols; ++f_c) {
277	const int64_t in_c = in_c_start + f_c;
278
279	if (in_r >= `0` && in_r < args.in_rows && in_c >= `0` &&
280	in_c < args.in_cols) {
281	const auto* in =
282	input + (in_r * args.in_cols + in_c) * args.in_depth;
283	// Copy vectorized portion of inner dimension.
284	for (int64_t d = `0`; d < args.in_depth; d++) {
285	const auto p = Eigen::internal::pset1<Packet>(in[d]);
286	for (int64_t dm = `0`; dm < dm_vectorized_size; dm += kPacketSize) {
287	Eigen::internal::pstoreu<T>(in_buf + dm, p);
288	}
289	// Overlapping store for the remainder.
290	Eigen::internal::pstoreu<T>(in_buf + kDepth - kPacketSize, p);
291	in_buf += kDepth;
292	}
293	// Pad the remainder of the output to vector register boundary.
294	for (int64_t d = `0`; d < output_pad_size; ++d) {
295	in_buf[d] = static_cast<T>(`0`);
296	}
297	in_buf += output_pad_size;
298	} else {
299	// Zero pad.
300	memset(in_buf, `0`, sizeof(T) * padded_filter_inner_dim_size);
301	in_buf += padded_filter_inner_dim_size;
302	}
303	}
304	}
305	} else if (kDepth == `1`) {
306	for (int64_t f_r = `0`; f_r < args.filter_rows; ++f_r) {
307	const int64_t in_r = in_r_start + f_r;
308
309	for (int64_t f_c = `0`; f_c < args.filter_cols; ++f_c) {
310	const int64_t in_c = in_c_start + f_c;
311
312	if (in_r >= `0` && in_r < args.in_rows && in_c >= `0` &&
313	in_c < args.in_cols) {
314	const auto* in =
315	input + (in_r * args.in_cols + in_c) * args.in_depth;
316	for (int64_t d = `0`; d < input_vectorized_size; d += kPacketSize) {
317	const auto p = Eigen::internal::ploadu<Packet>(in + d);
318	Eigen::internal::pstoreu<T>(in_buf, p);
319	in_buf += kPacketSize;
320	}
321	for (int64_t d = `0`; d < input_scalar_size; ++d) {
322	T v = in[input_vectorized_size + d];
323	in_buf[d] = v;
324	}
325	in_buf += input_scalar_size;
326
327	// Pad the remainder of the output to vector register boundary.
328	for (int64_t d = `0`; d < output_pad_size; ++d) {
329	in_buf[d] = static_cast<T>(`0`);
330	}
331	in_buf += output_pad_size;
332	} else {
333	// Zero pad.
334	memset(in_buf, `0`, sizeof(T) * padded_filter_inner_dim_size);
335	in_buf += padded_filter_inner_dim_size;
336	}
337	}
338	}
339	}
340	}
341	};
342
343	} // namespace functor
344	} // namespace tensorflow
345
346	#endif // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_
347

Browse the source code of tensorflow/tensorflow/core/kernels/depthwise_conv_op.h