1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_ |
17 | #define TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_ |
18 | |
19 | #include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" |
20 | #include "tensorflow/core/framework/types.h" |
21 | #include "tensorflow/core/util/tensor_format.h" |
22 | |
23 | namespace tensorflow { |
24 | |
25 | struct DepthwiseArgs { |
26 | // Input layer dimensions |
27 | int batch; |
28 | int in_rows; |
29 | int in_cols; |
30 | int in_depth; |
31 | int filter_rows; |
32 | int filter_cols; |
33 | int depth_multiplier; |
34 | int stride; |
35 | int pad_rows; // Amount of padding to the top of the input |
36 | int pad_cols; // Amount of padding to the left of the input |
37 | |
38 | // Output layer dimensions |
39 | int out_rows; |
40 | int out_cols; |
41 | int out_depth; |
42 | |
43 | DepthwiseArgs() |
44 | : batch(0), |
45 | in_rows(0), |
46 | in_cols(0), |
47 | in_depth(0), |
48 | filter_rows(0), |
49 | filter_cols(0), |
50 | depth_multiplier(0), |
51 | stride(0), |
52 | pad_rows(0), |
53 | pad_cols(0), |
54 | out_rows(0), |
55 | out_cols(0), |
56 | out_depth(0) {} |
57 | }; |
58 | |
59 | // Forward declaration. |
60 | class OpKernelContext; |
61 | |
62 | template <typename Device, typename T> |
63 | struct LaunchDepthwiseConvOp { |
64 | void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, |
65 | const T* input, const T* filter, T* output, |
66 | TensorFormat data_format); |
67 | }; |
68 | |
69 | template <typename Device, typename T> |
70 | struct LaunchDepthwiseConvBackpropInputOp { |
71 | void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, |
72 | const T* out_backprop, const T* filter, T* in_backprop, |
73 | TensorFormat data_format); |
74 | }; |
75 | |
76 | template <typename Device, typename T> |
77 | struct LaunchDepthwiseConvBackpropFilterOp { |
78 | void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, |
79 | const T* out_backprop, const T* input, T* filter_backprop, |
80 | TensorFormat data_format); |
81 | }; |
82 | |
83 | #if GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
84 | template <typename T> |
85 | struct LaunchDepthwiseConvOp<Eigen::GpuDevice, T> { |
86 | void operator()(OpKernelContext* ctx, const DepthwiseArgs& args, |
87 | const T* input, const T* filter, T* output, |
88 | TensorFormat data_format); |
89 | }; |
90 | |
91 | template <typename T> |
92 | struct LaunchDepthwiseConvBackpropInputOp<Eigen::GpuDevice, T> { |
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
94 | const T* out_backprop, const T* filter, T* in_backprop, |
95 | TensorFormat data_format); |
96 | }; |
97 | |
98 | template <typename T> |
99 | struct LaunchDepthwiseConvBackpropFilterOp<Eigen::GpuDevice, T> { |
  void operator()(OpKernelContext* ctx, const DepthwiseArgs& args,
101 | const T* out_backprop, const T* input, T* filter_backprop, |
102 | TensorFormat data_format); |
103 | }; |
104 | #endif // GOOGLE_CUDA || TENSORFLOW_USE_ROCM |
105 | |
namespace functor {
110 | |
111 | // Pads 'filter' to vector-register boundary along its inner dimension: |
112 | // filter_inner_dim_size = in_depth * depth_multiplier |
113 | // Requires 'filter' to have the following storage order: |
114 | // [filter_rows, filter_cols, in_depth, depth_multiplier] |
115 | // Returns zero-padded filter in 'padded_filter'. |
116 | // |
117 | // EX: |
118 | // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 |
119 | // So we have a total of 3 * 2 = 6 filters, each of spatial size 2 x 2. |
120 | // |
121 | // filter [rows, cols, in_depth, depth_multiplier] |
122 | // [u0, v0, w0, x0] [y0, z0, u1, v1] [w1, x1, y1, z1] |
123 | // [u2, v2, w2, x2] [y2, z2, u3, v3] [w3, x3, y3, z3] |
124 | // |
125 | // padded_filter [rows, cols, in_depth, depth_multiplier] |
126 | // [u0, v0, w0, x0] [y0, z0, 0, 0] [u1, v1, w1, x1] [y1, z1, 0, 0] |
127 | // [u2, v2, w2, x2] [y2, z2, 0, 0] [u3, v3, w3, x3] [y3, z3, 0, 0] |
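//
// Each spatial position in the example above thus occupies
//   padded_filter_stride = vectorized_size + register_width = 4 + 4 = 8
// elements of 'padded_filter', versus 6 in the unpadded filter.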
128 | |
129 | template <typename T> |
130 | struct DepthwiseFilterPadOp { |
131 | void operator()(const DepthwiseArgs& args, const T* filter, |
132 | T* padded_filter) { |
133 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
134 | static const int64_t kPacketSize = (sizeof(Packet) / sizeof(T)); |
135 | |
136 | // Calculate vectorized and scalar lengths of filter's inner dimension. |
137 | const int64_t filter_inner_dim_size = args.out_depth; |
138 | const int64_t vectorized_size = |
139 | (filter_inner_dim_size / kPacketSize) * kPacketSize; |
140 | const int64_t scalar_size = filter_inner_dim_size - vectorized_size; |
141 | // Calculate required padding and padded output buffer stride. |
142 | const int64_t pad_size = scalar_size > 0 ? kPacketSize - scalar_size : 0; |
143 | const int64_t padded_filter_stride = vectorized_size + kPacketSize; |
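    // Note that the stride always reserves one full packet beyond
    // 'vectorized_size', even when 'filter_inner_dim_size' is a multiple
    // of 'kPacketSize' (in which case 'pad_size' is 0 and the trailing
    // packet is left unwritten).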
144 | |
145 | const int64_t filter_spatial_size = args.filter_rows * args.filter_cols; |
146 | for (int64_t i = 0; i < filter_spatial_size; ++i) { |
147 | const int64_t input_base = i * filter_inner_dim_size; |
148 | const int64_t output_base = i * padded_filter_stride; |
149 | // Write vectorized length of filter's inner dimension to output. |
150 | for (int64_t j = 0; j < vectorized_size; j += kPacketSize) { |
151 | const auto v = Eigen::internal::ploadu<Packet>(filter + input_base + j); |
152 | Eigen::internal::pstoreu<T>(padded_filter + output_base + j, v); |
153 | } |
154 | // Write scalar length of filter's inner dimension to output. |
155 | for (int64_t j = 0; j < scalar_size; ++j) { |
156 | padded_filter[output_base + vectorized_size + j] = |
157 | filter[input_base + vectorized_size + j]; |
158 | } |
159 | // Pad the remainder of output to vector-register boundary. |
160 | for (int64_t j = 0; j < pad_size; ++j) { |
161 | padded_filter[output_base + vectorized_size + scalar_size + j] = |
162 | static_cast<T>(0); |
163 | } |
164 | } |
165 | } |
166 | }; |
167 | |
// Copies data from the local region in 'input' specified by 'out_r' and
// 'out_c'
169 | // to 'input_buffer'. The copied data is replicated by factor |
170 | // 'args.depth_multiplier', and padded to vector register-width boundaries so |
171 | // that it is aligned for efficient traversal and vector multiply-add by the |
172 | // depthwise kernel. |
173 | // |
174 | // EX: |
175 | // in_depth = 3, depth_multiplier = 2, filter [2, 2], register_width = 4 |
176 | // |
177 | // input: [batch, in_rows, in_cols, in_depth] |
178 | // |
179 | // [a0, a1, a2, b0, b1, b2, ..., e0, e1, e2, f0, f1, f2, ...] |
180 | // |
181 | // input_buffer (register boundaries shown): |
182 | // [a0, a0, a1, a1] [a2, a2, 0, 0] in_row = 0, in_col = 0 |
183 | // [b0, b0, b1, b1] [b2, b2, 0, 0] in_row = 0, in_col = 1 |
184 | // [e0, e0, e1, e1] [e2, e2, 0, 0] in_row = 1, in_col = 0 |
185 | // [f0, f0, f1, f1] [f2, f2, 0, 0] in_row = 1, in_col = 1 |
186 | // |
187 | // Returns replicated and padded data from specified input region in |
188 | // 'input_buffer'. |
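//
// Each (in_row, in_col) position in the example above occupies
// out_depth + pad = 6 + 2 = 8 elements of 'input_buffer' (this is the
// 'padded_filter_inner_dim_size' argument below), so a 2 x 2 filter needs
// an 'input_buffer' of 4 * 8 = 32 elements.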
189 | |
190 | template <typename T> |
191 | struct DepthwiseInputCopyOp { |
192 | void operator()(const DepthwiseArgs& args, |
193 | const int64_t padded_filter_inner_dim_size, |
194 | const int64_t out_r, const int64_t out_c, const T* input, |
195 | T* input_buffer) { |
196 | typedef typename Eigen::internal::packet_traits<T>::type Packet; |
197 | static const int64_t kPacketSize = Eigen::internal::packet_traits<T>::size; |
198 | |
199 | const int64_t kDepth = args.depth_multiplier; |
200 | // Calculate vectorized and scalar (residual) lengths for 'in_depth'. |
201 | const int64_t input_vectorized_size = |
202 | (args.in_depth / kPacketSize) * kPacketSize; |
203 | const int64_t input_scalar_size = args.in_depth - input_vectorized_size; |
204 | |
205 | // Calculate output padding length. |
206 | const int64_t output_scalar_size = args.out_depth % kPacketSize; |
207 | const int64_t output_pad_size = |
208 | output_scalar_size > 0 ? kPacketSize - output_scalar_size : 0; |
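    // EX: with the running example (out_depth = 6, kPacketSize = 4),
    // output_scalar_size = 2 and output_pad_size = 2, giving 8 buffer
    // elements per input position.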
209 | |
    // Iterate over the 'filter_rows' x 'filter_cols' window, reading
    // 'in_depth' values from 'input' and replicating each value
    // 'depth_multiplier' times into 'input_buffer'; positions that fall
    // outside the input are zero-filled.
213 | auto* in_buf = input_buffer; |
214 | const int64_t in_r_start = out_r * args.stride - args.pad_rows; |
215 | const int64_t in_c_start = out_c * args.stride - args.pad_cols; |
216 | |
217 | // TODO: add a ploaddup variant for depth == 2 if needed. |
218 | if (kDepth > 1 && kDepth <= kPacketSize) { |
219 | for (int64_t f_r = 0; f_r < args.filter_rows; ++f_r) { |
220 | const int64_t in_r = in_r_start + f_r; |
221 | |
222 | for (int64_t f_c = 0; f_c < args.filter_cols; ++f_c) { |
223 | const int64_t in_c = in_c_start + f_c; |
224 | |
225 | if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && |
226 | in_c < args.in_cols) { |
227 | const auto* in = |
228 | input + (in_r * args.in_cols + in_c) * args.in_depth; |
229 | int64_t limit = args.in_depth; |
            // Each packet store in the loop below writes 'kPacketSize'
            // elements but advances 'in_buf' by only 'kDepth', so
            // consecutive stores overlap and the overhang is overwritten
            // with correct values on later iterations. On the last filter
            // column there may be no later write to fix up the overhang,
            // which could then spill past the end of 'input_buffer', so
            // shrink 'limit' and let the scalar loop below copy the
            // trailing elements.
233 | if (f_c == args.filter_cols - 1) { |
234 | limit -= (kPacketSize - kDepth) / kDepth + 1; |
235 | if (limit < 0) { |
236 | limit = 0; |
237 | } |
238 | } |
239 | // Copy vectorized portion of inner dimension. |
240 | for (int64_t d = 0; d < limit; d++) { |
241 | const auto p = Eigen::internal::pset1<Packet>(in[d]); |
242 | Eigen::internal::pstoreu<T>(in_buf, p); |
243 | in_buf += kDepth; |
244 | } |
245 | |
246 | // Copy the scalar portion. |
247 | for (int64_t d = limit; d < args.in_depth; d++) { |
248 | const auto value = in[d]; |
249 | for (int64_t dm = 0; dm < kDepth; dm++) { |
250 | in_buf[dm] = value; |
251 | } |
252 | in_buf += kDepth; |
253 | } |
254 | |
255 | // Pad the remainder of the output to vector register boundary. |
256 | for (int64_t d = 0; d < output_pad_size; ++d) { |
257 | in_buf[d] = static_cast<T>(0); |
258 | } |
259 | in_buf += output_pad_size; |
260 | } else { |
261 | // Zero pad. |
262 | memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size); |
263 | in_buf += padded_filter_inner_dim_size; |
264 | } |
265 | } |
266 | } |
267 | } else if (kDepth > kPacketSize) { |
268 | // Calculate vectorized and scalar (residual) lengths for |
269 | // 'depth_multiplier'. This is used to efficiently replicate data for |
270 | // when 'depth_multiplier' > kPacketSize. |
271 | const int64_t dm_vectorized_size = (kDepth / kPacketSize) * kPacketSize; |
272 | |
273 | for (int64_t f_r = 0; f_r < args.filter_rows; ++f_r) { |
274 | const int64_t in_r = in_r_start + f_r; |
275 | |
276 | for (int64_t f_c = 0; f_c < args.filter_cols; ++f_c) { |
277 | const int64_t in_c = in_c_start + f_c; |
278 | |
279 | if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && |
280 | in_c < args.in_cols) { |
281 | const auto* in = |
282 | input + (in_r * args.in_cols + in_c) * args.in_depth; |
            // Broadcast each input element into 'kDepth' consecutive output
            // elements: full packet stores for the vectorized length of
            // 'depth_multiplier', then one overlapping store for the rest.
284 | for (int64_t d = 0; d < args.in_depth; d++) { |
285 | const auto p = Eigen::internal::pset1<Packet>(in[d]); |
286 | for (int64_t dm = 0; dm < dm_vectorized_size; dm += kPacketSize) { |
287 | Eigen::internal::pstoreu<T>(in_buf + dm, p); |
288 | } |
              // Overlapping store for the remainder: write a final packet
              // ending exactly at 'in_buf + kDepth'. EX: for kDepth = 6,
              // kPacketSize = 4, the store at offset 2 covers elements
              // [2, 6): elements 2-3 are rewritten and 4-5 are filled.
290 | Eigen::internal::pstoreu<T>(in_buf + kDepth - kPacketSize, p); |
291 | in_buf += kDepth; |
292 | } |
293 | // Pad the remainder of the output to vector register boundary. |
294 | for (int64_t d = 0; d < output_pad_size; ++d) { |
295 | in_buf[d] = static_cast<T>(0); |
296 | } |
297 | in_buf += output_pad_size; |
298 | } else { |
299 | // Zero pad. |
300 | memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size); |
301 | in_buf += padded_filter_inner_dim_size; |
302 | } |
303 | } |
304 | } |
305 | } else if (kDepth == 1) { |
306 | for (int64_t f_r = 0; f_r < args.filter_rows; ++f_r) { |
307 | const int64_t in_r = in_r_start + f_r; |
308 | |
309 | for (int64_t f_c = 0; f_c < args.filter_cols; ++f_c) { |
310 | const int64_t in_c = in_c_start + f_c; |
311 | |
312 | if (in_r >= 0 && in_r < args.in_rows && in_c >= 0 && |
313 | in_c < args.in_cols) { |
314 | const auto* in = |
315 | input + (in_r * args.in_cols + in_c) * args.in_depth; |
316 | for (int64_t d = 0; d < input_vectorized_size; d += kPacketSize) { |
317 | const auto p = Eigen::internal::ploadu<Packet>(in + d); |
318 | Eigen::internal::pstoreu<T>(in_buf, p); |
319 | in_buf += kPacketSize; |
320 | } |
321 | for (int64_t d = 0; d < input_scalar_size; ++d) { |
322 | T v = in[input_vectorized_size + d]; |
323 | in_buf[d] = v; |
324 | } |
325 | in_buf += input_scalar_size; |
326 | |
327 | // Pad the remainder of the output to vector register boundary. |
328 | for (int64_t d = 0; d < output_pad_size; ++d) { |
329 | in_buf[d] = static_cast<T>(0); |
330 | } |
331 | in_buf += output_pad_size; |
332 | } else { |
333 | // Zero pad. |
334 | memset(in_buf, 0, sizeof(T) * padded_filter_inner_dim_size); |
335 | in_buf += padded_filter_inner_dim_size; |
336 | } |
337 | } |
338 | } |
339 | } |
340 | } |
341 | }; |
342 | |
343 | } // namespace functor |
344 | } // namespace tensorflow |
345 | |
346 | #endif // TENSORFLOW_CORE_KERNELS_DEPTHWISE_CONV_OP_H_ |
347 | |