/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Note: the following header is used in both TF and TFLite. In particular,
// it is used for the float TFLite Conv2D kernel.
#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"

namespace Eigen {
namespace internal {

// After vectorizing all loads from the underlying tensor with Packet ops, we
// still have to finalize the trailing coefficients that do not fit into a
// full packet.
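//
// A small worked example (illustrative numbers): with packet_size = 4,
// depth = 9 and max_depth = 12 there are num_coeffs = 3 trailing
// coefficients. The generic implementation below writes them one by one via
// coeffNoPadding and returns 3; the masked specialization further down
// writes all three at once with a single masked store.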
template <typename Scalar, typename DataMapper, int packet_size,
          bool masked_load_store>
struct FinalizeDataMapperCoeffs {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    const Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);

    for (; depth < max_depth; ++depth) {
      *block = pad ? Scalar(0) : rhs.coeffNoPadding(depth, base_idx);
      ++block;
    }

    return num_coeffs;
  }
};

template <typename Scalar, typename DataMapper, int packet_size>
struct FinalizeDataMapperCoeffs<Scalar, DataMapper, packet_size,
                                /*masked_load_store=*/true> {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);
    if (num_coeffs == 0) return 0;

    using Packet = typename packet_traits<Scalar>::type;
    Packet p = pad ? pset1<Packet>(Scalar(0))
                   : rhs.partialPacketNoPadding(depth, base_idx, num_coeffs);
    internal::pstoreu(block, p, mask<Packet>(0, num_coeffs));

    return num_coeffs;
  }
};
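
// Continuing the example above (num_coeffs = 3, packet_size = 4): the masked
// specialization loads a partial packet of three coefficients (or uses a zero
// packet when padding) and stores it with mask<Packet>(0, 3), which writes
// only the first three lanes and leaves the memory past the block untouched.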

// Pack a block of the right input matrix (in our case it is always a
// "virtual matrix" constructed from extracted image patches) into a
// contiguous block in column-major storage order. Knowing the properties of
// the original patch op, we can do this more efficiently than the default
// gemm_pack_colmajor_block.
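//
// As a rough sketch of the shapes involved (assuming a single NHWC input
// image and a square P x P kernel): the "virtual matrix" produced by
// TensorImagePatchOp + reshape has
//
//   rows = patch_depth * patch_rows * patch_cols  (= channels * P * P)
//   cols = number of extracted patches            (= out_height * out_width)
//
// so each column holds one flattened image patch, and this specialization
// copies a [rows x cols] sub-block of it into `block` column by column.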
template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
          typename Device, typename Scalar, typename StorageIndex,
          typename nocontract_t, typename contract_t, int packet_size,
          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
struct gemm_pack_colmajor_block<
    Scalar, StorageIndex,
    TensorContractionSubMapper<
        Scalar, StorageIndex, Rhs,
        TensorEvaluator<
            const TensorReshapingOp<
                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
            Device>,
        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
        inner_dim_reordered, Alignment>,
    ColMajor> {
  typedef TensorContractionSubMapper<
      Scalar, StorageIndex, Rhs,
      TensorEvaluator<
          const TensorReshapingOp<
              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
          Device>,
      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
      inner_dim_reordered, Alignment>
      SubMapper;

  typedef SubMapper DataMapper;
  typedef typename packet_traits<Scalar>::type Packet;

  using CoeffFinalizer = FinalizeDataMapperCoeffs<
      Scalar, DataMapper, packet_size,
      TensorEvaluatorHasPartialPacket<typename DataMapper::TensorEvaluatorT,
                                      Packet, Index>::value &&
          unpacket_traits<Packet>::masked_store_available>;
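  // The masked CoeffFinalizer above is selected only when the tensor
  // evaluator can produce partial packets and the packet type supports
  // masked stores; otherwise the scalar fallback is used.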

  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows,
                  StorageIndex cols) {
    const bool standard_patches = !rhs.nonStandardPatches();

    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
      // A single packet always belongs to a single patch (row, col).
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (standard_patches) {
      // A single packet can span multiple patch rows or columns.
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (rhs.patchDepth() % packet_size == 0) {
      // A single packet always belongs to a single patch (row, col).
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             true>(block, rhs, rows, cols);

    } else {
      // A single packet can span multiple patch rows or columns.
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             false>(block, rhs, rows, cols);
    }
  }

 private:
  // (A) Standard image patches:
  //
  //  (1) patch_row_inflate_strides == 1    AND
  //  (2) patch_col_inflate_strides == 1
  //
  // Standard patches guarantee that the two innermost dimensions (depth and
  // rows) are contiguous in memory, so we can try to squeeze reads from them.
  //
  // (B) Non-standard image patches: the in_row/in_col and patch_row/patch_col
  // strides can differ from 1, and for each [row, col] inside a patch we have
  // to do additional computations to find the corresponding row and col in the
  // input tensor. We also can no longer squeeze reads from the inner
  // dimensions.
  //
  // Additional parameters:
  // - patch_depth_is_multiple_of_packet_size=true: the depth dimension size is
  //   guaranteed to be a multiple of the packet size, so we can skip all
  //   non-vectorized loads and checks, because it is guaranteed that the block
  //   size is a multiple of the packet size (see TensorContractionBlocking).
  //
  // - has_padding: the input tensor has non-zero padding. In this case, for
  //   each patch col and row we need to check that it does not correspond to
  //   the padded region of the original input.
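  //
  // For example (illustrative values): with float AVX packets
  // (packet_size = 8), a patch depth of 32 selects the
  // patch_depth_is_multiple_of_packet_size=true path, where every depth run is
  // handled purely with packet loads, while an RGB input (patch depth 3) must
  // take the mixed packet + scalar path.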
  template <bool patch_depth_is_multiple_of_packet_size, bool has_padding>
  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* __restrict block,
                                               const DataMapper& rhs,
                                               StorageIndex rows,
                                               StorageIndex cols) {
    eigen_assert(!rhs.nonStandardPatches());

    // `peeled_k` is the number of rows that we can fill with vectorized packet
    // loads (the name matches the one used in the gemm_pack_rhs
    // specializations defined above).
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;
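    // E.g. with rows = 35 and packet_size = 8, peeled_k = 32: the first 32
    // elements of each column are filled with packet (or squeezed) loads, and
    // the remaining 3 by the scalar tail loop at the end of this function.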

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_col = has_padding && lm.padCol(c);

        eigen_assert(has_padding || !lm.padCol(c));
        eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));

        // We can squeeze reads from all rows in the [start_row, max_row)
        // range.
        if (!has_padding ||
            (!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
          const StorageIndex start_depth =
              (c == start_col) ? rhs_depth_offset : 0;

          const StorageIndex max_depth =
              std::min<StorageIndex>(start_depth + (peeled_k - k),
                                     (max_row - start_row) * rhs.patchDepth());

          const StorageIndex base_idx = lm.baseIndex(start_row, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If the patch depth is a multiple of the packet size, we are
            // guaranteed to be able to process all values in the depth
            // dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

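            // The 4x unrolled loop below runs while d + 4 * packet_size <=
            // max_depth: e.g. for 24 remaining float values with
            // packet_size = 4 it covers the first 16, and the single-packet
            // loop after it handles the last 8.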
            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs =
                CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }

          // Go to the next column.
          continue;
        }

        // If we are not allowed to squeeze reads along the `row` and `depth`
        // dimensions, we must process rows one by one.
        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad = has_padding && (pad_col || lm.padRow(r));
          eigen_assert(has_padding || !lm.padRow(r));

          const StorageIndex base_idx = lm.baseIndex(r, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If the patch depth is a multiple of the packet size, we are
            // guaranteed to be able to process all values in the depth
            // dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, has_padding && pad);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loops above should have filled exactly peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill the remaining elements using loadCoeffStandard.
      for (; k < rows; ++k) {
        *block = lm.loadCoeffStandard(k);
        ++block;
      }
    }
  }

  template <bool patch_depth_is_multiple_of_packet_size>
  EIGEN_ALWAYS_INLINE void packNonStandardPatches(Scalar* __restrict block,
                                                  const DataMapper& rhs,
                                                  StorageIndex rows,
                                                  StorageIndex cols) {
    eigen_assert(rhs.nonStandardPatches());

    // `peeled_k` is the number of rows that we can fill with vectorized packet
    // loads (the name matches the one used in the gemm_pack_rhs
    // specializations defined above).
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    // The original input column and row after applying all non-standard
    // strides and dilations; computed by padOrSkip{Row,Col}.
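    // (For instance, with patch_row_inflate_strides == 2 every other row of
    // the inflated input is an implicitly inserted zero row, and padOrSkipRow
    // reports such rows so the packing below fills them with zeros instead of
    // reading memory.)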
    Index orig_c = 0;
    Index orig_r = 0;

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_or_skip_col = lm.padOrSkipCol(c, &orig_c);

        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad_or_skip =
              pad_or_skip_col || lm.padOrSkipRow(r, &orig_r);
          const StorageIndex base_idx = lm.origBaseIndex(orig_r, orig_c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If the patch depth is a multiple of the packet size, we are
            // guaranteed to be able to process all values in the depth
            // dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip ? pset1<Packet>(Scalar(0))
                                           : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            const StorageIndex vectorized_depth = max_depth - packet_size;
            StorageIndex d = start_depth;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip ? pset1<Packet>(Scalar(0))
                                           : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, pad_or_skip);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loops above should have filled exactly peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill the remaining elements using loadCoeff.
      for (; k < rows; ++k) {
        *block = lm(k);
        ++block;
      }
    }
  }
};
}  // namespace internal
}  // namespace Eigen
#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_