/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_
#define TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"

// Note: the following header is used in both TF and TFLite. In particular, it
// is used for the float TFLite Conv2D.
#include "tensorflow/core/kernels/eigen_spatial_convolutions-inl.h"

#if defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#include "tensorflow/core/kernels/eigen_contraction_kernel.h"

namespace Eigen {
namespace internal {

// After we have vectorized all loads from the underlying tensor using Packet
// ops, we have to finalize the trailing coefficients that do not fill a whole
// packet.
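//
// As a rough illustration of the difference between the two specializations
// below (the concrete numbers are an assumption used only for illustration,
// not taken from this file): with packet_size == 4, depth == 8 and
// max_depth == 11 there are 3 trailing coefficients. The generic version
// copies them with 3 scalar loads/stores, while the masked specialization
// issues one partialPacketNoPadding load and one masked pstoreu covering
// 3 lanes.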
template <typename Scalar, typename DataMapper, int packet_size,
          bool masked_load_store>
struct FinalizeDataMapperCoeffs {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    const Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);

    for (; depth < max_depth; ++depth) {
      *block = pad ? Scalar(0) : rhs.coeffNoPadding(depth, base_idx);
      ++block;
    }

    return num_coeffs;
  }
};

template <typename Scalar, typename DataMapper, int packet_size>
struct FinalizeDataMapperCoeffs<Scalar, DataMapper, packet_size,
                                /*masked_load_store=*/true> {
  EIGEN_ALWAYS_INLINE static Index finalize(Scalar* block,
                                            const DataMapper& rhs,
                                            Index base_idx, Index depth,
                                            Index max_depth, bool pad = false) {
    Index num_coeffs = max_depth - depth;
    eigen_assert(num_coeffs <= packet_size);
    if (num_coeffs == 0) return 0;

    using Packet = typename packet_traits<Scalar>::type;
    Packet p = pad ? pset1<Packet>(Scalar(0))
                   : rhs.partialPacketNoPadding(depth, base_idx, num_coeffs);
    internal::pstoreu(block, p, mask<Packet>(0, num_coeffs));

    return num_coeffs;
  }
};

// Pack a block of the right-hand side input matrix (in our case it is always a
// "virtual matrix" constructed from extracted image patches) into a contiguous
// block in column-major storage order. Knowing the properties of the original
// patch op, we can do this more efficiently than the default
// gemm_pack_colmajor_block.
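//
// A brief sketch of the layout this specialization assumes (phrased here as a
// reading of the mapper types above, not as an additional guarantee): the RHS
// is the reshaped output of TensorImagePatchOp, so each column of the virtual
// matrix is one flattened patch, and the contraction dimension walks that
// patch with depth innermost, then patch rows, then patch columns. The packing
// loops below mirror exactly that order (col -> c -> r -> d).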
template <typename NewDimension, Index Rows, Index Cols, typename ArgType,
          typename Device, typename Scalar, typename StorageIndex,
          typename nocontract_t, typename contract_t, int packet_size,
          bool inner_dim_contiguous, bool inner_dim_reordered, int Alignment>
struct gemm_pack_colmajor_block<
    Scalar, StorageIndex,
    TensorContractionSubMapper<
        Scalar, StorageIndex, Rhs,
        TensorEvaluator<
            const TensorReshapingOp<
                NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
            Device>,
        nocontract_t, contract_t, packet_size, inner_dim_contiguous,
        inner_dim_reordered, Alignment>,
    ColMajor> {
  typedef TensorContractionSubMapper<
      Scalar, StorageIndex, Rhs,
      TensorEvaluator<
          const TensorReshapingOp<
              NewDimension, const TensorImagePatchOp<Rows, Cols, ArgType> >,
          Device>,
      nocontract_t, contract_t, packet_size, inner_dim_contiguous,
      inner_dim_reordered, Alignment>
      SubMapper;

  typedef SubMapper DataMapper;
  typedef typename packet_traits<Scalar>::type Packet;

  using CoeffFinalizer = FinalizeDataMapperCoeffs<
      Scalar, DataMapper, packet_size,
      TensorEvaluatorHasPartialPacket<typename DataMapper::TensorEvaluatorT,
                                      Packet, Index>::value &&
          unpacket_traits<Packet>::masked_store_available>;

  EIGEN_DONT_INLINE
  void operator()(Scalar* block, const DataMapper& rhs, StorageIndex rows,
                  StorageIndex cols) {
    const bool standard_patches = !rhs.nonStandardPatches();

    if (standard_patches && (rhs.patchDepth() % packet_size == 0)) {
      // A single packet always belongs to a single patch (row, col).
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/true,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (standard_patches) {
      // A single packet can span multiple patch rows or columns.
      if (rhs.hasPadding()) {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/true>(block, rhs, rows, cols);
      } else {
        packStandardPatches</*patch_depth_is_multiple_of_packet_size=*/false,
                            /*has_padding=*/false>(block, rhs, rows, cols);
      }

    } else if (rhs.patchDepth() % packet_size == 0) {
      // A single packet always belongs to a single patch (row, col).
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             true>(block, rhs, rows, cols);

    } else {
      // A single packet can span multiple patch rows or columns.
      packNonStandardPatches</*patch_depth_is_multiple_of_packet_size=*/
                             false>(block, rhs, rows, cols);
    }
  }

 private:
  // (A) Standard image patches:
  //
  //   (1) patch_row_inflate_strides == 1  AND
  //   (2) patch_col_inflate_strides == 1
  //
  // Standard patches guarantee that the two innermost dimensions (depth and
  // rows) are contiguous in memory, so we can try to squeeze reads from them.
  //
  // (B) Non-standard image patches: in_row/in_col and patch_row/patch_col
  // strides can differ from 1, and for each [row, col] inside a patch we have
  // to do additional computations to find the corresponding row and column in
  // the input tensor. Also, we can no longer squeeze reads from the inner
  // dimensions.
  //
  // Additional parameters:
  // - patch_depth_is_multiple_of_packet_size=true: The depth dimension size is
  //   guaranteed to be a multiple of the packet size, so we can skip all
  //   non-vectorized loads and checks, because the block size is guaranteed to
  //   be a multiple of the packet size (see TensorContractionBlocking).
  //
  // - has_padding: The input tensor has non-zero padding. In this case, for
  //   each patch column and row we need to check that it does not correspond
  //   to the padded region of the original input.
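  //
  // A small illustrative example of the "squeezed read" path below (the
  // numbers are an assumption used only for illustration): with
  // patch_depth == 8, rows [start_row, max_row) == [0, 3) and no padding in
  // that range, depth and rows are contiguous in memory, so the code can copy
  // 3 * 8 == 24 coefficients with plain packet loads starting at
  // lm.baseIndex(start_row, c), instead of handling each of the 3 rows
  // separately.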
  template <bool patch_depth_is_multiple_of_packet_size, bool has_padding>
  EIGEN_ALWAYS_INLINE void packStandardPatches(Scalar* __restrict block,
                                               const DataMapper& rhs,
                                               StorageIndex rows,
                                               StorageIndex cols) {
    eigen_assert(!rhs.nonStandardPatches());

    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_col = has_padding && lm.padCol(c);

        eigen_assert(has_padding || !lm.padCol(c));
        eigen_assert(has_padding || !lm.padAnyRow(start_row, max_row - 1));

        // We can squeeze reads for all rows in [start_row, max_row) range.
        if (!has_padding ||
            (!pad_col && !lm.padAnyRow(start_row, max_row - 1))) {
          const StorageIndex start_depth =
              (c == start_col) ? rhs_depth_offset : 0;

          const StorageIndex max_depth =
              std::min<StorageIndex>(start_depth + (peeled_k - k),
                                     (max_row - start_row) * rhs.patchDepth());

          const StorageIndex base_idx = lm.baseIndex(start_row, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed that
            // we can process all values in depth dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex unrolled_depth = max_depth - 4 * packet_size;
            for (; d <= unrolled_depth; d += 4 * packet_size) {
              eigen_assert(k < peeled_k);

              Packet p0 = rhs.packetNoPadding(d + 0 * packet_size, base_idx);
              Packet p1 = rhs.packetNoPadding(d + 1 * packet_size, base_idx);
              Packet p2 = rhs.packetNoPadding(d + 2 * packet_size, base_idx);
              Packet p3 = rhs.packetNoPadding(d + 3 * packet_size, base_idx);

              internal::pstoreu(block + 0 * packet_size, p0);
              internal::pstoreu(block + 1 * packet_size, p1);
              internal::pstoreu(block + 2 * packet_size, p2);
              internal::pstoreu(block + 3 * packet_size, p3);

              block += 4 * packet_size;
              k += 4 * packet_size;
            }

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              internal::pstoreu(block, rhs.packetNoPadding(d, base_idx));
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs =
                CoeffFinalizer::finalize(block, rhs, base_idx, d, max_depth);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }

          // Go to the next column.
          continue;
        }

        // If we are not allowed to squeeze reads along the `row` and `depth`
        // dimensions, we must process rows one by one.
        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad = has_padding && (pad_col || lm.padRow(r));
          eigen_assert(has_padding || !lm.padRow(r));

          const StorageIndex base_idx = lm.baseIndex(r, c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed that
            // we can process all values in depth dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            StorageIndex d = start_depth;

            const StorageIndex vectorized_depth = max_depth - packet_size;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = (has_padding && pad)
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, has_padding && pad);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeffStandard.
      for (; k < rows; ++k) {
        *block = lm.loadCoeffStandard(k);
        ++block;
      }
    }
  }

  template <bool patch_depth_is_multiple_of_packet_size>
  EIGEN_ALWAYS_INLINE void packNonStandardPatches(Scalar* __restrict block,
                                                  const DataMapper& rhs,
                                                  StorageIndex rows,
                                                  StorageIndex cols) {
    eigen_assert(rhs.nonStandardPatches());

    // Give vectorized_rows the name used in all other gemm_pack_rhs above.
    const StorageIndex peeled_k = (rows / packet_size) * packet_size;

    const StorageIndex start_col = rhs.colOffset();
    const StorageIndex max_col = rhs.maxCol(peeled_k);
    const StorageIndex rhs_depth_offset = rhs.depthOffset();

    // Original input column and row after applying all non-standard strides
    // and dilations. Computed by padOrSkip{Row,Col}.
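    //
    // A hedged reading of the loop below (an illustration of the code as
    // written, not an extra guarantee from the mapper API): for every patch
    // location (r, c) the mapper reports, via padOrSkip{Row,Col}, whether that
    // location falls into a padded region or must be skipped, and otherwise
    // writes the corresponding input coordinates into (orig_r, orig_c).
    // Padded/skipped locations are emitted as a run of zeros; everything else
    // is read with packetNoPadding from lm.origBaseIndex(orig_r, orig_c).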
    Index orig_c = 0;
    Index orig_r = 0;

    for (StorageIndex col = 0; col < cols; ++col) {
      SubMapper lm = rhs.getLinearMapper(0, col);

      StorageIndex k = 0;
      for (Index c = start_col; c < max_col; ++c) {
        eigen_assert(k <= peeled_k);

        const StorageIndex start_row = (c == start_col) ? rhs.rowOffset() : 0;
        const StorageIndex max_row = rhs.maxRow(peeled_k, c);
        const bool pad_or_skip_col = lm.padOrSkipCol(c, &orig_c);

        for (StorageIndex r = start_row; r < max_row; ++r) {
          eigen_assert(k <= peeled_k);

          const StorageIndex start_depth =
              ((c == start_col) && (r == start_row)) ? rhs_depth_offset : 0;
          const StorageIndex max_depth =
              rhs.maxDepth(peeled_k - k, start_depth);

          const bool pad_or_skip =
              pad_or_skip_col || lm.padOrSkipRow(r, &orig_r);
          const StorageIndex base_idx = lm.origBaseIndex(orig_r, orig_c);

          if (patch_depth_is_multiple_of_packet_size) {
            // If patch depth is a multiple of packet size, it's guaranteed that
            // we can process all values in depth dimension with packets.
            eigen_assert((max_depth - start_depth) % packet_size == 0);
            StorageIndex d = start_depth;

            for (; d < max_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

          } else {
            const StorageIndex vectorized_depth = max_depth - packet_size;
            StorageIndex d = start_depth;
            for (; d <= vectorized_depth; d += packet_size) {
              eigen_assert(k < peeled_k);
              const Packet p = pad_or_skip
                                   ? pset1<Packet>(Scalar(0))
                                   : rhs.packetNoPadding(d, base_idx);
              internal::pstoreu(block, p);
              block += packet_size;
              k += packet_size;
            }

            eigen_assert(k <= peeled_k);
            const Index num_coeffs = CoeffFinalizer::finalize(
                block, rhs, base_idx, d, max_depth, pad_or_skip);

            k += num_coeffs;
            block += num_coeffs;
            eigen_assert(k <= peeled_k);
          }
        }
      }

      // The loop above should fill peeled_k elements.
      eigen_assert(peeled_k == k);

      // Fill remaining elements using loadCoeff.
      for (; k < rows; ++k) {
        *block = lm(k);
        ++block;
      }
    }
  }
};

}  // namespace internal
}  // namespace Eigen

#endif  // defined(TENSORFLOW_USE_CUSTOM_CONTRACTION_KERNEL)
#endif  // TENSORFLOW_CORE_KERNELS_EIGEN_SPATIAL_CONVOLUTIONS_H_