1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #ifndef RUY_RUY_PACK_COMMON_H_ |
17 | #define RUY_RUY_PACK_COMMON_H_ |
18 | |
19 | #include <algorithm> |
20 | #include <cstdint> |
21 | #include <cstring> |
22 | #include <limits> |
23 | #include <type_traits> |
24 | |
25 | #include "ruy/check_macros.h" |
26 | #include "ruy/mat.h" |
27 | #include "ruy/matrix.h" |
28 | #include "ruy/opt_set.h" |
29 | #include "ruy/path.h" |
30 | #include "ruy/platform.h" |
31 | #include "ruy/profiler/instrumentation.h" |
32 | #include "ruy/tune.h" |
33 | |
34 | namespace ruy { |
35 | |
// Returns the zero point around which values of type Scalar are treated as
// symmetric:
//   - 0 for floating-point types and for signed integer types;
//   - max() / 2 + 1 for unsigned integer types, i.e. the midpoint of the
//     representable range (for example 128 for std::uint8_t).
//
// Declared constexpr so that the result can be used in constant expressions
// (static_assert, constexpr variables) in addition to ordinary runtime calls.
template <typename Scalar>
constexpr Scalar SymmetricZeroPoint() {
  // A single return statement keeps this a valid C++11 constexpr function.
  return (std::is_floating_point<Scalar>::value ||
          std::is_signed<Scalar>::value)
             ? static_cast<Scalar>(0)
             : static_cast<Scalar>(std::numeric_limits<Scalar>::max() / 2 + 1);
}
46 | |
47 | template <Path ThePath, typename Scalar> |
48 | struct PackedTypeImpl { |
49 | using Type = Scalar; |
50 | }; |
51 | |
52 | template <Path ThePath, typename Scalar> |
53 | using PackedType = typename PackedTypeImpl<ThePath, Scalar>::Type; |
54 | |
55 | template <typename PackedScalar, typename Scalar> |
56 | PackedScalar Pack(Scalar x) { |
57 | return x - SymmetricZeroPoint<Scalar>() + SymmetricZeroPoint<PackedScalar>(); |
58 | } |
59 | |
60 | template <Path ThePath, typename FixedKernelLayout, typename Scalar, |
61 | typename PackedScalar, typename SumsType, Order SrcOrder> |
62 | struct PackImpl; |
63 | |
// Defines the partial specialization of PackImpl for path CHILD (generic in
// all the other template parameters) as an empty struct deriving from the
// PackImpl instantiation for path PARENT with the same remaining parameters.
// CHILD thereby reuses whatever PARENT's PackImpl provides.
#define RUY_INHERIT_PACK(PARENT, CHILD)                                  \
  template <typename FixedKernelLayout, typename Scalar,                 \
            typename PackedScalar, typename SumsType, Order SrcOrder>    \
  struct PackImpl<CHILD, FixedKernelLayout, Scalar, PackedScalar,        \
                  SumsType, SrcOrder>                                    \
      : public PackImpl<PARENT, FixedKernelLayout, Scalar, PackedScalar, \
                        SumsType, SrcOrder> {};
70 | |
71 | // A generic yet fairly fast implementation of |
72 | // |
73 | // PackImpl<ThePath, FixedKernelLayout<Order::kRowMajor, 1, KernelCols>, |
74 | // float, float, float, Order::kRowMajor> |
75 | // |
76 | // that is, a packing code path for the case of floating-point, row-major |
77 | // source matrix, targeting typical float kernel layouts consisting of a |
78 | // single row. |
79 | // |
80 | // The only reason why this isn't a partial specialization of PackImpl is that |
81 | // this leads to ambiguous partial specializations as this conflicts with |
82 | // the ones defined by RUY_INHERIT_PACK. |
83 | // |
84 | // What's special about floating-point kernels is that they tend to use |
85 | // FixedKernelLayout<Order::kRowMajor, 1, KernelCols> for some value of |
86 | // KernelCols, making it easy to implement the packing code as essentially |
87 | // a bunch of memcpy's with compile-time-fixed size |
88 | // (KernelCols * sizeof(float)), typically 16, 32 or 64 bytes. Unlike the |
89 | // quantized case, there are no sums to compute, and the float kernels tend |
90 | // to use this kind of simple layout on multiple architectures, unlike the |
91 | // heavily architecture-specific layouts of quantized kernels. |
92 | // |
93 | // Here are the current instantiations of this template (as of 2020): |
94 | // Path | KernelCols |
95 | // --------------+--------------------------------- |
96 | // kNeon (ARM32) | 8 and 4 (for LHS and RHS sides) |
97 | // kNeon (ARM64) | 8 |
98 | // kAvxFma | 8 |
99 | // kAvx512 | 16 |
100 | template <Path ThePath, int KernelCols> |
101 | struct MemcpyRowMajorFloatPackImpl { |
102 | static void Run(Tuning, const Mat<float>& src_matrix, |
103 | PMat<float>* packed_matrix, int start_col, int end_col) { |
104 | RUY_DCHECK(IsRowMajor(src_matrix.layout)); |
105 | RUY_DCHECK(IsColMajor(packed_matrix->layout)); |
106 | RUY_DCHECK_EQ(start_col % KernelCols, 0); |
107 | int src_stride = src_matrix.layout.stride; |
108 | // As the source matrix is row-major and the destination packed matrix is |
109 | // column-major, there is no traversal order that will be optimal for both |
110 | // so we choose to favor the source matrix with a row-major traversal order. |
111 | for (int block_row = 0; block_row < src_matrix.layout.rows; |
112 | block_row += 1) { |
113 | const float* src_ptr = |
114 | src_matrix.data.get() + src_stride * block_row + start_col; |
115 | float* packed_ptr = packed_matrix->data + |
116 | packed_matrix->layout.stride * start_col + |
117 | KernelCols * block_row; |
118 | int src_cols = std::min(end_col, src_matrix.layout.cols) - start_col; |
119 | int col = 0; |
120 | for (; col <= src_cols - KernelCols; col += KernelCols) { |
121 | memcpy(packed_ptr, src_ptr, KernelCols * sizeof(float)); |
122 | packed_ptr += KernelCols * packed_matrix->layout.stride; |
123 | src_ptr += KernelCols; |
124 | } |
125 | int remaining_cols = src_cols - col; |
126 | if (remaining_cols > 0) { |
127 | memcpy(packed_ptr, src_ptr, remaining_cols * sizeof(float)); |
128 | memset(packed_ptr + remaining_cols, 0, |
129 | (KernelCols - remaining_cols) * sizeof(float)); |
130 | } |
131 | } |
132 | } |
133 | }; |
134 | |
// Defines the full specialization
//
//   PackImpl<ThePath, FixedKernelLayout<Order::kRowMajor, 1, KernelCols>,
//            float, float, float, Order::kRowMajor>
//
// as an empty struct deriving from
// MemcpyRowMajorFloatPackImpl<ThePath, KernelCols>, so that this PackImpl
// uses the memcpy-based row-major float packing code.
#define RUY_USE_MEMCPY_ROWMAJOR_FLOAT_PACK(ThePath, KernelCols)       \
  template <>                                                         \
  struct PackImpl<ThePath,                                            \
                  FixedKernelLayout<Order::kRowMajor, 1, KernelCols>, \
                  float, float, float, Order::kRowMajor>              \
      : public MemcpyRowMajorFloatPackImpl<ThePath, KernelCols> {};
140 | |
141 | } // namespace ruy |
142 | |
143 | #endif // RUY_RUY_PACK_COMMON_H_ |
144 | |