1 | // Copyright 2022 Google LLC |
2 | // |
3 | // This source code is licensed under the BSD-style license found in the |
4 | // LICENSE file in the root directory of this source tree. |
5 | |
6 | #include <assert.h> |
7 | #include <stddef.h> |
8 | |
9 | #ifdef _WIN32 |
10 | #include <windows.h> |
11 | #else |
12 | #include <pthread.h> |
13 | #endif |
14 | |
15 | #include <xnnpack/common.h> |
16 | #include <xnnpack/config.h> |
17 | #include <xnnpack/microparams-init.h> |
18 | #include <xnnpack/transpose.h> |
19 | #include <xnnpack/vunary.h> |
20 | |
21 | |
22 | static struct xnn_transpose_config transpose_config = {0}; |
23 | |
24 | #if XNN_PLATFORM_WINDOWS |
25 | static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT; |
26 | #else |
27 | static pthread_once_t init_guard = PTHREAD_ONCE_INIT; |
28 | #endif |
29 | |
30 | static void init_transpose_config(void) { |
31 | #if XNN_ARCH_ARM |
32 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
33 | assert(hardware_config != NULL); |
34 | |
35 | if (hardware_config->use_arm_neon) { |
36 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
37 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
38 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon, |
39 | .tile_size = 32, |
40 | }; |
41 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
42 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon, |
43 | .tile_size = 32, |
44 | }; |
45 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
46 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__2x2_neon_tbl64, |
47 | .init.x24 = (xnn_init_x24_transpose_params_fn) xnn_init_x24_transpose_neon_tbl64_params, |
48 | .tile_size = 32, |
49 | }; |
50 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
51 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__4x4_reuse_dec_zip_neon, |
52 | .tile_size = 32, |
53 | }; |
54 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
55 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
56 | .tile_size = 32, |
57 | }; |
58 | } else if (!XNN_PLATFORM_MOBILE) { |
59 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
60 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
61 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__2x4_scalar_int, |
62 | .tile_size = 32, |
63 | }; |
64 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
65 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__2x4_scalar_int, |
66 | .tile_size = 32, |
67 | }; |
68 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
69 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__1x2_scalar, |
70 | .tile_size = 32, |
71 | }; |
72 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
73 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__2x4_scalar_int, |
74 | .tile_size = 32, |
75 | }; |
76 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
77 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
78 | .tile_size = 32, |
79 | }; |
80 | } |
81 | #elif XNN_ARCH_ARM64 |
82 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
83 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
84 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__16x16_reuse_dec_zip_neon, |
85 | .tile_size = 32, |
86 | }; |
87 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
88 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__8x8_reuse_dec_zip_neon, |
89 | .tile_size = 32, |
90 | }; |
91 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
92 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__4x4_aarch64_neon_tbl128, |
93 | .init.x24 = (xnn_init_x24_transpose_params_fn) xnn_init_x24_transpose_neon_tbl128_params, |
94 | .tile_size = 32, |
95 | }; |
96 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
97 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__4x4_aarch64_neon_tbl128, |
98 | .tile_size = 32, |
99 | .init.x32 = (xnn_init_x32_transpose_params_fn) xnn_init_x32_transpose_neon_tbl128_params, |
100 | }; |
101 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
102 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
103 | .tile_size = 32, |
104 | }; |
105 | #elif XNN_ARCH_X86 || XNN_ARCH_X86_64 |
106 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
107 | assert(hardware_config != NULL); |
108 | |
109 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
110 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
111 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
112 | .tile_size = 32, |
113 | }; |
114 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
115 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__16x16_reuse_mov_sse2, |
116 | .tile_size = 32, |
117 | }; |
118 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
119 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__8x8_reuse_multi_sse2, |
120 | .tile_size = 32, |
121 | }; |
122 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
123 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__1x2_scalar, |
124 | .tile_size = 32, |
125 | }; |
126 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
127 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__4x4_sse, |
128 | .tile_size = 32, |
129 | }; |
130 | if (hardware_config->use_x86_ssse3) { |
131 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
132 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__4x4_ssse3, |
133 | .init.x24 = (xnn_init_x24_transpose_params_fn) xnn_init_x24_transpose_ssse3_params, |
134 | .tile_size = 32, |
135 | }; |
136 | } |
137 | if (hardware_config->use_x86_avx) { |
138 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
139 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__8x8_reuse_multi_avx, |
140 | .init.x32 = (xnn_init_x32_transpose_params_fn) xnn_init_x32_transpose_avx_params, |
141 | .tile_size = 32, |
142 | }; |
143 | } |
144 | if (hardware_config->use_x86_avx2) { |
145 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
146 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__32x32_reuse_switch_avx2, |
147 | .init.x8 = (xnn_init_x8_transpose_params_fn) xnn_init_x8_transpose_avx2_params, |
148 | .tile_size = 32, |
149 | }; |
150 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
151 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__16x16_reuse_switch_avx2, |
152 | .init.x16 = (xnn_init_x16_transpose_params_fn) xnn_init_x16_transpose_avx2_params, |
153 | .tile_size = 32, |
154 | }; |
155 | } |
156 | #elif XNN_ARCH_WASMSIMD || XNN_ARCH_WASMRELAXEDSIMD |
157 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
158 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
159 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__16x16_reuse_mov_wasmsimd, |
160 | .tile_size = 32, |
161 | }; |
162 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
163 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__8x8_reuse_mov_wasmsimd, |
164 | .tile_size = 32, |
165 | }; |
166 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
167 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__1x2_scalar, |
168 | .tile_size = 32, |
169 | }; |
170 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
171 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__4x4_reuse_mov_wasmsimd, |
172 | .tile_size = 32, |
173 | }; |
174 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
175 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
176 | .tile_size = 32, |
177 | }; |
178 | #elif XNN_ARCH_WASM |
179 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
180 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
181 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__2x4_scalar_int, |
182 | .tile_size = 32, |
183 | }; |
184 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
185 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__2x4_scalar_int, |
186 | .tile_size = 32, |
187 | }; |
188 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
189 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__1x2_scalar, |
190 | .tile_size = 32, |
191 | }; |
192 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
193 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__2x4_scalar_int, |
194 | .tile_size = 32, |
195 | }; |
196 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
197 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
198 | .tile_size = 32, |
199 | }; |
200 | #elif XNN_ARCH_RISCV |
201 | transpose_config.copy = (xnn_vunary_ukernel_fn) xnn_xx_copy_ukernel__scalar_memcpy; |
202 | transpose_config.x8 = (struct xnn_transpose_subconfig) { |
203 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x8_transposec_ukernel__2x4_scalar_int, |
204 | .tile_size = 32, |
205 | }; |
206 | transpose_config.x16 = (struct xnn_transpose_subconfig) { |
207 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x16_transposec_ukernel__2x4_scalar_int, |
208 | .tile_size = 32, |
209 | }; |
210 | transpose_config.x24 = (struct xnn_transpose_subconfig) { |
211 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x24_transposec_ukernel__1x2_scalar, |
212 | .tile_size = 32, |
213 | }; |
214 | transpose_config.x32 = (struct xnn_transpose_subconfig) { |
215 | .const_size_ukernel = (xnn_transposec_ukernel_fn) xnn_x32_transposec_ukernel__2x4_scalar_int, |
216 | .tile_size = 32, |
217 | }; |
218 | transpose_config.xx = (struct xnn_transpose_subconfig) { |
219 | .variable_size_ukernel = xnn_xx_transposev_ukernel__1x1_scalar_memcpy, |
220 | .tile_size = 32, |
221 | }; |
222 | #else |
223 | #error "Unsupported architecture" |
224 | #endif |
225 | } |
226 | |
#if XNN_PLATFORM_WINDOWS
  // Callback handed to InitOnceExecuteOnce: runs init_transpose_config() and
  // always reports success. None of the three callback arguments are used.
  static BOOL CALLBACK init_transpose_config_windows(PINIT_ONCE init_once, PVOID parameter, PVOID* context) {
    (void) init_once;
    (void) parameter;
    (void) context;

    init_transpose_config();
    return TRUE;
  }
#endif
233 | |
234 | const struct xnn_transpose_config* xnn_init_transpose_config() { |
235 | const struct xnn_hardware_config* hardware_config = xnn_init_hardware_config(); |
236 | if (hardware_config == NULL) { |
237 | return NULL; |
238 | } |
239 | #if XNN_PLATFORM_WINDOWS |
240 | InitOnceExecuteOnce(&init_guard, &init_transpose_config_windows, NULL, NULL); |
241 | #else |
242 | pthread_once(&init_guard, &init_transpose_config); |
243 | #endif |
244 | return &transpose_config; |
245 | } |
246 | |