1 | #include <stdbool.h> |
2 | #include <stdint.h> |
3 | #include <stddef.h> |
4 | |
5 | #include <pthread.h> |
6 | |
7 | #include <cpuinfo.h> |
8 | |
9 | #include <nnpack.h> |
10 | #include <nnpack/hwinfo.h> |
11 | #include <nnpack/blas.h> |
12 | #include <nnpack/transform.h> |
13 | #include <nnpack/relu.h> |
14 | #include <nnpack/softmax.h> |
15 | |
16 | struct hardware_info nnp_hwinfo = { }; |
17 | static pthread_once_t hwinfo_init_control = PTHREAD_ONCE_INIT; |
18 | |
19 | |
20 | #if (CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64) && !defined(__ANDROID__) |
21 | static void init_x86_hwinfo(void) { |
22 | const struct cpuinfo_cache* l1d = cpuinfo_get_l1d_cache(0); |
23 | if (l1d != NULL) { |
24 | nnp_hwinfo.cache.l1 = (struct cache_info) { |
25 | .size = l1d->size, |
26 | .associativity = l1d->associativity, |
27 | .threads = l1d->processor_count, |
28 | }; |
29 | const struct cpuinfo_cache* l2 = cpuinfo_get_l2_cache(0); |
30 | if (l2 != NULL) { |
31 | nnp_hwinfo.cache.l2 = (struct cache_info) { |
32 | .size = l2->size, |
33 | .associativity = l2->associativity, |
34 | .threads = l2->processor_count, |
35 | .inclusive = !!(l2->flags & CPUINFO_CACHE_INCLUSIVE), |
36 | }; |
37 | const struct cpuinfo_cache* l3 = cpuinfo_get_l3_cache(0); |
38 | if (l3 != NULL) { |
39 | nnp_hwinfo.cache.l3 = (struct cache_info) { |
40 | .size = l3->size, |
41 | .associativity = l3->associativity, |
42 | .threads = l3->processor_count, |
43 | .inclusive = !!(l3->flags & CPUINFO_CACHE_INCLUSIVE), |
44 | }; |
45 | const struct cpuinfo_cache* l4 = cpuinfo_get_l4_cache(0); |
46 | if (l4 != NULL) { |
47 | nnp_hwinfo.cache.l4 = (struct cache_info) { |
48 | .size = l4->size, |
49 | .associativity = l4->associativity, |
50 | .threads = l4->processor_count, |
51 | .inclusive = !!(l4->flags & CPUINFO_CACHE_INCLUSIVE), |
52 | }; |
53 | } |
54 | } |
55 | } |
56 | } |
57 | } |
58 | #endif |
59 | |
60 | #if !(CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64) || defined(__ANDROID__) |
61 | static void init_static_hwinfo(void) { |
62 | nnp_hwinfo.cache.l1 = (struct cache_info) { |
63 | .size = 16 * 1024, |
64 | .associativity = 4, |
65 | .threads = 1, |
66 | .inclusive = true, |
67 | }; |
68 | nnp_hwinfo.cache.l2 = (struct cache_info) { |
69 | .size = 128 * 1024, |
70 | .associativity = 4, |
71 | .threads = 1, |
72 | .inclusive = true, |
73 | }; |
74 | nnp_hwinfo.cache.l3 = (struct cache_info) { |
75 | .size = 2 * 1024 * 1024, |
76 | .associativity = 8, |
77 | .threads = 1, |
78 | .inclusive = true, |
79 | }; |
80 | } |
81 | #endif |
82 | |
83 | #if !CPUINFO_ARCH_X86 && !CPUINFO_ARCH_X86_64 && defined(__APPLE__) |
84 | static void init_static_ios_hwinfo(void) { |
85 | nnp_hwinfo.cache.l1 = (struct cache_info) { |
86 | .size = 32 * 1024, |
87 | .associativity = 1, |
88 | .threads = 1, |
89 | .inclusive = false, |
90 | }; |
91 | nnp_hwinfo.cache.l2 = (struct cache_info) { |
92 | .size = 1 * 1024 * 1024, |
93 | .associativity = 1, |
94 | .threads = 1, |
95 | .inclusive = false, |
96 | }; |
97 | nnp_hwinfo.cache.l3 = (struct cache_info) { |
98 | .size = 2 * 1024 * 1024, |
99 | .associativity = 8, |
100 | .threads = 1, |
101 | .inclusive = false, |
102 | }; |
103 | } |
104 | #endif |
105 | |
#if !NNP_CONVOLUTION_ONLY
/*
 * Backend-specific kernel tables that init_hwinfo() publishes through
 * nnp_hwinfo.sdotxf and nnp_hwinfo.shdotxf. Each table has eight entries;
 * entry i is the kernel whose name carries the number i + 1 (presumably the
 * number of dot products fused per call -- confirm against the kernel
 * sources).
 */
#if NNP_BACKEND_X86_64
	/* AVX2 kernels. */
	static const nnp_sdotxf_function sdotxf[8] = {
		nnp_sdotxf1__avx2, nnp_sdotxf2__avx2, nnp_sdotxf3__avx2, nnp_sdotxf4__avx2,
		nnp_sdotxf5__avx2, nnp_sdotxf6__avx2, nnp_sdotxf7__avx2, nnp_sdotxf8__avx2,
	};

	static const nnp_shdotxf_function shdotxf[8] = {
		nnp_shdotxf1__avx2, nnp_shdotxf2__avx2, nnp_shdotxf3__avx2, nnp_shdotxf4__avx2,
		nnp_shdotxf5__avx2, nnp_shdotxf6__avx2, nnp_shdotxf7__avx2, nnp_shdotxf8__avx2,
	};
#elif NNP_BACKEND_ARM
	/* NEON kernels for sdotxf; shdotxf uses the PSIMD kernels on this backend. */
	static const nnp_sdotxf_function sdotxf[8] = {
		nnp_sdotxf1__neon, nnp_sdotxf2__neon, nnp_sdotxf3__neon, nnp_sdotxf4__neon,
		nnp_sdotxf5__neon, nnp_sdotxf6__neon, nnp_sdotxf7__neon, nnp_sdotxf8__neon,
	};

	static const nnp_shdotxf_function shdotxf[8] = {
		nnp_shdotxf1__psimd, nnp_shdotxf2__psimd, nnp_shdotxf3__psimd, nnp_shdotxf4__psimd,
		nnp_shdotxf5__psimd, nnp_shdotxf6__psimd, nnp_shdotxf7__psimd, nnp_shdotxf8__psimd,
	};
#elif NNP_BACKEND_PSIMD
	/* PSIMD kernels. */
	static const nnp_sdotxf_function sdotxf[8] = {
		nnp_sdotxf1__psimd, nnp_sdotxf2__psimd, nnp_sdotxf3__psimd, nnp_sdotxf4__psimd,
		nnp_sdotxf5__psimd, nnp_sdotxf6__psimd, nnp_sdotxf7__psimd, nnp_sdotxf8__psimd,
	};

	static const nnp_shdotxf_function shdotxf[8] = {
		nnp_shdotxf1__psimd, nnp_shdotxf2__psimd, nnp_shdotxf3__psimd, nnp_shdotxf4__psimd,
		nnp_shdotxf5__psimd, nnp_shdotxf6__psimd, nnp_shdotxf7__psimd, nnp_shdotxf8__psimd,
	};
#elif NNP_BACKEND_SCALAR
	/* Scalar kernels. */
	static const nnp_sdotxf_function sdotxf[8] = {
		nnp_sdotxf1__scalar, nnp_sdotxf2__scalar, nnp_sdotxf3__scalar, nnp_sdotxf4__scalar,
		nnp_sdotxf5__scalar, nnp_sdotxf6__scalar, nnp_sdotxf7__scalar, nnp_sdotxf8__scalar,
	};

	static const nnp_shdotxf_function shdotxf[8] = {
		nnp_shdotxf1__scalar, nnp_shdotxf2__scalar, nnp_shdotxf3__scalar, nnp_shdotxf4__scalar,
		nnp_shdotxf5__scalar, nnp_shdotxf6__scalar, nnp_shdotxf7__scalar, nnp_shdotxf8__scalar,
	};
#endif
#endif /* !NNP_CONVOLUTION_ONLY */
197 | |
198 | static void init_hwinfo(void) { |
199 | #if (CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64) && !defined(__ANDROID__) |
200 | init_x86_hwinfo(); |
201 | #elif !CPUINFO_ARCH_X86 && !CPUINFO_ARCH_X86_64 && defined(__APPLE__) |
202 | init_static_ios_hwinfo(); |
203 | #else |
204 | init_static_hwinfo(); |
205 | #endif |
206 | |
207 | /* Compute high-level cache blocking parameters */ |
208 | nnp_hwinfo.blocking.l1 = nnp_hwinfo.cache.l1.size; |
209 | if (nnp_hwinfo.cache.l1.threads > 1) { |
210 | nnp_hwinfo.blocking.l1 /= nnp_hwinfo.cache.l1.threads; |
211 | } |
212 | if (nnp_hwinfo.cache.l2.size != 0) { |
213 | nnp_hwinfo.blocking.l2 = nnp_hwinfo.cache.l2.size; |
214 | if (nnp_hwinfo.cache.l2.inclusive) { |
215 | nnp_hwinfo.blocking.l2 -= nnp_hwinfo.cache.l1.size; |
216 | } |
217 | if (nnp_hwinfo.cache.l2.threads > 1) { |
218 | nnp_hwinfo.blocking.l2 /= nnp_hwinfo.cache.l2.threads; |
219 | } |
220 | } |
221 | if (nnp_hwinfo.cache.l3.size != 0) { |
222 | nnp_hwinfo.blocking.l3 = nnp_hwinfo.cache.l3.size; |
223 | if (nnp_hwinfo.cache.l3.inclusive) { |
224 | nnp_hwinfo.blocking.l3 -= nnp_hwinfo.cache.l2.size; |
225 | } |
226 | } |
227 | nnp_hwinfo.blocking.l4 = nnp_hwinfo.cache.l4.size; |
228 | if (nnp_hwinfo.cache.l1.size && nnp_hwinfo.cache.l2.size && nnp_hwinfo.cache.l3.size) { |
229 | #if NNP_BACKEND_X86_64 |
230 | if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) { |
231 | nnp_hwinfo.simd_width = 8; |
232 | nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset_and_store__avx2; |
233 | nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset_and_stream__avx2; |
234 | #if !NNP_INFERENCE_ONLY |
235 | nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__avx2; |
236 | #endif /* !NNP_INFERENCE_ONLY */ |
237 | nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__avx2; |
238 | nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__avx2; |
239 | nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset_and_store__avx2; |
240 | nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset_and_stream__avx2; |
241 | #if !NNP_INFERENCE_ONLY |
242 | nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__avx2; |
243 | #endif /* !NNP_INFERENCE_ONLY */ |
244 | nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__avx2; |
245 | nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__avx2; |
246 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset_and_store__avx2; |
247 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset_and_stream__avx2; |
248 | nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3_and_stream__avx2; |
249 | #if !NNP_INFERENCE_ONLY |
250 | nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R_and_stream__avx2; |
251 | nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__avx2; |
252 | #endif /* !NNP_INFERENCE_ONLY */ |
253 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__avx2; |
254 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__avx2; |
255 | #if !NNP_CONVOLUTION_ONLY |
256 | nnp_hwinfo.activations.relu = nnp_relu__avx2; |
257 | nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__avx2; |
258 | nnp_hwinfo.activations.grad_relu = nnp_grad_relu__avx2; |
259 | nnp_hwinfo.activations.softmax = nnp_softmax__avx2; |
260 | nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__avx2; |
261 | nnp_hwinfo.sdotxf = (struct sdotxf) { |
262 | .functions = sdotxf, |
263 | .fusion = NNP_COUNT_OF(sdotxf), |
264 | }; |
265 | nnp_hwinfo.shdotxf = (struct shdotxf) { |
266 | .functions = shdotxf, |
267 | .fusion = NNP_COUNT_OF(shdotxf), |
268 | }; |
269 | #endif /* !NNP_CONVOLUTION_ONLY */ |
270 | nnp_hwinfo.conv1x1 = (struct convolution) { |
271 | .mr = 2, |
272 | .nr = 4, |
273 | .only_mr_x_nr = nnp_conv1x1_only_2x4__fma3, |
274 | .upto_mr_x_nr = nnp_conv1x1_upto_2x4__fma3, |
275 | }; |
276 | nnp_hwinfo.sgemm = (struct sgemm) { |
277 | .mr = 4, |
278 | .nr = 24, |
279 | .only_mr_x_nr = nnp_sgemm_only_4x24__fma3, |
280 | .upto_mr_x_nr = nnp_sgemm_upto_4x24__fma3, |
281 | }; |
282 | nnp_hwinfo.sxgemm = (struct sxgemm) { |
283 | .mr = 3, |
284 | .nr = 4, |
285 | .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s8gemm_only_3x4__fma3, |
286 | .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s8gemm_upto_3x4__fma3, |
287 | }; |
288 | nnp_hwinfo.cxgemm = (struct cxgemm) { |
289 | .mr = 2, |
290 | .nr = 2, |
291 | #if !NNP_INFERENCE_ONLY |
292 | .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c6gemm_only_2x2__fma3, |
293 | .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c6gemm_upto_2x2__fma3, |
294 | .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c8gemm_only_2x2__fma3, |
295 | .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c8gemm_upto_2x2__fma3, |
296 | #endif /* !NNP_INFERENCE_ONLY */ |
297 | .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c6gemm_conjb_only_2x2__fma3, |
298 | .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c6gemm_conjb_upto_2x2__fma3, |
299 | .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c8gemm_conjb_only_2x2__fma3, |
300 | .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c8gemm_conjb_upto_2x2__fma3, |
301 | #if !NNP_INFERENCE_ONLY |
302 | .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c6gemm_conjb_transc_only_2x2__fma3, |
303 | .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c6gemm_conjb_transc_upto_2x2__fma3, |
304 | .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c8gemm_conjb_transc_only_2x2__fma3, |
305 | .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c8gemm_conjb_transc_upto_2x2__fma3, |
306 | #endif /* !NNP_INFERENCE_ONLY */ |
307 | }; |
308 | nnp_hwinfo.supported = true; |
309 | } |
310 | #elif NNP_BACKEND_PSIMD |
311 | nnp_hwinfo.simd_width = 4; |
312 | nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd; |
313 | nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd; |
314 | #if !NNP_INFERENCE_ONLY |
315 | nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__psimd; |
316 | #endif /* !NNP_INFERENCE_ONLY */ |
317 | nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__psimd; |
318 | nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__psimd; |
319 | nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd; |
320 | nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd; |
321 | #if !NNP_INFERENCE_ONLY |
322 | nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__psimd; |
323 | #endif /* !NNP_INFERENCE_ONLY */ |
324 | nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__psimd; |
325 | nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__psimd; |
326 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__psimd; |
327 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__psimd; |
328 | nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3__psimd; |
329 | #if !NNP_INFERENCE_ONLY |
330 | nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R__psimd; |
331 | nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__psimd; |
332 | #endif /* !NNP_INFERENCE_ONLY */ |
333 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__psimd; |
334 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__psimd; |
335 | #if !NNP_CONVOLUTION_ONLY |
336 | nnp_hwinfo.activations.relu = nnp_relu__psimd; |
337 | nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__psimd; |
338 | nnp_hwinfo.activations.grad_relu = nnp_grad_relu__psimd; |
339 | nnp_hwinfo.activations.softmax = nnp_softmax__psimd; |
340 | nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__psimd; |
341 | nnp_hwinfo.sdotxf = (struct sdotxf) { |
342 | .functions = sdotxf, |
343 | .fusion = NNP_COUNT_OF(sdotxf), |
344 | }; |
345 | nnp_hwinfo.shdotxf = (struct shdotxf) { |
346 | .functions = shdotxf, |
347 | .fusion = NNP_COUNT_OF(shdotxf), |
348 | }; |
349 | #endif /* !NNP_CONVOLUTION_ONLY */ |
350 | nnp_hwinfo.conv1x1 = (struct convolution) { |
351 | .mr = 2, |
352 | .nr = 4, |
353 | .only_mr_x_nr = nnp_conv1x1_only_2x4__psimd, |
354 | .upto_mr_x_nr = nnp_conv1x1_upto_2x4__psimd, |
355 | }; |
356 | nnp_hwinfo.sgemm = (struct sgemm) { |
357 | .mr = 4, |
358 | .nr = 8, |
359 | .only_mr_x_nr = nnp_sgemm_only_4x8__psimd, |
360 | .upto_mr_x_nr = nnp_sgemm_upto_4x8__psimd, |
361 | }; |
362 | nnp_hwinfo.sxgemm = (struct sxgemm) { |
363 | .mr = 3, |
364 | .nr = 4, |
365 | .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x4__psimd, |
366 | .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4gemm_upto_3x4__psimd, |
367 | }; |
368 | nnp_hwinfo.cxgemm = (struct cxgemm) { |
369 | .mr = 2, |
370 | .nr = 2, |
371 | #if !NNP_INFERENCE_ONLY |
372 | .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_only_2x2__psimd, |
373 | .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_upto_2x2__psimd, |
374 | .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_only_2x2__psimd, |
375 | .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_upto_2x2__psimd, |
376 | #endif /* !NNP_INFERENCE_ONLY */ |
377 | .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_only_2x2__psimd, |
378 | .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_upto_2x2__psimd, |
379 | .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_only_2x2__psimd, |
380 | .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_upto_2x2__psimd, |
381 | #if !NNP_INFERENCE_ONLY |
382 | .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_only_2x2__psimd, |
383 | .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_upto_2x2__psimd, |
384 | .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_transc_only_2x2__psimd, |
385 | .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_transc_upto_2x2__psimd, |
386 | #endif /* !NNP_INFERENCE_ONLY */ |
387 | }; |
388 | nnp_hwinfo.supported = true; |
389 | #elif NNP_BACKEND_ARM |
390 | nnp_hwinfo.simd_width = 4; |
391 | nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd; |
392 | nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd; |
393 | #if !NNP_INFERENCE_ONLY |
394 | nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__psimd; |
395 | #endif /* !NNP_INFERENCE_ONLY */ |
396 | nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__psimd; |
397 | nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__psimd; |
398 | nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd; |
399 | nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd; |
400 | #if !NNP_INFERENCE_ONLY |
401 | nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__psimd; |
402 | #endif /* !NNP_INFERENCE_ONLY */ |
403 | nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__psimd; |
404 | nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__psimd; |
405 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__neon; |
406 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__neon; |
407 | nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3__neon; |
408 | #if !NNP_INFERENCE_ONLY |
409 | nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R__neon; |
410 | nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__neon; |
411 | #endif /* !NNP_INFERENCE_ONLY */ |
412 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__neon; |
413 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__neon; |
414 | nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3s2_with_bias__neon; |
415 | nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3s2_with_bias_with_relu__neon; |
416 | if (cpuinfo_has_arm_neon_fp16()) { |
417 | nnp_hwinfo.transforms.iwt_f6x6_3x3_fp16_with_offset = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_fp16_with_offset__neonhp; |
418 | nnp_hwinfo.transforms.kwt_f6x6_3x3_fp16 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3_fp16__neonhp; |
419 | nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_fp16_with_bias__neonhp; |
420 | nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_fp16_with_bias_with_relu__neonhp; |
421 | } |
422 | #if !NNP_CONVOLUTION_ONLY |
423 | nnp_hwinfo.activations.relu = nnp_relu__neon; |
424 | nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__neon; |
425 | nnp_hwinfo.activations.grad_relu = nnp_grad_relu__neon; |
426 | nnp_hwinfo.activations.softmax = nnp_softmax__psimd; |
427 | nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__psimd; |
428 | nnp_hwinfo.sdotxf = (struct sdotxf) { |
429 | .functions = sdotxf, |
430 | .fusion = NNP_COUNT_OF(sdotxf), |
431 | }; |
432 | nnp_hwinfo.shdotxf = (struct shdotxf) { |
433 | .functions = shdotxf, |
434 | .fusion = NNP_COUNT_OF(shdotxf), |
435 | }; |
436 | #endif /* !NNP_CONVOLUTION_ONLY */ |
437 | nnp_hwinfo.conv1x1 = (struct convolution) { |
438 | .mr = 4, |
439 | .nr = 4, |
440 | .only_mr_x_nr = nnp_conv1x1_only_4x4__neon, |
441 | .upto_mr_x_nr = nnp_conv1x1_upto_4x4__neon, |
442 | }; |
443 | nnp_hwinfo.sgemm = (struct sgemm) { |
444 | .mr = 6, |
445 | .nr = 8, |
446 | #if CPUINFO_ARCH_ARM |
447 | .only_mr_x_nr = nnp_sgemm_only_6x8__aarch32_neon, |
448 | #else |
449 | .only_mr_x_nr = nnp_sgemm_only_6x8__neon, |
450 | #endif |
451 | .upto_mr_x_nr = nnp_sgemm_upto_6x8__neon, |
452 | }; |
453 | nnp_hwinfo.sxgemm = (struct sxgemm) { |
454 | .mr = 3, |
455 | .nr = 3, |
456 | #if CPUINFO_ARCH_ARM |
457 | .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x3__aarch32_neon, |
458 | #else |
459 | .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x3__neon, |
460 | #endif |
461 | .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4gemm_upto_3x3__neon, |
462 | }; |
463 | #if CPUINFO_ARCH_ARM |
464 | if (cpuinfo_has_arm_neon_fma()) { |
465 | nnp_hwinfo.sxgemm.only_mr_x_nr = |
466 | (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x3__aarch32_neon2; |
467 | } |
468 | #endif |
469 | if (cpuinfo_has_arm_neon_fp16()) { |
470 | nnp_hwinfo.hxgemm = (struct hxgemm) { |
471 | .mr = 3, |
472 | .nr = 3, |
473 | .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__neonhp, |
474 | .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_h4gemm_upto_3x3__neonhp, |
475 | }; |
476 | } |
477 | #if CPUINFO_ARCH_ARM |
478 | if (cpuinfo_has_arm_neon_fp16_arith()) { |
479 | nnp_hwinfo.hxgemm.only_mr_x_nr = |
480 | (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__aarch32_neonhparith; |
481 | nnp_hwinfo.hxgemm.upto_mr_x_nr = |
482 | (nnp_full_tuple_gemm_function) nnp_h4gemm_upto_3x3__aarch32_neon2; |
483 | } else if (cpuinfo_has_arm_neon_fma()) { |
484 | nnp_hwinfo.hxgemm.only_mr_x_nr = |
485 | (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__aarch32_neon2; |
486 | nnp_hwinfo.hxgemm.upto_mr_x_nr = |
487 | (nnp_full_tuple_gemm_function) nnp_h4gemm_upto_3x3__aarch32_neon2; |
488 | } else if (cpuinfo_has_arm_neon_fp16()) { |
489 | nnp_hwinfo.hxgemm.only_mr_x_nr = |
490 | (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__aarch32_neonhp; |
491 | } |
492 | #endif |
493 | nnp_hwinfo.cxgemm = (struct cxgemm) { |
494 | .mr = 2, |
495 | .nr = 2, |
496 | #if !NNP_INFERENCE_ONLY |
497 | .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_only_2x2__neon, |
498 | .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_upto_2x2__neon, |
499 | .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_only_2x2__neon, |
500 | .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_upto_2x2__neon, |
501 | #endif /* !NNP_INFERENCE_ONLY */ |
502 | .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_only_2x2__neon, |
503 | .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_upto_2x2__neon, |
504 | .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_only_2x2__neon, |
505 | .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_upto_2x2__neon, |
506 | #if !NNP_INFERENCE_ONLY |
507 | .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_only_2x2__neon, |
508 | .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_upto_2x2__neon, |
509 | .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_transc_only_2x2__neon, |
510 | .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_transc_upto_2x2__neon, |
511 | #endif /* !NNP_INFERENCE_ONLY */ |
512 | }; |
513 | nnp_hwinfo.supported = cpuinfo_has_arm_neon(); |
514 | #elif NNP_BACKEND_SCALAR |
515 | nnp_hwinfo.simd_width = 1; |
516 | nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__scalar; |
517 | nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__scalar; |
518 | #if !NNP_INFERENCE_ONLY |
519 | nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__scalar; |
520 | #endif /* !NNP_INFERENCE_ONLY */ |
521 | nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__scalar; |
522 | nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__scalar; |
523 | nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__scalar; |
524 | nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__scalar; |
525 | #if !NNP_INFERENCE_ONLY |
526 | nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__scalar; |
527 | #endif /* !NNP_INFERENCE_ONLY */ |
528 | nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__scalar; |
529 | nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__scalar; |
530 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__scalar; |
531 | nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__scalar; |
532 | nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3__scalar; |
533 | #if !NNP_INFERENCE_ONLY |
534 | nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R__scalar; |
535 | nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__scalar; |
536 | #endif /* !NNP_INFERENCE_ONLY */ |
537 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__scalar; |
538 | nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__scalar; |
539 | #if !NNP_CONVOLUTION_ONLY |
540 | nnp_hwinfo.activations.relu = nnp_relu__scalar; |
541 | nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__scalar; |
542 | nnp_hwinfo.activations.grad_relu = nnp_grad_relu__scalar; |
543 | nnp_hwinfo.activations.softmax = nnp_softmax__scalar; |
544 | nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__scalar; |
545 | nnp_hwinfo.sdotxf = (struct sdotxf) { |
546 | .functions = sdotxf, |
547 | .fusion = NNP_COUNT_OF(sdotxf), |
548 | }; |
549 | nnp_hwinfo.shdotxf = (struct shdotxf) { |
550 | .functions = shdotxf, |
551 | .fusion = NNP_COUNT_OF(shdotxf), |
552 | }; |
553 | #endif /* !NNP_CONVOLUTION_ONLY */ |
554 | nnp_hwinfo.conv1x1 = (struct convolution) { |
555 | .mr = 2, |
556 | .nr = 4, |
557 | .only_mr_x_nr = nnp_conv1x1_only_2x4__scalar, |
558 | .upto_mr_x_nr = nnp_conv1x1_upto_2x4__scalar, |
559 | }; |
560 | nnp_hwinfo.sgemm = (struct sgemm) { |
561 | .mr = 4, |
562 | .nr = 3, |
563 | .only_mr_x_nr = nnp_sgemm_only_4x3__scalar, |
564 | .upto_mr_x_nr = nnp_sgemm_upto_4x3__scalar, |
565 | }; |
566 | nnp_hwinfo.sxgemm = (struct sxgemm) { |
567 | .mr = 4, |
568 | .nr = 3, |
569 | .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_sgemm_only_4x3__scalar, |
570 | .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_sgemm_upto_4x3__scalar, |
571 | }; |
572 | nnp_hwinfo.cxgemm = (struct cxgemm) { |
573 | .mr = 2, |
574 | .nr = 2, |
575 | #if !NNP_INFERENCE_ONLY |
576 | .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s2gemm_only_2x2__scalar, |
577 | .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s2gemm_upto_2x2__scalar, |
578 | .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_cgemm_only_2x2__scalar, |
579 | .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_cgemm_upto_2x2__scalar, |
580 | #endif /* !NNP_INFERENCE_ONLY */ |
581 | .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s2gemm_only_2x2__scalar, |
582 | .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s2gemm_upto_2x2__scalar, |
583 | .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_cgemm_conjb_only_2x2__scalar, |
584 | .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_cgemm_conjb_upto_2x2__scalar, |
585 | #if !NNP_INFERENCE_ONLY |
586 | .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s2gemm_transc_only_2x2__scalar, |
587 | .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s2gemm_transc_upto_2x2__scalar, |
588 | .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_cgemm_conjb_transc_only_2x2__scalar, |
589 | .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_cgemm_conjb_transc_upto_2x2__scalar, |
590 | #endif /* !NNP_INFERENCE_ONLY */ |
591 | }; |
592 | nnp_hwinfo.supported = true; |
593 | #else |
594 | #error Unsupported backend |
595 | #endif |
596 | } |
597 | |
598 | nnp_hwinfo.initialized = true; |
599 | } |
600 | |
601 | enum nnp_status nnp_initialize(void) { |
602 | if (!cpuinfo_initialize()) { |
603 | return nnp_status_out_of_memory; |
604 | } |
605 | pthread_once(&hwinfo_init_control, &init_hwinfo); |
606 | if (nnp_hwinfo.supported) { |
607 | return nnp_status_success; |
608 | } else { |
609 | return nnp_status_unsupported_hardware; |
610 | } |
611 | } |
612 | |
613 | enum nnp_status nnp_deinitialize(void) { |
614 | cpuinfo_deinitialize(); |
615 | return nnp_status_success; |
616 | } |
617 | |