1#include <stdbool.h>
2#include <stdint.h>
3#include <stddef.h>
4
5#include <pthread.h>
6
7#include <cpuinfo.h>
8
9#include <nnpack.h>
10#include <nnpack/hwinfo.h>
11#include <nnpack/blas.h>
12#include <nnpack/transform.h>
13#include <nnpack/relu.h>
14#include <nnpack/softmax.h>
15
16struct hardware_info nnp_hwinfo = { };
17static pthread_once_t hwinfo_init_control = PTHREAD_ONCE_INIT;
18
19
#if (CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64) && !defined(__ANDROID__)
    /*
     * Copies the cache description reported by cpuinfo (cache index 0 at each
     * level) into nnp_hwinfo.cache.
     *
     * Levels are probed in the order L1D, L2, L3, L4. Probing stops at the first
     * level for which cpuinfo returns NULL; that level and every deeper one are
     * left untouched.
     */
    static void init_x86_hwinfo(void) {
        const struct cpuinfo_cache* l1d_cache = cpuinfo_get_l1d_cache(0);
        if (l1d_cache == NULL) {
            return;
        }
        /* No .inclusive designator here: for L1 the flag keeps the literal's zero default */
        nnp_hwinfo.cache.l1 = (struct cache_info) {
            .size = l1d_cache->size,
            .associativity = l1d_cache->associativity,
            .threads = l1d_cache->processor_count,
        };

        const struct cpuinfo_cache* l2_cache = cpuinfo_get_l2_cache(0);
        if (l2_cache == NULL) {
            return;
        }
        nnp_hwinfo.cache.l2 = (struct cache_info) {
            .size = l2_cache->size,
            .associativity = l2_cache->associativity,
            .threads = l2_cache->processor_count,
            .inclusive = !!(l2_cache->flags & CPUINFO_CACHE_INCLUSIVE),
        };

        const struct cpuinfo_cache* l3_cache = cpuinfo_get_l3_cache(0);
        if (l3_cache == NULL) {
            return;
        }
        nnp_hwinfo.cache.l3 = (struct cache_info) {
            .size = l3_cache->size,
            .associativity = l3_cache->associativity,
            .threads = l3_cache->processor_count,
            .inclusive = !!(l3_cache->flags & CPUINFO_CACHE_INCLUSIVE),
        };

        const struct cpuinfo_cache* l4_cache = cpuinfo_get_l4_cache(0);
        if (l4_cache == NULL) {
            return;
        }
        nnp_hwinfo.cache.l4 = (struct cache_info) {
            .size = l4_cache->size,
            .associativity = l4_cache->associativity,
            .threads = l4_cache->processor_count,
            .inclusive = !!(l4_cache->flags & CPUINFO_CACHE_INCLUSIVE),
        };
    }
#endif
59
60#if !(CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64) || defined(__ANDROID__)
61 static void init_static_hwinfo(void) {
62 nnp_hwinfo.cache.l1 = (struct cache_info) {
63 .size = 16 * 1024,
64 .associativity = 4,
65 .threads = 1,
66 .inclusive = true,
67 };
68 nnp_hwinfo.cache.l2 = (struct cache_info) {
69 .size = 128 * 1024,
70 .associativity = 4,
71 .threads = 1,
72 .inclusive = true,
73 };
74 nnp_hwinfo.cache.l3 = (struct cache_info) {
75 .size = 2 * 1024 * 1024,
76 .associativity = 8,
77 .threads = 1,
78 .inclusive = true,
79 };
80 }
81#endif
82
#if !CPUINFO_ARCH_X86 && !CPUINFO_ARCH_X86_64 && defined(__APPLE__)
    /*
     * Fills nnp_hwinfo.cache.l1/l2/l3 with fixed cache parameters used for
     * non-x86 Apple builds. All three levels are marked non-inclusive and
     * private to one thread; the L4 entry is not written.
     */
    static void init_static_ios_hwinfo(void) {
        /* 32 KB L1 */
        const struct cache_info level1 = {
            .size = 32 * 1024,
            .associativity = 1,
            .threads = 1,
            .inclusive = false,
        };
        /* 1 MB L2 */
        const struct cache_info level2 = {
            .size = 1 * 1024 * 1024,
            .associativity = 1,
            .threads = 1,
            .inclusive = false,
        };
        /* 2 MB, 8-way L3 */
        const struct cache_info level3 = {
            .size = 2 * 1024 * 1024,
            .associativity = 8,
            .threads = 1,
            .inclusive = false,
        };

        nnp_hwinfo.cache.l1 = level1;
        nnp_hwinfo.cache.l2 = level2;
        nnp_hwinfo.cache.l3 = level3;
    }
#endif
105
#if !NNP_CONVOLUTION_ONLY
    /*
     * Per-backend kernel tables referenced by init_hwinfo().
     * Entry [k] holds the kernel whose name carries the number k + 1
     * (e.g. [0] -> ...dotxf1, [7] -> ...dotxf8).
     */

    /* sdotxf kernels */
    #if NNP_BACKEND_X86_64
        static const nnp_sdotxf_function sdotxf[8] = {
            [0] = nnp_sdotxf1__avx2, [1] = nnp_sdotxf2__avx2,
            [2] = nnp_sdotxf3__avx2, [3] = nnp_sdotxf4__avx2,
            [4] = nnp_sdotxf5__avx2, [5] = nnp_sdotxf6__avx2,
            [6] = nnp_sdotxf7__avx2, [7] = nnp_sdotxf8__avx2,
        };
    #elif NNP_BACKEND_ARM
        static const nnp_sdotxf_function sdotxf[8] = {
            [0] = nnp_sdotxf1__neon, [1] = nnp_sdotxf2__neon,
            [2] = nnp_sdotxf3__neon, [3] = nnp_sdotxf4__neon,
            [4] = nnp_sdotxf5__neon, [5] = nnp_sdotxf6__neon,
            [6] = nnp_sdotxf7__neon, [7] = nnp_sdotxf8__neon,
        };
    #elif NNP_BACKEND_PSIMD
        static const nnp_sdotxf_function sdotxf[8] = {
            [0] = nnp_sdotxf1__psimd, [1] = nnp_sdotxf2__psimd,
            [2] = nnp_sdotxf3__psimd, [3] = nnp_sdotxf4__psimd,
            [4] = nnp_sdotxf5__psimd, [5] = nnp_sdotxf6__psimd,
            [6] = nnp_sdotxf7__psimd, [7] = nnp_sdotxf8__psimd,
        };
    #elif NNP_BACKEND_SCALAR
        static const nnp_sdotxf_function sdotxf[8] = {
            [0] = nnp_sdotxf1__scalar, [1] = nnp_sdotxf2__scalar,
            [2] = nnp_sdotxf3__scalar, [3] = nnp_sdotxf4__scalar,
            [4] = nnp_sdotxf5__scalar, [5] = nnp_sdotxf6__scalar,
            [6] = nnp_sdotxf7__scalar, [7] = nnp_sdotxf8__scalar,
        };
    #endif

    /* shdotxf kernels; the ARM and PSIMD backends share the same PSIMD table */
    #if NNP_BACKEND_X86_64
        static const nnp_shdotxf_function shdotxf[8] = {
            [0] = nnp_shdotxf1__avx2, [1] = nnp_shdotxf2__avx2,
            [2] = nnp_shdotxf3__avx2, [3] = nnp_shdotxf4__avx2,
            [4] = nnp_shdotxf5__avx2, [5] = nnp_shdotxf6__avx2,
            [6] = nnp_shdotxf7__avx2, [7] = nnp_shdotxf8__avx2,
        };
    #elif NNP_BACKEND_ARM || NNP_BACKEND_PSIMD
        static const nnp_shdotxf_function shdotxf[8] = {
            [0] = nnp_shdotxf1__psimd, [1] = nnp_shdotxf2__psimd,
            [2] = nnp_shdotxf3__psimd, [3] = nnp_shdotxf4__psimd,
            [4] = nnp_shdotxf5__psimd, [5] = nnp_shdotxf6__psimd,
            [6] = nnp_shdotxf7__psimd, [7] = nnp_shdotxf8__psimd,
        };
    #elif NNP_BACKEND_SCALAR
        static const nnp_shdotxf_function shdotxf[8] = {
            [0] = nnp_shdotxf1__scalar, [1] = nnp_shdotxf2__scalar,
            [2] = nnp_shdotxf3__scalar, [3] = nnp_shdotxf4__scalar,
            [4] = nnp_shdotxf5__scalar, [5] = nnp_shdotxf6__scalar,
            [6] = nnp_shdotxf7__scalar, [7] = nnp_shdotxf8__scalar,
        };
    #endif
#endif /* !NNP_CONVOLUTION_ONLY */
197
198static void init_hwinfo(void) {
199 #if (CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64) && !defined(__ANDROID__)
200 init_x86_hwinfo();
201 #elif !CPUINFO_ARCH_X86 && !CPUINFO_ARCH_X86_64 && defined(__APPLE__)
202 init_static_ios_hwinfo();
203 #else
204 init_static_hwinfo();
205 #endif
206
207 /* Compute high-level cache blocking parameters */
208 nnp_hwinfo.blocking.l1 = nnp_hwinfo.cache.l1.size;
209 if (nnp_hwinfo.cache.l1.threads > 1) {
210 nnp_hwinfo.blocking.l1 /= nnp_hwinfo.cache.l1.threads;
211 }
212 if (nnp_hwinfo.cache.l2.size != 0) {
213 nnp_hwinfo.blocking.l2 = nnp_hwinfo.cache.l2.size;
214 if (nnp_hwinfo.cache.l2.inclusive) {
215 nnp_hwinfo.blocking.l2 -= nnp_hwinfo.cache.l1.size;
216 }
217 if (nnp_hwinfo.cache.l2.threads > 1) {
218 nnp_hwinfo.blocking.l2 /= nnp_hwinfo.cache.l2.threads;
219 }
220 }
221 if (nnp_hwinfo.cache.l3.size != 0) {
222 nnp_hwinfo.blocking.l3 = nnp_hwinfo.cache.l3.size;
223 if (nnp_hwinfo.cache.l3.inclusive) {
224 nnp_hwinfo.blocking.l3 -= nnp_hwinfo.cache.l2.size;
225 }
226 }
227 nnp_hwinfo.blocking.l4 = nnp_hwinfo.cache.l4.size;
228 if (nnp_hwinfo.cache.l1.size && nnp_hwinfo.cache.l2.size && nnp_hwinfo.cache.l3.size) {
229 #if NNP_BACKEND_X86_64
230 if (cpuinfo_has_x86_avx2() && cpuinfo_has_x86_fma3()) {
231 nnp_hwinfo.simd_width = 8;
232 nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset_and_store__avx2;
233 nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset_and_stream__avx2;
234#if !NNP_INFERENCE_ONLY
235 nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__avx2;
236#endif /* !NNP_INFERENCE_ONLY */
237 nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__avx2;
238 nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__avx2;
239 nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset_and_store__avx2;
240 nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset_and_stream__avx2;
241#if !NNP_INFERENCE_ONLY
242 nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__avx2;
243#endif /* !NNP_INFERENCE_ONLY */
244 nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__avx2;
245 nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__avx2;
246 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset_and_store__avx2;
247 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset_and_stream__avx2;
248 nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3_and_stream__avx2;
249#if !NNP_INFERENCE_ONLY
250 nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R_and_stream__avx2;
251 nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__avx2;
252#endif /* !NNP_INFERENCE_ONLY */
253 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__avx2;
254 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__avx2;
255#if !NNP_CONVOLUTION_ONLY
256 nnp_hwinfo.activations.relu = nnp_relu__avx2;
257 nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__avx2;
258 nnp_hwinfo.activations.grad_relu = nnp_grad_relu__avx2;
259 nnp_hwinfo.activations.softmax = nnp_softmax__avx2;
260 nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__avx2;
261 nnp_hwinfo.sdotxf = (struct sdotxf) {
262 .functions = sdotxf,
263 .fusion = NNP_COUNT_OF(sdotxf),
264 };
265 nnp_hwinfo.shdotxf = (struct shdotxf) {
266 .functions = shdotxf,
267 .fusion = NNP_COUNT_OF(shdotxf),
268 };
269#endif /* !NNP_CONVOLUTION_ONLY */
270 nnp_hwinfo.conv1x1 = (struct convolution) {
271 .mr = 2,
272 .nr = 4,
273 .only_mr_x_nr = nnp_conv1x1_only_2x4__fma3,
274 .upto_mr_x_nr = nnp_conv1x1_upto_2x4__fma3,
275 };
276 nnp_hwinfo.sgemm = (struct sgemm) {
277 .mr = 4,
278 .nr = 24,
279 .only_mr_x_nr = nnp_sgemm_only_4x24__fma3,
280 .upto_mr_x_nr = nnp_sgemm_upto_4x24__fma3,
281 };
282 nnp_hwinfo.sxgemm = (struct sxgemm) {
283 .mr = 3,
284 .nr = 4,
285 .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s8gemm_only_3x4__fma3,
286 .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s8gemm_upto_3x4__fma3,
287 };
288 nnp_hwinfo.cxgemm = (struct cxgemm) {
289 .mr = 2,
290 .nr = 2,
291#if !NNP_INFERENCE_ONLY
292 .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c6gemm_only_2x2__fma3,
293 .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c6gemm_upto_2x2__fma3,
294 .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c8gemm_only_2x2__fma3,
295 .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c8gemm_upto_2x2__fma3,
296#endif /* !NNP_INFERENCE_ONLY */
297 .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c6gemm_conjb_only_2x2__fma3,
298 .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c6gemm_conjb_upto_2x2__fma3,
299 .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c8gemm_conjb_only_2x2__fma3,
300 .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c8gemm_conjb_upto_2x2__fma3,
301#if !NNP_INFERENCE_ONLY
302 .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c6gemm_conjb_transc_only_2x2__fma3,
303 .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c6gemm_conjb_transc_upto_2x2__fma3,
304 .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c8gemm_conjb_transc_only_2x2__fma3,
305 .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c8gemm_conjb_transc_upto_2x2__fma3,
306#endif /* !NNP_INFERENCE_ONLY */
307 };
308 nnp_hwinfo.supported = true;
309 }
310 #elif NNP_BACKEND_PSIMD
311 nnp_hwinfo.simd_width = 4;
312 nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd;
313 nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd;
314#if !NNP_INFERENCE_ONLY
315 nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__psimd;
316#endif /* !NNP_INFERENCE_ONLY */
317 nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__psimd;
318 nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__psimd;
319 nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd;
320 nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd;
321#if !NNP_INFERENCE_ONLY
322 nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__psimd;
323#endif /* !NNP_INFERENCE_ONLY */
324 nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__psimd;
325 nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__psimd;
326 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__psimd;
327 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__psimd;
328 nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3__psimd;
329#if !NNP_INFERENCE_ONLY
330 nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R__psimd;
331 nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__psimd;
332#endif /* !NNP_INFERENCE_ONLY */
333 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__psimd;
334 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__psimd;
335#if !NNP_CONVOLUTION_ONLY
336 nnp_hwinfo.activations.relu = nnp_relu__psimd;
337 nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__psimd;
338 nnp_hwinfo.activations.grad_relu = nnp_grad_relu__psimd;
339 nnp_hwinfo.activations.softmax = nnp_softmax__psimd;
340 nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__psimd;
341 nnp_hwinfo.sdotxf = (struct sdotxf) {
342 .functions = sdotxf,
343 .fusion = NNP_COUNT_OF(sdotxf),
344 };
345 nnp_hwinfo.shdotxf = (struct shdotxf) {
346 .functions = shdotxf,
347 .fusion = NNP_COUNT_OF(shdotxf),
348 };
349#endif /* !NNP_CONVOLUTION_ONLY */
350 nnp_hwinfo.conv1x1 = (struct convolution) {
351 .mr = 2,
352 .nr = 4,
353 .only_mr_x_nr = nnp_conv1x1_only_2x4__psimd,
354 .upto_mr_x_nr = nnp_conv1x1_upto_2x4__psimd,
355 };
356 nnp_hwinfo.sgemm = (struct sgemm) {
357 .mr = 4,
358 .nr = 8,
359 .only_mr_x_nr = nnp_sgemm_only_4x8__psimd,
360 .upto_mr_x_nr = nnp_sgemm_upto_4x8__psimd,
361 };
362 nnp_hwinfo.sxgemm = (struct sxgemm) {
363 .mr = 3,
364 .nr = 4,
365 .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x4__psimd,
366 .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4gemm_upto_3x4__psimd,
367 };
368 nnp_hwinfo.cxgemm = (struct cxgemm) {
369 .mr = 2,
370 .nr = 2,
371#if !NNP_INFERENCE_ONLY
372 .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_only_2x2__psimd,
373 .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_upto_2x2__psimd,
374 .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_only_2x2__psimd,
375 .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_upto_2x2__psimd,
376#endif /* !NNP_INFERENCE_ONLY */
377 .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_only_2x2__psimd,
378 .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_upto_2x2__psimd,
379 .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_only_2x2__psimd,
380 .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_upto_2x2__psimd,
381#if !NNP_INFERENCE_ONLY
382 .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_only_2x2__psimd,
383 .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_upto_2x2__psimd,
384 .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_transc_only_2x2__psimd,
385 .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_transc_upto_2x2__psimd,
386#endif /* !NNP_INFERENCE_ONLY */
387 };
388 nnp_hwinfo.supported = true;
389 #elif NNP_BACKEND_ARM
390 nnp_hwinfo.simd_width = 4;
391 nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd;
392 nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__psimd;
393#if !NNP_INFERENCE_ONLY
394 nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__psimd;
395#endif /* !NNP_INFERENCE_ONLY */
396 nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__psimd;
397 nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__psimd;
398 nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd;
399 nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__psimd;
400#if !NNP_INFERENCE_ONLY
401 nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__psimd;
402#endif /* !NNP_INFERENCE_ONLY */
403 nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__psimd;
404 nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__psimd;
405 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__neon;
406 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__neon;
407 nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3__neon;
408#if !NNP_INFERENCE_ONLY
409 nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R__neon;
410 nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__neon;
411#endif /* !NNP_INFERENCE_ONLY */
412 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__neon;
413 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__neon;
414 nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3s2_with_bias__neon;
415 nnp_hwinfo.transforms.owt_f6x6_3x3s2_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3s2_with_bias_with_relu__neon;
416 if (cpuinfo_has_arm_neon_fp16()) {
417 nnp_hwinfo.transforms.iwt_f6x6_3x3_fp16_with_offset = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_fp16_with_offset__neonhp;
418 nnp_hwinfo.transforms.kwt_f6x6_3x3_fp16 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3_fp16__neonhp;
419 nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_fp16_with_bias__neonhp;
420 nnp_hwinfo.transforms.owt_f6x6_3x3_fp16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_fp16_with_bias_with_relu__neonhp;
421 }
422#if !NNP_CONVOLUTION_ONLY
423 nnp_hwinfo.activations.relu = nnp_relu__neon;
424 nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__neon;
425 nnp_hwinfo.activations.grad_relu = nnp_grad_relu__neon;
426 nnp_hwinfo.activations.softmax = nnp_softmax__psimd;
427 nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__psimd;
428 nnp_hwinfo.sdotxf = (struct sdotxf) {
429 .functions = sdotxf,
430 .fusion = NNP_COUNT_OF(sdotxf),
431 };
432 nnp_hwinfo.shdotxf = (struct shdotxf) {
433 .functions = shdotxf,
434 .fusion = NNP_COUNT_OF(shdotxf),
435 };
436#endif /* !NNP_CONVOLUTION_ONLY */
437 nnp_hwinfo.conv1x1 = (struct convolution) {
438 .mr = 4,
439 .nr = 4,
440 .only_mr_x_nr = nnp_conv1x1_only_4x4__neon,
441 .upto_mr_x_nr = nnp_conv1x1_upto_4x4__neon,
442 };
443 nnp_hwinfo.sgemm = (struct sgemm) {
444 .mr = 6,
445 .nr = 8,
446 #if CPUINFO_ARCH_ARM
447 .only_mr_x_nr = nnp_sgemm_only_6x8__aarch32_neon,
448 #else
449 .only_mr_x_nr = nnp_sgemm_only_6x8__neon,
450 #endif
451 .upto_mr_x_nr = nnp_sgemm_upto_6x8__neon,
452 };
453 nnp_hwinfo.sxgemm = (struct sxgemm) {
454 .mr = 3,
455 .nr = 3,
456 #if CPUINFO_ARCH_ARM
457 .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x3__aarch32_neon,
458 #else
459 .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x3__neon,
460 #endif
461 .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4gemm_upto_3x3__neon,
462 };
463 #if CPUINFO_ARCH_ARM
464 if (cpuinfo_has_arm_neon_fma()) {
465 nnp_hwinfo.sxgemm.only_mr_x_nr =
466 (nnp_fast_tuple_gemm_function) nnp_s4gemm_only_3x3__aarch32_neon2;
467 }
468 #endif
469 if (cpuinfo_has_arm_neon_fp16()) {
470 nnp_hwinfo.hxgemm = (struct hxgemm) {
471 .mr = 3,
472 .nr = 3,
473 .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__neonhp,
474 .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_h4gemm_upto_3x3__neonhp,
475 };
476 }
477 #if CPUINFO_ARCH_ARM
478 if (cpuinfo_has_arm_neon_fp16_arith()) {
479 nnp_hwinfo.hxgemm.only_mr_x_nr =
480 (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__aarch32_neonhparith;
481 nnp_hwinfo.hxgemm.upto_mr_x_nr =
482 (nnp_full_tuple_gemm_function) nnp_h4gemm_upto_3x3__aarch32_neon2;
483 } else if (cpuinfo_has_arm_neon_fma()) {
484 nnp_hwinfo.hxgemm.only_mr_x_nr =
485 (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__aarch32_neon2;
486 nnp_hwinfo.hxgemm.upto_mr_x_nr =
487 (nnp_full_tuple_gemm_function) nnp_h4gemm_upto_3x3__aarch32_neon2;
488 } else if (cpuinfo_has_arm_neon_fp16()) {
489 nnp_hwinfo.hxgemm.only_mr_x_nr =
490 (nnp_fast_tuple_gemm_function) nnp_h4gemm_only_3x3__aarch32_neonhp;
491 }
492 #endif
493 nnp_hwinfo.cxgemm = (struct cxgemm) {
494 .mr = 2,
495 .nr = 2,
496#if !NNP_INFERENCE_ONLY
497 .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_only_2x2__neon,
498 .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_upto_2x2__neon,
499 .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_only_2x2__neon,
500 .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_upto_2x2__neon,
501#endif /* !NNP_INFERENCE_ONLY */
502 .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_only_2x2__neon,
503 .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_upto_2x2__neon,
504 .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_only_2x2__neon,
505 .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_upto_2x2__neon,
506#if !NNP_INFERENCE_ONLY
507 .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_only_2x2__neon,
508 .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s4c2gemm_conjb_transc_upto_2x2__neon,
509 .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_c4gemm_conjb_transc_only_2x2__neon,
510 .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_c4gemm_conjb_transc_upto_2x2__neon,
511#endif /* !NNP_INFERENCE_ONLY */
512 };
513 nnp_hwinfo.supported = cpuinfo_has_arm_neon();
514 #elif NNP_BACKEND_SCALAR
515 nnp_hwinfo.simd_width = 1;
516 nnp_hwinfo.transforms.fft8x8_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__scalar;
517 nnp_hwinfo.transforms.fft8x8_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft8x8_with_offset__scalar;
518#if !NNP_INFERENCE_ONLY
519 nnp_hwinfo.transforms.ifft8x8_with_offset = (nnp_transform_2d_with_offset) nnp_ifft8x8_with_offset__scalar;
520#endif /* !NNP_INFERENCE_ONLY */
521 nnp_hwinfo.transforms.ifft8x8_with_bias = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias__scalar;
522 nnp_hwinfo.transforms.ifft8x8_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft8x8_with_bias_with_relu__scalar;
523 nnp_hwinfo.transforms.fft16x16_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__scalar;
524 nnp_hwinfo.transforms.fft16x16_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_fft16x16_with_offset__scalar;
525#if !NNP_INFERENCE_ONLY
526 nnp_hwinfo.transforms.ifft16x16_with_offset = (nnp_transform_2d_with_offset) nnp_ifft16x16_with_offset__scalar;
527#endif /* !NNP_INFERENCE_ONLY */
528 nnp_hwinfo.transforms.ifft16x16_with_bias = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias__scalar;
529 nnp_hwinfo.transforms.ifft16x16_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_ifft16x16_with_bias_with_relu__scalar;
530 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_store = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__scalar;
531 nnp_hwinfo.transforms.iwt_f6x6_3x3_with_offset_and_stream = (nnp_transform_2d_with_offset) nnp_iwt8x8_3x3_with_offset__scalar;
532 nnp_hwinfo.transforms.kwt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_kwt8x8_3x3__scalar;
533#if !NNP_INFERENCE_ONLY
534 nnp_hwinfo.transforms.kwt_f6x6_3Rx3R = (nnp_transform_2d_with_offset) nnp_kwt8x8_3Rx3R__scalar;
535 nnp_hwinfo.transforms.owt_f6x6_3x3 = (nnp_transform_2d_with_offset) nnp_owt8x8_3x3__scalar;
536#endif /* !NNP_INFERENCE_ONLY */
537 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias__scalar;
538 nnp_hwinfo.transforms.owt_f6x6_3x3_with_bias_with_relu = (nnp_transform_2d_with_bias) nnp_owt8x8_3x3_with_bias_with_relu__scalar;
539#if !NNP_CONVOLUTION_ONLY
540 nnp_hwinfo.activations.relu = nnp_relu__scalar;
541 nnp_hwinfo.activations.inplace_relu = nnp_inplace_relu__scalar;
542 nnp_hwinfo.activations.grad_relu = nnp_grad_relu__scalar;
543 nnp_hwinfo.activations.softmax = nnp_softmax__scalar;
544 nnp_hwinfo.activations.inplace_softmax = nnp_inplace_softmax__scalar;
545 nnp_hwinfo.sdotxf = (struct sdotxf) {
546 .functions = sdotxf,
547 .fusion = NNP_COUNT_OF(sdotxf),
548 };
549 nnp_hwinfo.shdotxf = (struct shdotxf) {
550 .functions = shdotxf,
551 .fusion = NNP_COUNT_OF(shdotxf),
552 };
553#endif /* !NNP_CONVOLUTION_ONLY */
554 nnp_hwinfo.conv1x1 = (struct convolution) {
555 .mr = 2,
556 .nr = 4,
557 .only_mr_x_nr = nnp_conv1x1_only_2x4__scalar,
558 .upto_mr_x_nr = nnp_conv1x1_upto_2x4__scalar,
559 };
560 nnp_hwinfo.sgemm = (struct sgemm) {
561 .mr = 4,
562 .nr = 3,
563 .only_mr_x_nr = nnp_sgemm_only_4x3__scalar,
564 .upto_mr_x_nr = nnp_sgemm_upto_4x3__scalar,
565 };
566 nnp_hwinfo.sxgemm = (struct sxgemm) {
567 .mr = 4,
568 .nr = 3,
569 .only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_sgemm_only_4x3__scalar,
570 .upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_sgemm_upto_4x3__scalar,
571 };
572 nnp_hwinfo.cxgemm = (struct cxgemm) {
573 .mr = 2,
574 .nr = 2,
575#if !NNP_INFERENCE_ONLY
576 .s4cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s2gemm_only_2x2__scalar,
577 .s4cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s2gemm_upto_2x2__scalar,
578 .cX_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_cgemm_only_2x2__scalar,
579 .cX_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_cgemm_upto_2x2__scalar,
580#endif /* !NNP_INFERENCE_ONLY */
581 .s4cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s2gemm_only_2x2__scalar,
582 .s4cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s2gemm_upto_2x2__scalar,
583 .cX_conjb_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_cgemm_conjb_only_2x2__scalar,
584 .cX_conjb_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_cgemm_conjb_upto_2x2__scalar,
585#if !NNP_INFERENCE_ONLY
586 .s4cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_s2gemm_transc_only_2x2__scalar,
587 .s4cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_s2gemm_transc_upto_2x2__scalar,
588 .cX_conjb_transc_only_mr_x_nr = (nnp_fast_tuple_gemm_function) nnp_cgemm_conjb_transc_only_2x2__scalar,
589 .cX_conjb_transc_upto_mr_x_nr = (nnp_full_tuple_gemm_function) nnp_cgemm_conjb_transc_upto_2x2__scalar,
590#endif /* !NNP_INFERENCE_ONLY */
591 };
592 nnp_hwinfo.supported = true;
593 #else
594 #error Unsupported backend
595 #endif
596 }
597
598 nnp_hwinfo.initialized = true;
599}
600
601enum nnp_status nnp_initialize(void) {
602 if (!cpuinfo_initialize()) {
603 return nnp_status_out_of_memory;
604 }
605 pthread_once(&hwinfo_init_control, &init_hwinfo);
606 if (nnp_hwinfo.supported) {
607 return nnp_status_success;
608 } else {
609 return nnp_status_unsupported_hardware;
610 }
611}
612
613enum nnp_status nnp_deinitialize(void) {
614 cpuinfo_deinitialize();
615 return nnp_status_success;
616}
617