1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under the BSD-style license found in the |
6 | * LICENSE file in the root directory of this source tree. |
7 | */ |
8 | |
9 | #include <stdbool.h> |
10 | #include <stddef.h> |
11 | #include <stdint.h> |
12 | |
13 | #ifdef _MSC_VER |
14 | #include <windows.h> |
15 | #else |
16 | #include <pthread.h> |
17 | #endif |
18 | |
19 | #include <cpuinfo.h> |
20 | #include <qnnpack.h> |
21 | #include <qnnpack/log.h> |
22 | #include <qnnpack/params.h> |
23 | #include <qnnpack/q8avgpool.h> |
24 | #include <qnnpack/q8conv.h> |
25 | #include <qnnpack/q8dwconv.h> |
26 | #include <qnnpack/q8gavgpool.h> |
27 | #include <qnnpack/q8gemm.h> |
28 | #include <qnnpack/q8vadd.h> |
29 | #include <qnnpack/u8clamp.h> |
30 | #include <qnnpack/u8lut32norm.h> |
31 | #include <qnnpack/u8maxpool.h> |
32 | #include <qnnpack/u8rmax.h> |
33 | #include <qnnpack/x8lut.h> |
34 | #include <qnnpack/x8zip.h> |
35 | |
/*
 * Guard object for the platform's one-time-initialization primitive. It is
 * passed to InitOnceExecuteOnce() on MSVC builds and to pthread_once()
 * everywhere else. Both variants use the documented static initializer of
 * their API rather than relying on implicit zero-initialization.
 */
#ifdef _MSC_VER
static INIT_ONCE init_guard = INIT_ONCE_STATIC_INIT;
/* Callback handed to InitOnceExecuteOnce(); defined later in this file. */
BOOL CALLBACK init_win(PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContex);
#else
static pthread_once_t init_guard = PTHREAD_ONCE_INIT;
#endif
42 | |
43 | struct qnnp_parameters qnnp_params = { |
44 | .initialized = false |
45 | }; |
46 | |
47 | static void init(void) { |
48 | #if CPUINFO_ARCH_ARM |
49 | if (!cpuinfo_has_arm_neon()) { |
50 | qnnp_log_error("QNNPACK initialization failed: NEON is not supported" ); |
51 | return; |
52 | } |
53 | qnnp_params.q8conv = (struct q8conv_parameters) { |
54 | .gemm = q8gemm_ukernel_4x8__aarch32_neon, |
55 | .conv = q8conv_ukernel_4x8__aarch32_neon, |
56 | .mr = 4, |
57 | .nr = 8, |
58 | .kr = 1, |
59 | }; |
60 | qnnp_params.q8conv_xzp = (struct q8conv_xzp_parameters) { |
61 | .gemm = q8gemm_xzp_ukernel_4x8c2__aarch32_neon, |
62 | .mr = 4, |
63 | .nr = 8, |
64 | .kr = 2, |
65 | .kc = 8, |
66 | .kthreshold = SIZE_MAX, |
67 | }; |
68 | /* setup xzp threshold based on measurements */ |
69 | switch (cpuinfo_get_core(0)->uarch) { |
70 | case cpuinfo_uarch_cortex_a72: |
71 | qnnp_params.q8conv_xzp.kthreshold = 64; |
72 | break; |
73 | case cpuinfo_uarch_cortex_a73: |
74 | qnnp_params.q8conv_xzp.kthreshold = 256; |
75 | break; |
76 | case cpuinfo_uarch_cortex_a75: |
77 | qnnp_params.q8conv_xzp.kthreshold = 32; |
78 | break; |
79 | case cpuinfo_uarch_cortex_a76: |
80 | qnnp_params.q8conv_xzp.kthreshold = 16; |
81 | break; |
82 | default: |
83 | break; |
84 | } |
85 | qnnp_params.q8dw9 = (struct q8dwconv_up_parameters) { |
86 | .updw = q8dwconv_ukernel_up8x9__aarch32_neon, |
87 | .cr = 8, |
88 | }; |
89 | qnnp_params.q8dw25 = (struct q8dwconv_mp_parameters) { |
90 | .mpdw = q8dwconv_ukernel_mp8x25__neon, |
91 | .cr = 8, |
92 | }; |
93 | qnnp_params.q8sum_rows = (struct q8sum_rows_parameters) { |
94 | .sum_rows = q8sumrows_ukernel_4x__neon, |
95 | .m = 4, |
96 | }; |
97 | qnnp_params.q8vadd = q8vadd_ukernel__neon; |
98 | qnnp_params.q8gavgpool = (struct q8gavgpool_parameters) { |
99 | .ltnr = q8gavgpool_ukernel_up8xm__neon, |
100 | .genr_lemr = q8gavgpool_ukernel_up8x7__neon, |
101 | .genr_gtmr = q8gavgpool_ukernel_mp8x7p7q__neon, |
102 | .mr = 7, |
103 | .nr = 8, |
104 | }; |
105 | qnnp_params.q8avgpool = (struct q8avgpool_parameters) { |
106 | .ltkr = q8avgpool_ukernel_up8xm__neon, |
107 | .gekr_lemr = q8avgpool_ukernel_up8x9__neon, |
108 | .gekr_gtmr = q8avgpool_ukernel_mp8x9p8q__neon, |
109 | .mr = 9, |
110 | .qr = 8, |
111 | .kr = 8, |
112 | }; |
113 | qnnp_params.u8maxpool = (struct u8maxpool_parameters) { |
114 | .ltkr = u8maxpool_ukernel_sub16__neon, |
115 | .gekr = u8maxpool_ukernel_16x9p8q__neon, |
116 | .mr = 9, |
117 | .qr = 8, |
118 | .kr = 16, |
119 | }; |
120 | qnnp_params.x8zip = (struct x8zip_parameters) { |
121 | .x2 = qnnp_x8zip_x2__neon, |
122 | .x3 = qnnp_x8zip_x3__neon, |
123 | .x4 = qnnp_x8zip_x4__neon, |
124 | .xm = qnnp_x8zip_xm__neon, |
125 | }; |
126 | qnnp_params.u8clamp = u8clamp_ukernel__neon; |
127 | qnnp_params.u8rmax = u8rmax_ukernel__neon; |
128 | qnnp_params.u8lut32norm = u8lut32norm_ukernel__scalar; |
129 | qnnp_params.x8lut = x8lut_ukernel__scalar; |
130 | #elif CPUINFO_ARCH_ARM64 |
131 | qnnp_params.q8conv = (struct q8conv_parameters) { |
132 | .gemm = q8gemm_ukernel_8x8__aarch64_neon, |
133 | .conv = q8conv_ukernel_8x8__aarch64_neon, |
134 | .mr = 8, |
135 | .nr = 8, |
136 | .kr = 1, |
137 | }; |
138 | qnnp_params.q8conv_xzp = (struct q8conv_xzp_parameters) { |
139 | .kthreshold = SIZE_MAX, |
140 | }; |
141 | qnnp_params.q8dw9 = (struct q8dwconv_up_parameters) { |
142 | .updw = q8dwconv_ukernel_up8x9__neon, |
143 | .cr = 8, |
144 | }; |
145 | qnnp_params.q8dw25 = (struct q8dwconv_mp_parameters) { |
146 | .mpdw = q8dwconv_ukernel_mp8x25__neon, |
147 | .cr = 8, |
148 | }; |
149 | qnnp_params.q8vadd = q8vadd_ukernel__neon; |
150 | qnnp_params.q8gavgpool = (struct q8gavgpool_parameters) { |
151 | .ltnr = q8gavgpool_ukernel_up8xm__neon, |
152 | .genr_lemr = q8gavgpool_ukernel_up8x7__neon, |
153 | .genr_gtmr = q8gavgpool_ukernel_mp8x7p7q__neon, |
154 | .mr = 7, |
155 | .nr = 8, |
156 | }; |
157 | qnnp_params.q8avgpool = (struct q8avgpool_parameters) { |
158 | .ltkr = q8avgpool_ukernel_up8xm__neon, |
159 | .gekr_lemr = q8avgpool_ukernel_up8x9__neon, |
160 | .gekr_gtmr = q8avgpool_ukernel_mp8x9p8q__neon, |
161 | .mr = 9, |
162 | .qr = 8, |
163 | .kr = 8, |
164 | }; |
165 | qnnp_params.u8maxpool = (struct u8maxpool_parameters) { |
166 | .ltkr = u8maxpool_ukernel_sub16__neon, |
167 | .gekr = u8maxpool_ukernel_16x9p8q__neon, |
168 | .mr = 9, |
169 | .qr = 8, |
170 | .kr = 16, |
171 | }; |
172 | qnnp_params.x8zip = (struct x8zip_parameters) { |
173 | .x2 = qnnp_x8zip_x2__neon, |
174 | .x3 = qnnp_x8zip_x3__neon, |
175 | .x4 = qnnp_x8zip_x4__neon, |
176 | .xm = qnnp_x8zip_xm__neon, |
177 | }; |
178 | qnnp_params.u8clamp = u8clamp_ukernel__neon; |
179 | qnnp_params.u8rmax = u8rmax_ukernel__neon; |
180 | qnnp_params.u8lut32norm = u8lut32norm_ukernel__scalar; |
181 | qnnp_params.x8lut = x8lut_ukernel__scalar; |
182 | #elif CPUINFO_ARCH_X86 || CPUINFO_ARCH_X86_64 |
183 | if (!cpuinfo_has_x86_sse2()) { |
184 | qnnp_log_error("QNNPACK initialization failed: SSE2 is not supported" ); |
185 | return; |
186 | } |
187 | qnnp_params.q8conv = (struct q8conv_parameters){ |
188 | .gemm = q8gemm_ukernel_4x4c2__sse2, |
189 | .conv = q8conv_ukernel_4x4c2__sse2, |
190 | .mr = 4, |
191 | .nr = 4, |
192 | .kr = 2, |
193 | }; |
194 | qnnp_params.q8conv_xzp = (struct q8conv_xzp_parameters) { |
195 | .kthreshold = SIZE_MAX, |
196 | }; |
197 | qnnp_params.q8dw9 = (struct q8dwconv_up_parameters) { |
198 | .updw = q8dwconv_ukernel_up8x9__sse2, |
199 | .cr = 8, |
200 | }; |
201 | qnnp_params.q8dw25 = (struct q8dwconv_mp_parameters) { |
202 | .mpdw = q8dwconv_ukernel_mp8x25__sse2, |
203 | .cr = 8, |
204 | }; |
205 | qnnp_params.q8vadd = q8vadd_ukernel__sse2; |
206 | qnnp_params.q8gavgpool = (struct q8gavgpool_parameters) { |
207 | .ltnr = q8gavgpool_ukernel_up8xm__sse2, |
208 | .genr_lemr = q8gavgpool_ukernel_up8x7__sse2, |
209 | .genr_gtmr = q8gavgpool_ukernel_mp8x7p7q__sse2, |
210 | .mr = 7, |
211 | .nr = 8, |
212 | }; |
213 | qnnp_params.q8avgpool = (struct q8avgpool_parameters) { |
214 | .ltkr = q8avgpool_ukernel_up8xm__sse2, |
215 | .gekr_lemr = q8avgpool_ukernel_up8x9__sse2, |
216 | .gekr_gtmr = q8avgpool_ukernel_mp8x9p8q__sse2, |
217 | .mr = 9, |
218 | .qr = 8, |
219 | .kr = 8, |
220 | }; |
221 | qnnp_params.u8maxpool = (struct u8maxpool_parameters) { |
222 | .ltkr = u8maxpool_ukernel_sub16__sse2, |
223 | .gekr = u8maxpool_ukernel_16x9p8q__sse2, |
224 | .mr = 9, |
225 | .qr = 8, |
226 | .kr = 16, |
227 | }; |
228 | qnnp_params.x8zip = (struct x8zip_parameters) { |
229 | .x2 = qnnp_x8zip_x2__sse2, |
230 | .x3 = qnnp_x8zip_x3__sse2, |
231 | .x4 = qnnp_x8zip_x4__sse2, |
232 | .xm = qnnp_x8zip_xm__sse2, |
233 | }; |
234 | qnnp_params.u8clamp = u8clamp_ukernel__sse2; |
235 | qnnp_params.u8rmax = u8rmax_ukernel__sse2; |
236 | qnnp_params.u8lut32norm = u8lut32norm_ukernel__scalar; |
237 | qnnp_params.x8lut = x8lut_ukernel__scalar; |
238 | #else |
239 | #error "Unsupported architecture" |
240 | #endif |
241 | qnnp_params.initialized = true; |
242 | } |
243 | |
244 | enum qnnp_status qnnp_initialize(void) { |
245 | if (!cpuinfo_initialize()) { |
246 | return qnnp_status_out_of_memory; |
247 | } |
248 | #ifdef _MSC_VER |
249 | InitOnceExecuteOnce(&init_guard, init_win, NULL, NULL); |
250 | #else |
251 | pthread_once(&init_guard, &init); |
252 | #endif |
253 | if (qnnp_params.initialized) { |
254 | return qnnp_status_success; |
255 | } else { |
256 | return qnnp_status_unsupported_hardware; |
257 | } |
258 | } |
259 | |
260 | enum qnnp_status qnnp_deinitialize(void) { |
261 | cpuinfo_deinitialize(); |
262 | return qnnp_status_success; |
263 | } |
264 | |
#ifdef _MSC_VER
/*
 * Callback passed to InitOnceExecuteOnce() by qnnp_initialize(). It runs
 * init() and returns TRUE unconditionally; whether initialization actually
 * succeeded is conveyed through qnnp_params.initialized, not the return
 * value. None of the callback arguments are used.
 */
BOOL CALLBACK init_win(PINIT_ONCE InitOnce, PVOID Parameter, PVOID *lpContex) {
  (void) InitOnce;
  (void) Parameter;
  (void) lpContex;

  init();
  return TRUE;
}
#endif
271 | |