1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11#include <string.h>
12
13#include <fp16.h>
14
15#include <xnnpack/log.h>
16#include <xnnpack/math.h>
17#include <xnnpack/operator.h>
18#include <xnnpack/pack.h>
19#include <xnnpack/unaligned.h>
20
21
22void xnn_pack_f32_gemm_goi_w(
23 size_t g,
24 size_t nc,
25 size_t kc,
26 size_t nr,
27 size_t kr,
28 size_t sr,
29 const float* k,
30 const float* b,
31 float* packed_weights,
32 size_t extra_bytes,
33 const void* params)
34{
35 assert(g != 0);
36 assert(nr >= sr);
37 assert(k != NULL);
38 assert(packed_weights != NULL);
39
40 const size_t skr = sr * kr;
41 do {
42 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
43 const size_t nr_block_size = min(nc - nr_block_start, nr);
44 if XNN_LIKELY(b != NULL) {
45 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
46 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
47 }
48 }
49 packed_weights += nr;
50
51 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
52 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
53 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
54 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
55 if (kc_idx < kc) {
56 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
57 }
58 }
59 packed_weights += kr;
60 }
61 packed_weights += (nr - nr_block_size) * kr;
62 }
63 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
64 }
65 k += nc * kc;
66 if XNN_UNPREDICTABLE(b != NULL) {
67 b += nc;
68 }
69 } while (--g != 0);
70}
71
72void xnn_pack_f16_gemm_goi_w(
73 size_t g,
74 size_t nc,
75 size_t kc,
76 size_t nr,
77 size_t kr,
78 size_t sr,
79 const uint16_t* k,
80 const uint16_t* b,
81 uint16_t* packed_weights,
82 size_t extra_bytes,
83 const void* params)
84{
85 assert(g != 0);
86 assert(nr >= sr);
87 assert(k != NULL);
88 assert(packed_weights != NULL);
89
90 const size_t skr = sr * kr;
91 do {
92 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
93 const size_t nr_block_size = min(nc - nr_block_start, nr);
94 if XNN_LIKELY(b != NULL) {
95 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
96 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
97 }
98 }
99 packed_weights += nr;
100
101 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
102 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
103 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
104 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
105 if (kc_idx < kc) {
106 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
107 }
108 }
109 packed_weights += kr;
110 }
111 packed_weights += (nr - nr_block_size) * kr;
112 }
113 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
114 }
115 k += nc * kc;
116 if XNN_UNPREDICTABLE(b != NULL) {
117 b += nc;
118 }
119 } while (--g != 0);
120}
121
122void xnn_pack_f32_to_f16_gemm_goi_w(
123 size_t g,
124 size_t nc,
125 size_t kc,
126 size_t nr,
127 size_t kr,
128 size_t sr,
129 const float* k,
130 const float* b,
131 uint16_t* packed_weights,
132 size_t extra_bytes,
133 const void* params)
134{
135 assert(g != 0);
136 assert(nr >= sr);
137 assert(k != NULL);
138 assert(packed_weights != NULL);
139
140 const size_t skr = sr * kr;
141 do {
142 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
143 const size_t nr_block_size = min(nc - nr_block_start, nr);
144 if XNN_LIKELY(b != NULL) {
145 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
146 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
147 }
148 }
149 packed_weights += nr;
150
151 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
152 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
153 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
154 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
155 if (kc_idx < kc) {
156 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
157 }
158 }
159 packed_weights += kr;
160 }
161 packed_weights += (nr - nr_block_size) * kr;
162 }
163 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
164 }
165 k += nc * kc;
166 if XNN_UNPREDICTABLE(b != NULL) {
167 b += nc;
168 }
169 } while (--g != 0);
170}
171
172void xnn_pack_qu8_gemm_goi_w(
173 size_t g,
174 size_t nc,
175 size_t kc,
176 size_t nr,
177 size_t kr,
178 size_t sr,
179 const uint8_t* k,
180 const int32_t* b,
181 void* packed_weights,
182 size_t extra_bytes,
183 const struct xnn_qu8_packing_params* params)
184{
185 assert(g != 0);
186 assert(nr >= sr);
187 assert(k != NULL);
188 assert(packed_weights != NULL);
189
190 const size_t skr = sr * kr;
191 const int32_t izp = (int32_t) params->input_zero_point;
192 const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
193 do {
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195 const size_t nr_block_size = min(nc - nr_block_start, nr);
196 int32_t* packed_b = (int32_t*) packed_weights;
197 if XNN_LIKELY(b != NULL) {
198 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
200 packed_weights = (int32_t*) packed_weights + 1;
201 }
202 } else {
203 size_t n = nr_block_size;
204 do {
205 unaligned_store_s32(packed_weights, bzp);
206 packed_weights = (int32_t*) packed_weights + 1;
207 } while (--n != 0);
208 }
209 packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
210
211 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
212 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213 int32_t ksum = 0;
214 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
215 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
216 if (kc_idx < kc) {
217 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
218 ksum += (int32_t) kv;
219 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
220 }
221 }
222 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
223 packed_weights = (uint8_t*) packed_weights + kr;
224 }
225 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
226 }
227 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
228 }
229 k += nc * kc;
230 if XNN_UNPREDICTABLE(b != NULL) {
231 b += nc;
232 }
233 } while (--g != 0);
234}
235
236void xnn_pack_qs8_gemm_goi_w(
237 size_t g,
238 size_t nc,
239 size_t kc,
240 size_t nr,
241 size_t kr,
242 size_t sr,
243 const int8_t* k,
244 const int32_t* b,
245 void* packed_weights,
246 size_t extra_bytes,
247 const struct xnn_qs8_packing_params* params)
248{
249 assert(g != 0);
250 assert(nr >= sr);
251 assert(k != NULL);
252 assert(packed_weights != NULL);
253
254 const size_t skr = sr * kr;
255 const uint32_t izp = (uint32_t) params->input_zero_point;
256 do {
257 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
258 const size_t nr_block_size = min(nc - nr_block_start, nr);
259 int32_t* packed_b = (int32_t*) packed_weights;
260 if XNN_LIKELY(b != NULL) {
261 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
262 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
263 packed_weights = (int32_t*) packed_weights + 1;
264 }
265 } else {
266 size_t n = nr_block_size;
267 do {
268 unaligned_store_s32(packed_weights, 0);
269 packed_weights = (int32_t*) packed_weights + 1;
270 } while (--n != 0);
271 }
272 packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
273
274 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
275 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
276 uint32_t ksum = 0;
277 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
278 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
279 if (kc_idx < kc) {
280 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
281 ksum += (uint32_t) kv;
282 ((int8_t*) packed_weights)[kr_block_offset] = kv;
283 }
284 }
285 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
286 packed_weights = (int8_t*) packed_weights + kr;
287 }
288 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
289 }
290 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
291 }
292 k += nc * kc;
293 if XNN_UNPREDICTABLE(b != NULL) {
294 b += nc;
295 }
296 } while (--g != 0);
297}
298
299void xnn_pack_qs8_gemm_xw_goi_w(
300 size_t g,
301 size_t nc,
302 size_t kc,
303 size_t nr,
304 size_t kr,
305 size_t sr,
306 const int8_t* k,
307 const int32_t* b,
308 void* packed_weights,
309 size_t extra_bytes,
310 const struct xnn_qs8_packing_params* params)
311{
312 assert(g != 0);
313 assert(nr >= sr);
314 assert(k != NULL);
315 assert(packed_weights != NULL);
316
317 const size_t skr = sr * kr;
318 const uint32_t izp = (uint32_t) params->input_zero_point;
319 do {
320 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
321 const size_t nr_block_size = min(nc - nr_block_start, nr);
322 int32_t* packed_b = (int32_t*) packed_weights;
323 if XNN_LIKELY(b != NULL) {
324 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
325 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
326 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
327 }
328 } else {
329 size_t n = nr_block_size;
330 do {
331 unaligned_store_s32(packed_weights, 0);
332 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
333 } while (--n != 0);
334 }
335 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
336
337 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
338 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
339 uint32_t ksum = 0;
340 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
341 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
342 if (kc_idx < kc) {
343 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
344 ksum += (uint32_t) kv;
345 ((int16_t*) packed_weights)[kr_block_offset] = (int16_t) kv;
346 }
347 }
348 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
349 packed_weights = (int16_t*) packed_weights + kr;
350 }
351 packed_weights = (int16_t*) packed_weights + (nr - nr_block_size) * kr;
352 }
353 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
354 }
355 k += nc * kc;
356 if XNN_UNPREDICTABLE(b != NULL) {
357 b += nc;
358 }
359 } while (--g != 0);
360}
361
362void xnn_pack_f32_gemm_io_w(
363 size_t nc,
364 size_t kc,
365 size_t nr,
366 size_t kr,
367 size_t sr,
368 const float* k,
369 const float* b,
370 float* packed_weights,
371 const void* params)
372{
373 assert(nr >= sr);
374 assert(k != NULL);
375 assert(packed_weights != NULL);
376
377 const size_t skr = sr * kr;
378 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
379 const size_t nr_block_size = min(nc - nr_block_start, nr);
380 if XNN_LIKELY(b != NULL) {
381 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
382 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
383 }
384 }
385 packed_weights += nr;
386
387 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
388 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
389 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
390 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
391 if (kc_idx < kc) {
392 packed_weights[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
393 }
394 }
395 packed_weights += kr;
396 }
397 packed_weights += (nr - nr_block_size) * kr;
398 }
399 }
400}
401
402void xnn_pack_f16_gemm_io_w(
403 size_t nc,
404 size_t kc,
405 size_t nr,
406 size_t kr,
407 size_t sr,
408 const uint16_t* k,
409 const uint16_t* b,
410 uint16_t* packed_weights,
411 const void* params)
412{
413 assert(nr >= sr);
414 assert(k != NULL);
415 assert(packed_weights != NULL);
416
417 const size_t skr = sr * kr;
418 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
419 const size_t nr_block_size = min(nc - nr_block_start, nr);
420 if XNN_LIKELY(b != NULL) {
421 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
422 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
423 }
424 }
425 packed_weights += nr;
426
427 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
428 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
429 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
430 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
431 if (kc_idx < kc) {
432 packed_weights[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
433 }
434 }
435 packed_weights += kr;
436 }
437 packed_weights += (nr - nr_block_size) * kr;
438 }
439 }
440}
441
442void xnn_pack_f32_to_f16_gemm_io_w(
443 size_t nc,
444 size_t kc,
445 size_t nr,
446 size_t kr,
447 size_t sr,
448 const float* k,
449 const float* b,
450 uint16_t* packed_weights,
451 const void* params)
452{
453 assert(nr >= sr);
454 assert(k != NULL);
455 assert(packed_weights != NULL);
456
457 const size_t skr = sr * kr;
458 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
459 const size_t nr_block_size = min(nc - nr_block_start, nr);
460 if XNN_LIKELY(b != NULL) {
461 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
462 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
463 }
464 }
465 packed_weights += nr;
466
467 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
468 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
469 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
470 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
471 if (kc_idx < kc) {
472 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
473 }
474 }
475 packed_weights += kr;
476 }
477 packed_weights += (nr - nr_block_size) * kr;
478 }
479 }
480}
481
482void xnn_pack_qu8_gemm_io_w(
483 size_t nc,
484 size_t kc,
485 size_t nr,
486 size_t kr,
487 size_t sr,
488 const uint8_t* k,
489 const int32_t* b,
490 void* packed_weights,
491 const struct xnn_qu8_packing_params* params)
492{
493 assert(nr >= sr);
494 assert(k != NULL);
495 assert(packed_weights != NULL);
496
497 const size_t skr = sr * kr;
498 const int32_t izp = (int32_t) params->input_zero_point;
499 const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
500 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
501 const size_t nr_block_size = min(nc - nr_block_start, nr);
502 int32_t* packed_b = (int32_t*) packed_weights;
503 if XNN_LIKELY(b != NULL) {
504 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
505 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
506 packed_weights = (int32_t*) packed_weights + 1;
507 }
508 } else {
509 size_t n = nr_block_size;
510 do {
511 unaligned_store_s32(packed_weights, bzp);
512 packed_weights = (int32_t*) packed_weights + 1;
513 } while (--n != 0);
514 }
515 packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
516
517 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
518 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
519 int32_t ksum = 0;
520 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
521 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
522 if (kc_idx < kc) {
523 const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
524 ksum += (int32_t) kv;
525 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
526 }
527 }
528 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
529 packed_weights = (uint8_t*) packed_weights + kr;
530 }
531 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
532 }
533 }
534}
535
536void xnn_pack_qs8_gemm_io_w(
537 size_t nc,
538 size_t kc,
539 size_t nr,
540 size_t kr,
541 size_t sr,
542 const int8_t* k,
543 const int32_t* b,
544 void* packed_weights,
545 const struct xnn_qs8_packing_params* params)
546{
547 assert(nr >= sr);
548 assert(k != NULL);
549 assert(packed_weights != NULL);
550
551 const size_t skr = sr * kr;
552 const uint32_t izp = (uint32_t) params->input_zero_point;
553 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
554 const size_t nr_block_size = min(nc - nr_block_start, nr);
555 int32_t* packed_b = (int32_t*) packed_weights;
556 if XNN_LIKELY(b != NULL) {
557 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
558 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
559 packed_weights = (int32_t*) packed_weights + 1;
560 }
561 } else {
562 size_t n = nr_block_size;
563 do {
564 unaligned_store_s32(packed_weights, 0);
565 packed_weights = (int32_t*) packed_weights + 1;
566 } while (--n != 0);
567 }
568 packed_weights = (uint32_t*) packed_weights + (nr - nr_block_size);
569
570 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
571 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
572 uint32_t ksum = 0;
573 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
574 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
575 if (kc_idx < kc) {
576 const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
577 ksum += (uint32_t) kv;
578 ((int8_t*) packed_weights)[kr_block_offset] = kv;
579 }
580 }
581 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
582 packed_weights = (int8_t*) packed_weights + kr;
583 }
584 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
585 }
586 }
587}
588
589void xnn_pack_f32_conv_goki_w(
590 size_t g,
591 size_t nc,
592 size_t ks,
593 size_t kc,
594 size_t nr,
595 size_t kr,
596 size_t sr,
597 const float* k,
598 const float* b,
599 float* packed_weights,
600 size_t extra_bytes,
601 const void* params)
602{
603 assert(g != 0);
604 assert(nr >= sr);
605 assert(k != NULL);
606 assert(packed_weights != NULL);
607
608 const size_t skr = sr * kr;
609 do {
610 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
611 const size_t nr_block_size = min(nc - nr_block_start, nr);
612 if XNN_LIKELY(b != NULL) {
613 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
614 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
615 }
616 }
617 packed_weights += nr;
618
619 for (size_t ki = 0; ki < ks; ki++) {
620 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
621 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
622 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
623 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
624 if (kc_idx < kc) {
625 packed_weights[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
626 }
627 }
628 packed_weights += kr;
629 }
630 packed_weights += (nr - nr_block_size) * kr;
631 }
632 }
633 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
634 }
635 k += ks * kc * nc;
636 if XNN_UNPREDICTABLE(b != NULL) {
637 b += nc;
638 }
639 } while (--g != 0);
640}
641
642void xnn_pack_f16_conv_goki_w(
643 size_t g,
644 size_t nc,
645 size_t ks,
646 size_t kc,
647 size_t nr,
648 size_t kr,
649 size_t sr,
650 const uint16_t* k,
651 const uint16_t* b,
652 uint16_t* packed_weights,
653 size_t extra_bytes,
654 const void* params)
655{
656 assert(g != 0);
657 assert(nr >= sr);
658 assert(k != NULL);
659 assert(packed_weights != NULL);
660
661 const size_t skr = sr * kr;
662 do {
663 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
664 const size_t nr_block_size = min(nc - nr_block_start, nr);
665 if XNN_LIKELY(b != NULL) {
666 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
667 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
668 }
669 }
670 packed_weights += nr;
671
672 for (size_t ki = 0; ki < ks; ki++) {
673 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
674 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
675 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
676 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
677 if (kc_idx < kc) {
678 packed_weights[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
679 }
680 }
681 packed_weights += kr;
682 }
683 packed_weights += (nr - nr_block_size) * kr;
684 }
685 }
686 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
687 }
688 k += ks * kc * nc;
689 if XNN_UNPREDICTABLE(b != NULL) {
690 b += nc;
691 }
692 } while (--g != 0);
693}
694
695void xnn_pack_f32_to_f16_conv_goki_w(
696 size_t g,
697 size_t nc,
698 size_t ks,
699 size_t kc,
700 size_t nr,
701 size_t kr,
702 size_t sr,
703 const float* k,
704 const float* b,
705 uint16_t* packed_weights,
706 size_t extra_bytes,
707 const void* params)
708{
709 assert(g != 0);
710 assert(nr >= sr);
711 assert(k != NULL);
712 assert(packed_weights != NULL);
713
714 const size_t skr = sr * kr;
715 do {
716 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
717 const size_t nr_block_size = min(nc - nr_block_start, nr);
718 if XNN_LIKELY(b != NULL) {
719 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
720 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
721 }
722 }
723 packed_weights += nr;
724
725 for (size_t ki = 0; ki < ks; ki++) {
726 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
727 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
728 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
729 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
730 if (kc_idx < kc) {
731 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
732 }
733 }
734 packed_weights += kr;
735 }
736 packed_weights += (nr - nr_block_size) * kr;
737 }
738 }
739 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
740 }
741 k += ks * kc * nc;
742 if XNN_UNPREDICTABLE(b != NULL) {
743 b += nc;
744 }
745 } while (--g != 0);
746}
747
748void xnn_pack_qu8_conv_goki_w(
749 size_t g,
750 size_t nc,
751 size_t ks,
752 size_t kc,
753 size_t nr,
754 size_t kr,
755 size_t sr,
756 const uint8_t* k,
757 const int32_t* b,
758 void* packed_weights,
759 size_t extra_bytes,
760 const struct xnn_qu8_packing_params* params)
761{
762 assert(g != 0);
763 assert(nr >= sr);
764 assert(k != NULL);
765 assert(packed_weights != NULL);
766
767 const size_t skr = sr * kr;
768 const int32_t izp = (int32_t) params->input_zero_point;
769 const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
770 do {
771 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
772 const size_t nr_block_size = min(nc - nr_block_start, nr);
773 int32_t* packed_b = (int32_t*) packed_weights;
774 if XNN_LIKELY(b != NULL) {
775 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
776 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
777 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
778 }
779 } else {
780 size_t n = nr_block_size;
781 do {
782 unaligned_store_s32(packed_weights, bzp);
783 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
784 } while (--n != 0);
785 }
786 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
787
788 for (size_t ki = 0; ki < ks; ki++) {
789 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
790 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
791 int32_t ksum = 0;
792 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
793 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
794 if (kc_idx < kc) {
795 const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
796 ksum += (int32_t) kv;
797 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
798 }
799 }
800 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
801 packed_weights = (uint8_t*) packed_weights + kr;
802 }
803 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
804 }
805 }
806 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
807 }
808 k += ks * kc * nc;
809 if XNN_UNPREDICTABLE(b != NULL) {
810 b += nc;
811 }
812 } while (--g != 0);
813}
814
815void xnn_pack_qs8_conv_goki_w(
816 size_t g,
817 size_t nc,
818 size_t ks,
819 size_t kc,
820 size_t nr,
821 size_t kr,
822 size_t sr,
823 const int8_t* k,
824 const int32_t* b,
825 void* packed_weights,
826 size_t extra_bytes,
827 const struct xnn_qs8_packing_params* params)
828{
829 assert(g != 0);
830 assert(nr >= sr);
831 assert(k != NULL);
832 assert(packed_weights != NULL);
833
834 const size_t skr = sr * kr;
835 const uint32_t izp = (int32_t) params->input_zero_point;
836 do {
837 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
838 const size_t nr_block_size = min(nc - nr_block_start, nr);
839 int32_t* packed_b = (int32_t*) packed_weights;
840 if XNN_LIKELY(b != NULL) {
841 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
842 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
843 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
844 }
845 } else {
846 size_t n = nr_block_size;
847 do {
848 unaligned_store_s32(packed_weights, 0);
849 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
850 } while (--n != 0);
851 }
852 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
853
854 for (size_t ki = 0; ki < ks; ki++) {
855 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
856 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
857 uint32_t ksum = 0;
858 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
859 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
860 if (kc_idx < kc) {
861 const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
862 ksum += (uint32_t) kv;
863 ((int8_t*) packed_weights)[kr_block_offset] = kv;
864 }
865 }
866 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
867 packed_weights = (int8_t*) packed_weights + kr;
868 }
869 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
870 }
871 }
872 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
873 }
874 k += ks * kc * nc;
875 if XNN_UNPREDICTABLE(b != NULL) {
876 b += nc;
877 }
878 } while (--g != 0);
879}
880
881void xnn_pack_f32_conv_kgo_w(
882 size_t g,
883 size_t nc,
884 size_t ks,
885 size_t nr,
886 size_t kr,
887 size_t sr,
888 const float* k,
889 const float* b,
890 float* packed_weights,
891 size_t extra_bytes,
892 const void* params)
893{
894 assert(g != 0);
895 assert(nr >= sr);
896 assert(k != NULL);
897 assert(packed_weights != NULL);
898
899 for (size_t i = 0; i < g; i++) {
900 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
901 const size_t nr_block_size = min(nc - nr_block_start, nr);
902 if XNN_LIKELY(b != NULL) {
903 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
904 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
905 }
906 }
907 packed_weights += nr;
908
909 for (size_t ki = 0; ki < ks; ki++) {
910 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
911 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
912 packed_weights[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
913 }
914 packed_weights += nr * kr;
915 }
916 }
917 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
918 }
919 k += nc;
920 if XNN_UNPREDICTABLE(b != NULL) {
921 b += nc;
922 }
923 }
924}
925
926void xnn_pack_f16_conv_kgo_w(
927 size_t g,
928 size_t nc,
929 size_t ks,
930 size_t nr,
931 size_t kr,
932 size_t sr,
933 const uint16_t* k,
934 const uint16_t* b,
935 uint16_t* packed_weights,
936 size_t extra_bytes,
937 const void* params)
938{
939 assert(g != 0);
940 assert(nr >= sr);
941 assert(k != NULL);
942 assert(packed_weights != NULL);
943
944 for (size_t i = 0; i < g; i++) {
945 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
946 const size_t nr_block_size = min(nc - nr_block_start, nr);
947 if XNN_LIKELY(b != NULL) {
948 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
949 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
950 }
951 }
952 packed_weights += nr;
953
954 for (size_t ki = 0; ki < ks; ki++) {
955 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
956 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
957 packed_weights[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
958 }
959 packed_weights += nr * kr;
960 }
961 }
962 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
963 }
964 k += nc;
965 if XNN_UNPREDICTABLE(b != NULL) {
966 b += nc;
967 }
968 }
969}
970
971void xnn_pack_f32_to_f16_conv_kgo_w(
972 size_t g,
973 size_t nc,
974 size_t ks,
975 size_t nr,
976 size_t kr,
977 size_t sr,
978 const float* k,
979 const float* b,
980 uint16_t* packed_weights,
981 size_t extra_bytes,
982 const void* params)
983{
984 assert(g != 0);
985 assert(nr >= sr);
986 assert(k != NULL);
987 assert(packed_weights != NULL);
988
989 for (size_t i = 0; i < g; i++) {
990 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
991 const size_t nr_block_size = min(nc - nr_block_start, nr);
992 if XNN_LIKELY(b != NULL) {
993 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
994 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
995 }
996 }
997 packed_weights += nr;
998
999 for (size_t ki = 0; ki < ks; ki++) {
1000 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1001 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1002 packed_weights[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
1003 }
1004 packed_weights += nr * kr;
1005 }
1006 }
1007 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1008 }
1009 k += nc;
1010 if XNN_UNPREDICTABLE(b != NULL) {
1011 b += nc;
1012 }
1013 }
1014}
1015
1016void xnn_pack_qu8_conv_kgo_w(
1017 size_t g,
1018 size_t nc,
1019 size_t ks,
1020 size_t nr,
1021 size_t kr,
1022 size_t sr,
1023 const uint8_t* k,
1024 const int32_t* b,
1025 void* packed_weights,
1026 size_t extra_bytes,
1027 const struct xnn_qu8_packing_params* params)
1028{
1029 assert(g != 0);
1030 assert(nr >= sr);
1031 assert(k != NULL);
1032 assert(packed_weights != NULL);
1033
1034 const int32_t izp = (int32_t) params->input_zero_point;
1035 const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
1036 for (size_t i = 0; i < g; i++) {
1037 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1038 const size_t nr_block_size = min(nc - nr_block_start, nr);
1039 int32_t* packed_b = (int32_t*) packed_weights;
1040 if XNN_LIKELY(b != NULL) {
1041 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1042 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1043 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1044 }
1045 } else {
1046 size_t n = nr_block_size;
1047 do {
1048 unaligned_store_s32(packed_weights, bzp);
1049 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1050 } while (--n != 0);
1051 }
1052 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1053
1054 for (size_t ki = 0; ki < ks; ki++) {
1055 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1056 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1057 const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1058 ((uint8_t*) packed_weights)[nr_block_offset * kr] = kv;
1059 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - (int32_t) kv * izp);
1060 }
1061 packed_weights = (uint8_t*) packed_weights + nr * kr;
1062 }
1063 }
1064 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1065 }
1066 k += nc;
1067 if XNN_UNPREDICTABLE(b != NULL) {
1068 b += nc;
1069 }
1070 }
1071}
1072
1073void xnn_pack_qs8_conv_kgo_w(
1074 size_t g,
1075 size_t nc,
1076 size_t ks,
1077 size_t nr,
1078 size_t kr,
1079 size_t sr,
1080 const int8_t* k,
1081 const int32_t* b,
1082 void* packed_weights,
1083 size_t extra_bytes,
1084 const struct xnn_qs8_packing_params* params)
1085{
1086 assert(g != 0);
1087 assert(nr >= sr);
1088 assert(k != NULL);
1089 assert(packed_weights != NULL);
1090
1091 const uint32_t izp = (uint32_t) params->input_zero_point;
1092 for (size_t i = 0; i < g; i++) {
1093 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1094 const size_t nr_block_size = min(nc - nr_block_start, nr);
1095 int32_t* packed_b = (int32_t*) packed_weights;
1096 if XNN_LIKELY(b != NULL) {
1097 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1098 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1099 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1100 }
1101 } else {
1102 size_t n = nr_block_size;
1103 do {
1104 unaligned_store_s32(packed_weights, 0);
1105 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1106 } while (--n != 0);
1107 }
1108 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1109
1110 for (size_t ki = 0; ki < ks; ki++) {
1111 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1112 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1113 const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1114 ((int8_t*) packed_weights)[nr_block_offset * kr] = kv;
1115 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - (uint32_t) kv * izp);
1116 }
1117 packed_weights = (int8_t*) packed_weights + nr * kr;
1118 }
1119 }
1120 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1121 }
1122 k += nc;
1123 if XNN_UNPREDICTABLE(b != NULL) {
1124 b += nc;
1125 }
1126 }
1127}
1128
1129void xnn_pack_f32_deconv_goki_w(
1130 size_t g,
1131 size_t nc,
1132 size_t kh,
1133 size_t kw,
1134 size_t kc,
1135 size_t sh,
1136 size_t sw,
1137 size_t nr,
1138 size_t kr,
1139 size_t sr,
1140 const float* k,
1141 const float* b,
1142 float* packed_weights,
1143 struct subconvolution_params* subconv_params,
1144 const void* params)
1145{
1146 assert(g != 0);
1147 assert(nr >= sr);
1148 assert(k != NULL);
1149 assert(packed_weights != NULL);
1150
1151 const size_t skr = sr * kr;
1152 for (size_t i = 0; i < g; i++) {
1153 for (size_t oy = 0; oy < sh; oy++) {
1154 for (size_t ox = 0; ox < sw; ox++) {
1155 if (i == 0) {
1156 (*subconv_params++).weights = packed_weights;
1157 }
1158 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1159 const size_t nr_block_size = min(nc - nr_block_start, nr);
1160 if XNN_LIKELY(b != NULL) {
1161 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1162 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1163 }
1164 }
1165 packed_weights += nr;
1166 for (size_t ky = oy; ky < kh; ky += sh) {
1167 for (size_t kx = ox; kx < kw; kx += sw) {
1168 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1169 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1170 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1171 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1172 if (kc_idx < kc) {
1173 packed_weights[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1174 }
1175 }
1176 packed_weights += kr;
1177 }
1178 packed_weights += (nr - nr_block_size) * kr;
1179 }
1180 }
1181 }
1182 }
1183 }
1184 }
1185 k += kh * kw * kc * nc;
1186 if XNN_UNPREDICTABLE(b != NULL) {
1187 b += nc;
1188 }
1189 }
1190}
1191
1192void xnn_pack_f16_deconv_goki_w(
1193 size_t g,
1194 size_t nc,
1195 size_t kh,
1196 size_t kw,
1197 size_t kc,
1198 size_t sh,
1199 size_t sw,
1200 size_t nr,
1201 size_t kr,
1202 size_t sr,
1203 const uint16_t* k,
1204 const uint16_t* b,
1205 uint16_t* packed_weights,
1206 struct subconvolution_params* subconv_params,
1207 const void* params)
1208{
1209 assert(g != 0);
1210 assert(nr >= sr);
1211 assert(k != NULL);
1212 assert(packed_weights != NULL);
1213
1214 const size_t skr = sr * kr;
1215 for (size_t i = 0; i < g; i++) {
1216 for (size_t oy = 0; oy < sh; oy++) {
1217 for (size_t ox = 0; ox < sw; ox++) {
1218 if (i == 0) {
1219 (*subconv_params++).weights = packed_weights;
1220 }
1221 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1222 const size_t nr_block_size = min(nc - nr_block_start, nr);
1223 if XNN_LIKELY(b != NULL) {
1224 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1225 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1226 }
1227 }
1228 packed_weights += nr;
1229 for (size_t ky = oy; ky < kh; ky += sh) {
1230 for (size_t kx = ox; kx < kw; kx += sw) {
1231 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1232 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1233 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1234 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1235 if (kc_idx < kc) {
1236 packed_weights[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1237 }
1238 }
1239 packed_weights += kr;
1240 }
1241 packed_weights += (nr - nr_block_size) * kr;
1242 }
1243 }
1244 }
1245 }
1246 }
1247 }
1248 k += kh * kw * kc * nc;
1249 if XNN_UNPREDICTABLE(b != NULL) {
1250 b += nc;
1251 }
1252 }
1253}
1254
1255void xnn_pack_f32_to_f16_deconv_goki_w(
1256 size_t g,
1257 size_t nc,
1258 size_t kh,
1259 size_t kw,
1260 size_t kc,
1261 size_t sh,
1262 size_t sw,
1263 size_t nr,
1264 size_t kr,
1265 size_t sr,
1266 const float* k,
1267 const float* b,
1268 uint16_t* packed_weights,
1269 struct subconvolution_params* subconv_params,
1270 const void* params)
1271{
1272 assert(g != 0);
1273 assert(nr >= sr);
1274 assert(k != NULL);
1275 assert(packed_weights != NULL);
1276
1277 const size_t skr = sr * kr;
1278 for (size_t i = 0; i < g; i++) {
1279 for (size_t oy = 0; oy < sh; oy++) {
1280 for (size_t ox = 0; ox < sw; ox++) {
1281 if (i == 0) {
1282 (*subconv_params++).weights = packed_weights;
1283 }
1284 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1285 const size_t nr_block_size = min(nc - nr_block_start, nr);
1286 if XNN_LIKELY(b != NULL) {
1287 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1288 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
1289 }
1290 }
1291 packed_weights += nr;
1292 for (size_t ky = oy; ky < kh; ky += sh) {
1293 for (size_t kx = ox; kx < kw; kx += sw) {
1294 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1295 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1296 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1297 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1298 if (kc_idx < kc) {
1299 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx]);
1300 }
1301 }
1302 packed_weights += kr;
1303 }
1304 packed_weights += (nr - nr_block_size) * kr;
1305 }
1306 }
1307 }
1308 }
1309 }
1310 }
1311 k += kh * kw * kc * nc;
1312 if XNN_UNPREDICTABLE(b != NULL) {
1313 b += nc;
1314 }
1315 }
1316}
1317
1318void xnn_pack_qs8_deconv_goki_w(
1319 size_t g,
1320 size_t nc,
1321 size_t kh,
1322 size_t kw,
1323 size_t kc,
1324 size_t sh,
1325 size_t sw,
1326 size_t nr,
1327 size_t kr,
1328 size_t sr,
1329 const int8_t* k,
1330 const int32_t* b,
1331 void* packed_weights,
1332 struct subconvolution_params* subconv_params,
1333 const struct xnn_qs8_packing_params* params)
1334{
1335 assert(g != 0);
1336 assert(nr >= sr);
1337 assert(k != NULL);
1338 assert(packed_weights != NULL);
1339
1340 const size_t skr = sr * kr;
1341 const uint32_t izp = (uint32_t) params->input_zero_point;
1342 for (size_t i = 0; i < g; i++) {
1343 for (size_t oy = 0; oy < sh; oy++) {
1344 for (size_t ox = 0; ox < sw; ox++) {
1345 if (i == 0) {
1346 (*subconv_params++).weights = packed_weights;
1347 }
1348 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1349 const size_t nr_block_size = min(nc - nr_block_start, nr);
1350 int32_t* packed_b = (int32_t*) packed_weights;
1351 if XNN_LIKELY(b != 0) {
1352 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1353 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1354 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1355 }
1356 } else {
1357 size_t n = nr_block_size;
1358 do {
1359 unaligned_store_s32(packed_weights, 0);
1360 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1361 } while (--n != 0);
1362 }
1363 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1364 for (size_t ky = oy; ky < kh; ky += sh) {
1365 for (size_t kx = ox; kx < kw; kx += sw) {
1366 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1367 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1368 uint32_t ksum = 0;
1369 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1370 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1371 if (kc_idx < kc) {
1372 const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1373 ksum += (uint32_t) kv;
1374 ((int8_t*) packed_weights)[kr_block_offset] = kv;
1375 }
1376 }
1377 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
1378 packed_weights = (int8_t*) packed_weights + kr;
1379 }
1380 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
1381 }
1382 }
1383 }
1384 }
1385 }
1386 }
1387 k += kh * kw * kc * nc;
1388 if XNN_UNPREDICTABLE(b != NULL) {
1389 b += nc;
1390 }
1391 }
1392}
1393
1394void xnn_pack_qu8_deconv_goki_w(
1395 size_t g,
1396 size_t nc,
1397 size_t kh,
1398 size_t kw,
1399 size_t kc,
1400 size_t sh,
1401 size_t sw,
1402 size_t nr,
1403 size_t kr,
1404 size_t sr,
1405 const uint8_t* k,
1406 const int32_t* b,
1407 void* packed_weights,
1408 struct subconvolution_params* subconv_params,
1409 const struct xnn_qu8_packing_params* params)
1410{
1411 assert(g != 0);
1412 assert(nr >= sr);
1413 assert(k != NULL);
1414 assert(packed_weights != NULL);
1415
1416 const size_t skr = sr * kr;
1417 const int32_t izp = (int32_t) params->input_zero_point;
1418 const int32_t kzp = (int32_t) params->kernel_zero_point;
1419 for (size_t i = 0; i < g; i++) {
1420 for (size_t oy = 0; oy < sh; oy++) {
1421 for (size_t ox = 0; ox < sw; ox++) {
1422 if (i == 0) {
1423 (*subconv_params++).weights = packed_weights;
1424 }
1425 const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
1426 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1427 const size_t nr_block_size = min(nc - nr_block_start, nr);
1428 int32_t* packed_b = (int32_t*) packed_weights;
1429 if XNN_LIKELY(b != 0) {
1430 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1431 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1432 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1433 }
1434 } else {
1435 size_t n = nr_block_size;
1436 do {
1437 unaligned_store_s32(packed_weights, bzp);
1438 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1439 } while (--n != 0);
1440 }
1441 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1442 for (size_t ky = oy; ky < kh; ky += sh) {
1443 for (size_t kx = ox; kx < kw; kx += sw) {
1444 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1445 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1446 int32_t ksum = 0;
1447 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1448 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1449 if (kc_idx < kc) {
1450 const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1451 ksum += (int32_t) kv;
1452 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
1453 }
1454 }
1455 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
1456 packed_weights = (uint8_t*) packed_weights + kr;
1457 }
1458 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
1459 }
1460 }
1461 }
1462 }
1463 }
1464 }
1465 k += kh * kw * kc * nc;
1466 if XNN_UNPREDICTABLE(b != NULL) {
1467 b += nc;
1468 }
1469 }
1470}
1471
1472void xnn_pack_f32_dwconv_ghw_w(
1473 size_t primary_tile,
1474 size_t h,
1475 size_t w,
1476 size_t c,
1477 size_t cr,
1478 const float* k,
1479 const float* b,
1480 float* packed_weights,
1481 size_t extra_bytes,
1482 const void* params)
1483{
1484 assert(primary_tile >= h * w);
1485 xnn_pack_f32_dwconv_multipass_ghw_w(
1486 primary_tile,
1487 /*middle_pass_tile=*/0,
1488 /*last_pass_tile=*/0,
1489 h,
1490 w,
1491 c,
1492 cr,
1493 cr,
1494 cr,
1495 k,
1496 b,
1497 packed_weights,
1498 extra_bytes,
1499 params);
1500}
1501
1502// Helper function to advance x and y indices.
1503inline static void advance_x_y(size_t h, size_t* x, size_t* y) {
1504 if (++*y == h) {
1505 *y = 0;
1506 ++*x;
1507 }
1508}
1509
1510void xnn_pack_f32_dwconv_multipass_ghw_w(
1511 size_t first_pass_tile,
1512 size_t middle_pass_tile,
1513 size_t last_pass_tile,
1514 size_t h,
1515 size_t w,
1516 size_t c,
1517 size_t channel_tile,
1518 size_t channel_subtile,
1519 size_t channel_round,
1520 const float* k,
1521 const float* b,
1522 float* packed_weights,
1523 size_t extra_bytes,
1524 const void* params)
1525{
1526 assert(k != NULL);
1527 assert(packed_weights != NULL);
1528 size_t kernel_size = h * w;
1529 if (middle_pass_tile == 0) {
1530 // Uni-pass DWCONV.
1531 assert(last_pass_tile == 0);
1532 } else {
1533 // Multi-pass DWCONV.
1534 assert(kernel_size > first_pass_tile);
1535 }
1536
1537 // Stores the x and y index that should be processed next.
1538 size_t processed_x = 0;
1539 size_t processed_y = 0;
1540 size_t x = 0;
1541 size_t y = 0;
1542 // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
1543 const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
1544
1545 // Pack in blocks of channel_tile, then in blocks of channel_subtile.
1546 {
1547 size_t cr_block_start = 0;
1548 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1549 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1550 if XNN_LIKELY(b != NULL) {
1551 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1552 *packed_weights++ = b[cr_block_start + cr_block_offset];
1553 }
1554 } else {
1555 size_t n = cr_block_size;
1556 do {
1557 *packed_weights++ = 0.0f;
1558 } while (--n != 0);
1559 }
1560 packed_weights += channel_tile - cr_block_size;
1561
1562 x = 0;
1563 y = 0;
1564 // kernel_size can be less than the first_pass_tile, in this case, pack up
1565 // to the smaller of the two.
1566 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1567 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1568 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1569 *packed_weights++ = kv;
1570 }
1571 packed_weights += channel_tile - cr_block_size;
1572 advance_x_y(h, &x, &y);
1573 }
1574 // And make sure to skip weights if kernel_size < first_pass_tile.
1575 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
1576 }
1577
1578 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1579 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1580 if XNN_LIKELY(b != NULL) {
1581 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1582 *packed_weights++ = b[cr_block_start + cr_block_offset];
1583 }
1584 } else {
1585 size_t n = cr_block_size;
1586 do {
1587 *packed_weights++ = 0.0f;
1588 } while (--n != 0);
1589 }
1590 packed_weights += channel_subtile - cr_block_size;
1591
1592 x = 0;
1593 y = 0;
1594 // kernel_size can be less than the first_pass_tile, in this case, pack up
1595 // to the smaller of the two.
1596 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1597 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1598 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1599 *packed_weights++ = kv;
1600 }
1601 packed_weights += channel_subtile - cr_block_size;
1602 advance_x_y(h, &x, &y);
1603 }
1604 // And make sure to skip weights if kernel_size < first_pass_tile.
1605 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
1606 }
1607 }
1608
1609 if (kernel_size <= first_pass_tile) {
1610 return;
1611 }
1612
1613 kernel_size -= first_pass_tile;
1614
1615 processed_x = x;
1616 processed_y = y;
1617
1618 // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
1619 // middle_pass_tile * cr weights.
1620 for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
1621 assert(kernel_size >= middle_pass_tile);
1622 size_t cr_block_start = 0;
1623 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1624 x = processed_x;
1625 y = processed_y;
1626 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1627 for (size_t j = 0; j < middle_pass_tile; j++) {
1628 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1629 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1630 *packed_weights++ = kv;
1631 }
1632 packed_weights += channel_tile - cr_block_size;
1633 advance_x_y(h, &x, &y);
1634 }
1635 }
1636 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1637 x = processed_x;
1638 y = processed_y;
1639 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1640 for (size_t j = 0; j < middle_pass_tile; j++) {
1641 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1642 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1643 *packed_weights++ = kv;
1644 }
1645 packed_weights += channel_subtile - cr_block_size;
1646 advance_x_y(h, &x, &y);
1647 }
1648 }
1649 processed_x = x;
1650 processed_y = y;
1651 }
1652
1653 // Last pass.
1654 {
1655 assert(kernel_size <= last_pass_tile);
1656 size_t cr_block_start = 0;
1657 for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
1658 // Last pass does not pack to rounded c, since it handles remainder.
1659 x = processed_x;
1660 y = processed_y;
1661 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1662 for (size_t i = 0; i < kernel_size; i++) {
1663 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1664 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1665 *packed_weights++ = kv;
1666 }
1667 packed_weights += channel_tile - cr_block_size;
1668 advance_x_y(h, &x, &y);
1669 }
1670 // Pad so that we can always read last_pass_tile weights in the last pass.
1671 packed_weights += (last_pass_tile - kernel_size) * channel_tile;
1672 // TODO(zhin): support extra bytes for channel_tile and subtile.
1673 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
1674 }
1675 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1676 // Last pass does not pack to rounded c, since it handles remainder.
1677 x = processed_x;
1678 y = processed_y;
1679 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1680 for (size_t i = 0; i < kernel_size; i++) {
1681 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1682 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1683 *packed_weights++ = kv;
1684 }
1685 packed_weights += channel_subtile - cr_block_size;
1686 advance_x_y(h, &x, &y);
1687 }
1688 // Pad so that we can always read last_pass_tile weights in the last pass.
1689 packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
1690 // TODO(zhin): support extra bytes for channel_tile and subtile.
1691 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
1692 }
1693 }
1694}
1695
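// Packs fp16 depthwise-convolution weights laid out as [groups, height, width].
// For each block of cr channels: cr bias values (zeros when b is NULL), then one
// cr-wide row per kernel tap, with taps visited x-outer / y-inner; the tap count
// is padded up to primary_tile, and each channel block is followed by extra_bytes
// of padding.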
1696void xnn_pack_f16_dwconv_ghw_w(
1697 size_t primary_tile,
1698 size_t h,
1699 size_t w,
1700 size_t c,
1701 size_t cr,
1702 const uint16_t* k,
1703 const uint16_t* b,
1704 uint16_t* packed_weights,
1705 size_t extra_bytes,
1706 const void* params)
1707{
1708 assert(k != NULL);
1709 assert(packed_weights != NULL);
1710
1711 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1712 const size_t cr_block_size = min(c - cr_block_start, cr);
1713 if XNN_LIKELY(b != NULL) {
1714 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1715 *packed_weights++ = b[cr_block_start + cr_block_offset];
1716 }
1717 } else {
1718 size_t n = cr_block_size;
1719 do {
1720 *packed_weights++ = 0;
1721 } while (--n != 0);
1722 }
1723 packed_weights += cr - cr_block_size;
1724 for (size_t x = 0; x < w; x++) {
1725 for (size_t y = 0; y < h; y++) {
1726 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1727 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1728 *packed_weights++ = kv;
1729 }
1730 packed_weights += cr - cr_block_size;
1731 }
1732 }
1733 packed_weights += (primary_tile - (h * w)) * cr_block_size;
1734 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1735 }
1736}
1737
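// Same layout as xnn_pack_f16_dwconv_ghw_w, but the fp32 bias and kernel values
// are converted to fp16 on the fly with fp16_ieee_from_fp32_value.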
1738void xnn_pack_f32_to_f16_dwconv_ghw_w(
1739 size_t primary_tile,
1740 size_t h,
1741 size_t w,
1742 size_t c,
1743 size_t cr,
1744 const float* k,
1745 const float* b,
1746 uint16_t* packed_weights,
1747 size_t extra_bytes,
1748 const void* params)
1749{
1750 assert(k != NULL);
1751 assert(packed_weights != NULL);
1752
1753 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1754 const size_t cr_block_size = min(c - cr_block_start, cr);
1755 if XNN_LIKELY(b != NULL) {
1756 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1757 *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1758 }
1759 } else {
1760 size_t n = cr_block_size;
1761 do {
1762 *packed_weights++ = 0;
1763 } while (--n != 0);
1764 }
1765 packed_weights += cr - cr_block_size;
1766 for (size_t x = 0; x < w; x++) {
1767 for (size_t y = 0; y < h; y++) {
1768 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1769 const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
1770 *packed_weights++ = kv;
1771 }
1772 packed_weights += cr - cr_block_size;
1773 }
1774 }
1775 packed_weights += (primary_tile - (h * w)) * cr_block_size;
1776 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1777 }
1778}
1779
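// QU8 depthwise packing in GHW layout. The bias slots are pre-adjusted with the
// weight-only terms of the zero-point expansion: each slot starts at
// b + h * w * input_zero_point * kernel_zero_point and has
// input_zero_point * kv subtracted for every packed kernel value kv, so only
// input-dependent corrections remain to be applied at run time. The raw uint8_t
// weights are stored unmodified after the bias block.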
1780void xnn_pack_qu8_dwconv_ghw_w(
1781 size_t primary_tile,
1782 size_t h,
1783 size_t w,
1784 size_t c,
1785 size_t cr,
1786 const uint8_t* k,
1787 const int32_t* b,
1788 void* packed_weights,
1789 size_t extra_bytes,
1790 const struct xnn_qu8_packing_params* params)
1791{
1792 assert(k != NULL);
1793 assert(packed_weights != NULL);
1794
1795 const int32_t izp = (int32_t) params->input_zero_point;
1796 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
1797 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1798 const size_t cr_block_size = min(c - cr_block_start, cr);
1799 int32_t* packed_b = (int32_t*) packed_weights;
1800 if XNN_LIKELY(b != NULL) {
1801 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1802 unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
1803 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1804 }
1805 } else {
1806 size_t n = cr_block_size;
1807 do {
1808 unaligned_store_s32(packed_weights, boff);
1809 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1810 } while (--n != 0);
1811 }
1812 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
1813 for (size_t x = 0; x < w; x++) {
1814 for (size_t y = 0; y < h; y++) {
1815 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1816 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1817 unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
1818 *((uint8_t*) packed_weights) = kv;
1819 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
1820 }
1821 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(uint8_t));
1822 }
1823 }
1824 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
1825 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1826 }
1827}
1828
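// QS8 depthwise packing in GHW layout. Like the QU8 variant, but the kernel is
// treated as symmetric (no kernel zero point): each bias slot is reduced by
// input_zero_point times the sum of that channel's weights, computed in uint32_t
// arithmetic so any overflow wraps. The raw int8_t weights follow the bias block.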
1829void xnn_pack_qs8_dwconv_ghw_w(
1830 size_t primary_tile,
1831 size_t h,
1832 size_t w,
1833 size_t c,
1834 size_t cr,
1835 const int8_t* k,
1836 const int32_t* b,
1837 void* packed_weights,
1838 size_t extra_bytes,
1839 const struct xnn_qs8_packing_params* params)
1840{
1841 assert(k != NULL);
1842 assert(packed_weights != NULL);
1843
1844 const uint32_t izp = (uint32_t) params->input_zero_point;
1845 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1846 const size_t cr_block_size = min(c - cr_block_start, cr);
1847 int32_t* packed_b = (int32_t*) packed_weights;
1848 if XNN_LIKELY(b != NULL) {
1849 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1850 unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
1851 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1852 }
1853 } else {
1854 size_t n = cr_block_size;
1855 do {
1856 unaligned_store_s32(packed_weights, 0);
1857 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1858 } while (--n != 0);
1859 }
1860 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
1861 for (size_t x = 0; x < w; x++) {
1862 for (size_t y = 0; y < h; y++) {
1863 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1864 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1865 unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
1866 *((int8_t*) packed_weights) = kv;
1867 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
1868 }
1869 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int8_t));
1870 }
1871 }
1872 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
1873 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1874 }
1875}
1876
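// Single-pass HWG packing, implemented as a wrapper over the multipass packer:
// middle_pass_tile and last_pass_tile are zero, and channel_tile, channel_subtile
// and channel_round all equal cr, so the whole kernel is emitted in the first pass.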
1877void xnn_pack_f32_dwconv_hwg_w(
1878 size_t primary_tile,
1879 size_t h,
1880 size_t w,
1881 size_t c,
1882 size_t cr,
1883 const float* k,
1884 const float* b,
1885 float* packed_weights,
1886 size_t extra_bytes,
1887 const void* params)
1888{
1889 assert(primary_tile >= h * w);
1890 xnn_pack_f32_dwconv_multipass_hwg_w(
1891 primary_tile,
1892 /*middle_pass_tile=*/0,
1893 /*last_pass_tile=*/0,
1894 h,
1895 w,
1896 c,
1897 cr,
1898 cr,
1899 cr,
1900 k,
1901 b,
1902 packed_weights,
1903 extra_bytes,
1904 params);
1905}
1906
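// Multipass HWG packing: the kernel taps are split into a first pass of up to
// first_pass_tile taps, zero or more middle passes of middle_pass_tile taps, and
// a last pass of up to last_pass_tile taps. Within each pass, channels are packed
// in blocks of channel_tile up to tiled_c and in blocks of channel_subtile for the
// remainder. k is indexed as [height, width, channels], with y advancing fastest
// across taps.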
1907void xnn_pack_f32_dwconv_multipass_hwg_w(
1908 size_t first_pass_tile,
1909 size_t middle_pass_tile,
1910 size_t last_pass_tile,
1911 size_t h,
1912 size_t w,
1913 size_t c,
1914 size_t channel_tile,
1915 size_t channel_subtile,
1916 size_t channel_round,
1917 const float* k,
1918 const float* b,
1919 float* packed_weights,
1920 size_t extra_bytes,
1921 const void* params)
1922{
1923 assert(k != NULL);
1924 assert(packed_weights != NULL);
1925 size_t kernel_size = h * w;
1926 if (middle_pass_tile == 0) {
1927 // Uni-pass DWCONV.
1928 assert(last_pass_tile == 0);
1929 } else {
1930 // Multi-pass DWCONV.
1931 assert(kernel_size > first_pass_tile);
1932 }
1933
1934  // Stores the x and y indices that should be processed next.
1935 size_t processed_x = 0;
1936 size_t processed_y = 0;
1937 size_t x = 0;
1938 size_t y = 0;
1939  // The first and middle passes pack in blocks of channel_tile up to tiled_c, then in blocks of channel_subtile.
1940 const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
1941
1942 // Pack in blocks of channel_tile, then in blocks of channel_subtile.
1943 {
1944 size_t cr_block_start = 0;
1945 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1946 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1947 if XNN_LIKELY(b != NULL) {
1948 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1949 *packed_weights++ = b[cr_block_start + cr_block_offset];
1950 }
1951 } else {
1952 size_t n = cr_block_size;
1953 do {
1954 *packed_weights++ = 0.0f;
1955 } while (--n != 0);
1956 }
1957 packed_weights += channel_tile - cr_block_size;
1958
1959 x = processed_x;
1960 y = processed_y;
1961      // kernel_size can be less than first_pass_tile; in this case, pack up
1962      // to the smaller of the two.
1963 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1964 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1965 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1966 *packed_weights++ = kv;
1967 }
1968 packed_weights += channel_tile - cr_block_size;
1969 if (++y == h) {
1970 y = 0;
1971 x++;
1972 }
1973 }
1974 // And make sure to skip weights if kernel_size < first_pass_tile.
1975 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
1976 }
1977 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1978 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1979 if XNN_LIKELY(b != NULL) {
1980 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1981 *packed_weights++ = b[cr_block_start + cr_block_offset];
1982 }
1983 } else {
1984 size_t n = cr_block_size;
1985 do {
1986 *packed_weights++ = 0.0f;
1987 } while (--n != 0);
1988 }
1989 packed_weights += channel_subtile - cr_block_size;
1990
1991 x = processed_x;
1992 y = processed_y;
1993      // kernel_size can be less than first_pass_tile; in this case, pack up
1994      // to the smaller of the two.
1995 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1996 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1997 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1998 *packed_weights++ = kv;
1999 }
2000 packed_weights += channel_subtile - cr_block_size;
2001 if (++y == h) {
2002 y = 0;
2003 x++;
2004 }
2005 }
2006 // And make sure to skip weights if kernel_size < first_pass_tile.
2007 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2008 }
2009 }
2010
2011 if (kernel_size <= first_pass_tile) {
2012 return;
2013 }
2014
2015 kernel_size -= first_pass_tile;
2016
2017 processed_x = x;
2018 processed_y = y;
2019
2020  // Middle passes: (kernel_size / middle_pass_tile) blocks, each block holding
2021  // middle_pass_tile * cr weights.
2022 for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2023 assert(kernel_size >= middle_pass_tile);
2024 size_t cr_block_start = 0;
2025 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2026 x = processed_x;
2027 y = processed_y;
2028 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2029 for (size_t j = 0; j < middle_pass_tile; j++) {
2030 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2031 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2032 *packed_weights++ = kv;
2033 }
2034 packed_weights += channel_tile - cr_block_size;
2035 if (++y == h) {
2036 y = 0;
2037 x++;
2038 }
2039 }
2040 }
2041 for (; cr_block_start < c; cr_block_start += channel_subtile) {
2042 x = processed_x;
2043 y = processed_y;
2044 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2045 for (size_t j = 0; j < middle_pass_tile; j++) {
2046 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2047 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2048 *packed_weights++ = kv;
2049 }
2050 packed_weights += channel_subtile - cr_block_size;
2051 if (++y == h) {
2052 y = 0;
2053 x++;
2054 }
2055 }
2056 }
2057 processed_x = x;
2058 processed_y = y;
2059 }
2060
2061 // Last pass.
2062 {
2063 assert(kernel_size <= last_pass_tile);
2064 size_t cr_block_start = 0;
2065 for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2066      // Last pass does not pack to rounded c, since it handles remainder.
2067 x = processed_x;
2068 y = processed_y;
2069 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2070 for (size_t i = 0; i < kernel_size; i++) {
2071 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2072 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2073 *packed_weights++ = kv;
2074 }
2075 packed_weights += channel_tile - cr_block_size;
2076 if (++y == h) {
2077 y = 0;
2078 x++;
2079 }
2080 }
2081 // Pad so that we can always read last_pass_tile weights in the last pass.
2082 packed_weights += (last_pass_tile - kernel_size) * channel_tile;
2083 // TODO(zhin): support extra bytes for channel_tile and subtile.
2084 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
2085 }
2086 for (; cr_block_start < c; cr_block_start += channel_subtile) {
2087 x = processed_x;
2088 y = processed_y;
2089 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2090 for (size_t i = 0; i < kernel_size; i++) {
2091 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2092 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2093 *packed_weights++ = kv;
2094 }
2095 packed_weights += channel_subtile - cr_block_size;
2096 if (++y == h) {
2097 y = 0;
2098 x++;
2099 }
2100 }
2101 // Pad so that we can always read last_pass_tile weights in the last pass.
2102 packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
2103 // TODO(zhin): support extra bytes for channel_tile and subtile.
2104 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
2105 }
2106 }
2107}
2108
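// HWG counterpart of xnn_pack_f16_dwconv_ghw_w: identical packed layout, but the
// kernel is indexed as [height, width, channels].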
2109void xnn_pack_f16_dwconv_hwg_w(
2110 size_t primary_tile,
2111 size_t h,
2112 size_t w,
2113 size_t c,
2114 size_t cr,
2115 const uint16_t* k,
2116 const uint16_t* b,
2117 uint16_t* packed_weights,
2118 size_t extra_bytes,
2119 const void* params)
2120{
2121 assert(k != NULL);
2122 assert(packed_weights != NULL);
2123
2124 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2125 const size_t cr_block_size = min(c - cr_block_start, cr);
2126 if XNN_LIKELY(b != NULL) {
2127 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2128 *packed_weights++ = b[cr_block_start + cr_block_offset];
2129 }
2130 } else {
2131 size_t n = cr_block_size;
2132 do {
2133 *packed_weights++ = 0;
2134 } while (--n != 0);
2135 }
2136 packed_weights += cr - cr_block_size;
2137 for (size_t x = 0; x < w; x++) {
2138 for (size_t y = 0; y < h; y++) {
2139 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2140 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2141 *packed_weights++ = kv;
2142 }
2143 packed_weights += cr - cr_block_size;
2144 }
2145 }
2146 packed_weights += (primary_tile - (h * w)) * cr_block_size;
2147 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
2148 }
2149}
2150
2151void xnn_pack_f32_to_f16_dwconv_hwg_w(
2152 size_t primary_tile,
2153 size_t h,
2154 size_t w,
2155 size_t c,
2156 size_t cr,
2157 const float* k,
2158 const float* b,
2159 uint16_t* packed_weights,
2160 size_t extra_bytes,
2161 const void* params)
2162{
2163 assert(k != NULL);
2164 assert(packed_weights != NULL);
2165
2166 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2167 const size_t cr_block_size = min(c - cr_block_start, cr);
2168 if XNN_LIKELY(b != NULL) {
2169 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2170 *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2171 }
2172 } else {
2173 size_t n = cr_block_size;
2174 do {
2175 *packed_weights++ = 0;
2176 } while (--n != 0);
2177 }
2178 packed_weights += cr - cr_block_size;
2179 for (size_t x = 0; x < w; x++) {
2180 for (size_t y = 0; y < h; y++) {
2181 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2182 const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
2183 *packed_weights++ = kv;
2184 }
2185 packed_weights += cr - cr_block_size;
2186 }
2187 }
2188 packed_weights += (primary_tile - (h * w)) * cr_block_size;
2189 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
2190 }
2191}
2192
2193void xnn_pack_qu8_dwconv_hwg_w(
2194 size_t primary_tile,
2195 size_t h,
2196 size_t w,
2197 size_t c,
2198 size_t cr,
2199 const uint8_t* k,
2200 const int32_t* b,
2201 void* packed_weights,
2202 size_t extra_bytes,
2203 const struct xnn_qu8_packing_params* params)
2204{
2205 assert(k != NULL);
2206 assert(packed_weights != NULL);
2207
2208 const int32_t izp = (int32_t) params->input_zero_point;
2209 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
2210 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2211 const size_t cr_block_size = min(c - cr_block_start, cr);
2212 int32_t* packed_b = (int32_t*) packed_weights;
2213 if XNN_LIKELY(b != NULL) {
2214 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2215 unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
2216 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2217 }
2218 } else {
2219 size_t n = cr_block_size;
2220 do {
2221 unaligned_store_s32(packed_weights, boff);
2222 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2223 } while (--n != 0);
2224 }
2225 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
2226 for (size_t x = 0; x < w; x++) {
2227 for (size_t y = 0; y < h; y++) {
2228 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2229 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2230 unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
2231 *((uint8_t*) packed_weights) = kv;
2232 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2233 }
2234 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(uint8_t));
2235 }
2236 }
2237 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
2238 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
2239 }
2240}
2241
2242void xnn_pack_qs8_dwconv_hwg_w(
2243 size_t primary_tile,
2244 size_t h,
2245 size_t w,
2246 size_t c,
2247 size_t cr,
2248 const int8_t* k,
2249 const int32_t* b,
2250 void* packed_weights,
2251 size_t extra_bytes,
2252 const struct xnn_qs8_packing_params* params)
2253{
2254 assert(k != NULL);
2255 assert(packed_weights != NULL);
2256
2257  const uint32_t izp = (uint32_t) params->input_zero_point;
2258 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2259 const size_t cr_block_size = min(c - cr_block_start, cr);
2260 int32_t* packed_b = (int32_t*) packed_weights;
2261 if XNN_LIKELY(b != NULL) {
2262 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2263 unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
2264 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2265 }
2266 } else {
2267 size_t n = cr_block_size;
2268 do {
2269 unaligned_store_s32(packed_weights, 0);
2270 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2271 } while (--n != 0);
2272 }
2273 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
2274 for (size_t x = 0; x < w; x++) {
2275 for (size_t y = 0; y < h; y++) {
2276 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2277 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2278 unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
2279 *((int8_t*) packed_weights) = kv;
2280 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2281 }
2282 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int8_t));
2283 }
2284 }
2285 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
2286 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
2287 }
2288}
2289
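// Same packing as xnn_pack_f32_gemm_goi_w, minus the per-block bias segment and
// the extra_bytes padding; GEMMINC microkernels read their initial accumulators
// from a separate buffer instead.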
2290void xnn_pack_f32_gemminc_goi_w(
2291 size_t g,
2292 size_t nc,
2293 size_t kc,
2294 size_t nr,
2295 size_t kr,
2296 size_t sr,
2297 const float* k,
2298 float* packed_weights,
2299 const void* params)
2300{
2301 assert(g != 0);
2302 assert(nr >= sr);
2303 assert(k != NULL);
2304 assert(packed_weights != NULL);
2305
2306 const size_t skr = sr * kr;
2307 do {
2308 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2309 const size_t nr_block_size = min(nc - nr_block_start, nr);
2310
2311 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
2312 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
2313 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
2314 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
2315 if (kc_idx < kc) {
2316 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
2317 }
2318 }
2319 packed_weights += kr;
2320 }
2321 packed_weights += (nr - nr_block_size) * kr;
2322 }
2323 }
2324 k += nc * kc;
2325 } while (--g != 0);
2326}
2327
2328void xnn_pack_f16_gemminc_goi_w(
2329 size_t g,
2330 size_t nc,
2331 size_t kc,
2332 size_t nr,
2333 size_t kr,
2334 size_t sr,
2335 const uint16_t* k,
2336 uint16_t* packed_weights,
2337 const void* params)
2338{
2339 assert(g != 0);
2340 assert(nr >= sr);
2341 assert(k != NULL);
2342 assert(packed_weights != NULL);
2343
2344 const size_t skr = sr * kr;
2345 do {
2346 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2347 const size_t nr_block_size = min(nc - nr_block_start, nr);
2348
2349 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
2350 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
2351 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
2352 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
2353 if (kc_idx < kc) {
2354 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
2355 }
2356 }
2357 packed_weights += kr;
2358 }
2359 packed_weights += (nr - nr_block_size) * kr;
2360 }
2361 }
2362 k += nc * kc;
2363 } while (--g != 0);
2364}
2365
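// Packs weights for direct convolution (DConv) in OKI order. Each block covers nr
// output channels: nr bias values come first (partial blocks replicate the last
// valid channel instead of zero-padding), followed by the weights ordered by
// kernel x, input channel, kernel y, with the same replication across the nr lanes.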
2366void xnn_pack_f32_dconv_oki_w(
2367 size_t nc,
2368 size_t kc,
2369 size_t nr,
2370 size_t kh,
2371 size_t kw,
2372 const float* k,
2373 const float* b,
2374 float* packed_weights,
2375 const void* params)
2376{
2377 assert(k != NULL);
2378 assert(packed_weights != NULL);
2379
2380 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2381 const size_t nr_block_size = min(nc - nr_block_start, nr);
2382 if XNN_LIKELY(b != NULL) {
2383 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2384 *packed_weights++ = b[min(nr_block_offset, nr_block_size - 1)];
2385 }
2386 } else {
2387 size_t n = nr;
2388 do {
2389 *packed_weights++ = 0.0f;
2390 } while (--n != 0);
2391 }
2392
2393 for (size_t kx = 0; kx < kw; kx++) {
2394 for (size_t c = 0; c < kc; c++) {
2395 for (size_t ky = 0; ky < kh; ky++) {
2396 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2397 *packed_weights++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
2398 }
2399 }
2400 }
2401 }
2402 if XNN_UNPREDICTABLE(b != NULL) {
2403 b += nr;
2404 }
2405 }
2406}
2407
2408void xnn_pack_f32_to_f16_dconv_oki_w(
2409 size_t nc,
2410 size_t kc,
2411 size_t nr,
2412 size_t kh,
2413 size_t kw,
2414 const float* k,
2415 const float* b,
2416 uint16_t* packed_weights,
2417 const void* params)
2418{
2419 assert(k != NULL);
2420 assert(packed_weights != NULL);
2421
2422 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2423 const size_t nr_block_size = min(nc - nr_block_start, nr);
2424 if XNN_LIKELY(b != NULL) {
2425 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2426 *packed_weights++ = fp16_ieee_from_fp32_value(b[min(nr_block_offset, nr_block_size - 1)]);
2427 }
2428 } else {
2429 size_t n = nr;
2430 do {
2431 *packed_weights++ = 0;
2432 } while (--n != 0);
2433 }
2434
2435 for (size_t kx = 0; kx < kw; kx++) {
2436 for (size_t c = 0; c < kc; c++) {
2437 for (size_t ky = 0; ky < kh; ky++) {
2438 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2439 *packed_weights++ = fp16_ieee_from_fp32_value(k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c]);
2440 }
2441 }
2442 }
2443 }
2444 if XNN_UNPREDICTABLE(b != NULL) {
2445 b += nr;
2446 }
2447 }
2448}
2449
2450void xnn_pack_f16_dconv_oki_w(
2451 size_t nc,
2452 size_t kc,
2453 size_t nr,
2454 size_t kh,
2455 size_t kw,
2456 const uint16_t* k,
2457 const uint16_t* b,
2458 uint16_t* packed_weights,
2459 const void* params)
2460{
2461 assert(k != NULL);
2462 assert(packed_weights != NULL);
2463
2464 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2465 const size_t nr_block_size = min(nc - nr_block_start, nr);
2466 if XNN_LIKELY(b != NULL) {
2467 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2468 *packed_weights++ = b[min(nr_block_offset, nr_block_size - 1)];
2469 }
2470 } else {
2471 size_t n = nr;
2472 do {
2473 *packed_weights++ = 0;
2474 } while (--n != 0);
2475 }
2476
2477 for (size_t kx = 0; kx < kw; kx++) {
2478 for (size_t c = 0; c < kc; c++) {
2479 for (size_t ky = 0; ky < kh; ky++) {
2480 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2481 *packed_weights++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
2482 }
2483 }
2484 }
2485 }
2486 if XNN_UNPREDICTABLE(b != NULL) {
2487 b += nr;
2488 }
2489 }
2490}
2491
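// CHW depthwise packing: for each group, a single bias value (zero when b is NULL)
// followed by that group's kernel_size weights; there is no channel tiling or padding.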
2492void xnn_pack_f32_chw_dwconv_ghw_w(
2493 size_t kernel_size,
2494 size_t groups,
2495 const float* k,
2496 const float* b,
2497 float* packed_weights,
2498 const void* params)
2499{
2500 assert(k != NULL);
2501 assert(packed_weights != NULL);
2502
2503 for (size_t g = 0; g < groups; g++) {
2504 if XNN_LIKELY(b != NULL) {
2505 *packed_weights = *b++;
2506 } else {
2507 *packed_weights = 0.0f;
2508 }
2509 packed_weights += 1;
2510 for (size_t i = 0; i < kernel_size; i++) {
2511 *packed_weights++ = k[g * kernel_size + i];
2512 }
2513 }
2514}
2515
2516void xnn_pack_f32_to_f16_chw_dwconv_ghw_w(
2517 size_t kernel_size,
2518 size_t groups,
2519 const float* k,
2520 const float* b,
2521 uint16_t* packed_weights,
2522 const void* params)
2523{
2524 assert(k != NULL);
2525 assert(packed_weights != NULL);
2526
2527 for (size_t g = 0; g < groups; g++) {
2528 if XNN_LIKELY(b != NULL) {
2529 *packed_weights = fp16_ieee_from_fp32_value(*b++);
2530 } else {
2531 *packed_weights = 0;
2532 }
2533 packed_weights += 1;
2534 for (size_t i = 0; i < kernel_size; i++) {
2535 *packed_weights++ = fp16_ieee_from_fp32_value(k[g * kernel_size + i]);
2536 }
2537 }
2538}
2539
2540void xnn_pack_f16_chw_dwconv_ghw_w(
2541 size_t kernel_size,
2542 size_t groups,
2543 const uint16_t* k,
2544 const uint16_t* b,
2545 uint16_t* packed_weights,
2546 const void* params)
2547{
2548 assert(k != NULL);
2549 assert(packed_weights != NULL);
2550
2551 for (size_t g = 0; g < groups; g++) {
2552 if XNN_LIKELY(b != NULL) {
2553 *packed_weights = *b++;
2554 } else {
2555 *packed_weights = 0;
2556 }
2557 packed_weights += 1;
2558 for (size_t i = 0; i < kernel_size; i++) {
2559 *packed_weights++ = k[g * kernel_size + i];
2560 }
2561 }
2562}
2563
2564void xnn_pack_f32_chw_dwconv_hwg_w(
2565 size_t kernel_size,
2566 size_t groups,
2567 const float* k,
2568 const float* b,
2569 float* packed_weights,
2570 const void* params)
2571{
2572 assert(k != NULL);
2573 assert(packed_weights != NULL);
2574
2575 for (size_t g = 0; g < groups; g++) {
2576 if XNN_LIKELY(b != NULL) {
2577 *packed_weights = *b++;
2578 } else {
2579 *packed_weights = 0.0f;
2580 }
2581 packed_weights += 1;
2582 for (size_t i = 0; i < kernel_size; i++) {
2583 *packed_weights++ = k[i * groups + g];
2584 }
2585 }
2586}
2587
2588void xnn_pack_f16_chw_dwconv_hwg_w(
2589 size_t kernel_size,
2590 size_t groups,
2591 const uint16_t* k,
2592 const uint16_t* b,
2593 uint16_t* packed_weights,
2594 const void* params)
2595{
2596 assert(k != NULL);
2597 assert(packed_weights != NULL);
2598
2599 for (size_t g = 0; g < groups; g++) {
2600 if XNN_LIKELY(b != NULL) {
2601 *packed_weights = *b++;
2602 } else {
2603 *packed_weights = 0;
2604 }
2605 packed_weights += 1;
2606 for (size_t i = 0; i < kernel_size; i++) {
2607 *packed_weights++ = k[i * groups + g];
2608 }
2609 }
2610}
2611
2612void xnn_pack_f32_to_f16_chw_dwconv_hwg_w(
2613 size_t kernel_size,
2614 size_t groups,
2615 const float* k,
2616 const float* b,
2617 uint16_t* packed_weights,
2618 const void* params)
2619{
2620 assert(k != NULL);
2621 assert(packed_weights != NULL);
2622
2623 for (size_t g = 0; g < groups; g++) {
2624 if XNN_LIKELY(b != NULL) {
2625 *packed_weights = fp16_ieee_from_fp32_value(*b++);
2626 } else {
2627 *packed_weights = 0;
2628 }
2629 packed_weights += 1;
2630 for (size_t i = 0; i < kernel_size; i++) {
2631 *packed_weights++ = fp16_ieee_from_fp32_value(k[i * groups + g]);
2632 }
2633 }
2634}
2635
2636
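// Packs per-channel scale and bias for VMULCADDC: for each block of cr channels,
// cr scale values followed by cr bias values (zeros when b is NULL), with partial
// blocks padded out to cr.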
2637void xnn_pack_f32_vmulcaddc_w(
2638 size_t c,
2639 size_t cr,
2640 const float* s,
2641 const float* b,
2642 float* packed_weights,
2643 const void* params)
2644{
2645 assert(s != NULL);
2646 assert(packed_weights != NULL);
2647
2648 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2649 const size_t cr_block_size = min(c - cr_block_start, cr);
2650 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2651 *packed_weights++ = s[cr_block_start + cr_block_offset];
2652 }
2653 packed_weights += cr - cr_block_size;
2654 if XNN_LIKELY(b != NULL) {
2655 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2656 *packed_weights++ = b[cr_block_start + cr_block_offset];
2657 }
2658 } else {
2659 size_t n = cr_block_size;
2660 do {
2661 *packed_weights++ = 0.0f;
2662 } while (--n != 0);
2663 }
2664 packed_weights += cr - cr_block_size;
2665 }
2666}
2667
2668void xnn_pack_f16_vmulcaddc_w(
2669 size_t c,
2670 size_t cr,
2671 const uint16_t* s,
2672 const uint16_t* b,
2673 uint16_t* packed_weights,
2674 const void* params)
2675{
2676 assert(s != NULL);
2677 assert(packed_weights != NULL);
2678
2679 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2680 const size_t cr_block_size = min(c - cr_block_start, cr);
2681 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2682 *packed_weights++ = s[cr_block_start + cr_block_offset];
2683 }
2684 packed_weights += cr - cr_block_size;
2685 if XNN_LIKELY(b != NULL) {
2686 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2687 *packed_weights++ = b[cr_block_start + cr_block_offset];
2688 }
2689 } else {
2690 size_t n = cr_block_size;
2691 do {
2692 *packed_weights++ = 0;
2693 } while (--n != 0);
2694 }
2695 packed_weights += cr - cr_block_size;
2696 }
2697}
2698
2699void xnn_pack_f32_to_f16_vmulcaddc_w(
2700 size_t c,
2701 size_t cr,
2702 const float* s,
2703 const float* b,
2704 uint16_t* packed_weights,
2705 const void* params)
2706{
2707 assert(s != NULL);
2708 assert(packed_weights != NULL);
2709
2710 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2711 const size_t cr_block_size = min(c - cr_block_start, cr);
2712 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2713 *packed_weights++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
2714 }
2715 packed_weights += cr - cr_block_size;
2716 if XNN_LIKELY(b != NULL) {
2717 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2718 *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2719 }
2720 } else {
2721 size_t n = cr_block_size;
2722 do {
2723 *packed_weights++ = 0;
2724 } while (--n != 0);
2725 }
2726 packed_weights += cr - cr_block_size;
2727 }
2728}
2729
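// PReLU slopes need no reordering: packing is a plain copy, with fp32-to-fp16
// conversion in the f32-to-f16 variant.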
2730void xnn_pack_f32_prelu_w(
2731 size_t c,
2732 const float* s,
2733 float* packed_weights)
2734{
2735 assert(s != NULL);
2736 assert(packed_weights != NULL);
2737
2738 memcpy(packed_weights, s, c * sizeof(float));
2739}
2740
2741void xnn_pack_f16_prelu_w(
2742 size_t c,
2743 const uint16_t* s,
2744 uint16_t* packed_weights)
2745{
2746 assert(s != NULL);
2747 assert(packed_weights != NULL);
2748
2749 memcpy(packed_weights, s, c * sizeof(uint16_t));
2750}
2751
2752void xnn_pack_f32_to_f16_prelu_w(
2753 size_t c,
2754 const float* s,
2755 uint16_t* packed_weights)
2756{
2757 assert(s != NULL);
2758 assert(packed_weights != NULL);
2759
2760 do {
2761 *packed_weights++ = fp16_ieee_from_fp32_value(*s++);
2762 } while (--c != 0);
2763}
2764
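// Scans a dense [group_output_channels, group_input_channels] fp32 kernel and
// reports its nonzero statistics: the total nonzero count, the number of 2- and
// 4-row column blocks containing at least one nonzero, and the nonzero counts
// restricted to the rows covered by those blockings. Callers can compare these
// counts to choose an output-channel blocking for the sparse packing below.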
2765void xnn_analyze_f32_spmm(
2766 size_t group_output_channels,
2767 size_t group_input_channels,
2768 const float* kernel,
2769 struct xnn_spmm_packing_params* params)
2770{
2771 assert(kernel != NULL);
2772 assert(params != NULL);
2773
2774 // Count number of non-zero values.
2775 size_t num_nonzeroes = 0;
2776 size_t num_nonzero_blocks2 = 0;
2777 size_t num_nonzero_blocks4 = 0;
2778 for (size_t oc = 0; oc < round_down_po2(group_output_channels, 4); oc += 4) {
2779 for (size_t ic = 0; ic < group_input_channels; ic++) {
2780 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0.0f);
2781 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0.0f);
2782 const size_t row2_nonzero = (size_t) (kernel[(oc + 2) * group_input_channels + ic] != 0.0f);
2783 const size_t row3_nonzero = (size_t) (kernel[(oc + 3) * group_input_channels + ic] != 0.0f);
2784 num_nonzeroes += row0_nonzero + row1_nonzero + row2_nonzero + row3_nonzero;
2785 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero) + (row2_nonzero | row3_nonzero);
2786 num_nonzero_blocks4 += (row0_nonzero | row1_nonzero | row2_nonzero | row3_nonzero);
2787 }
2788 }
2789 const size_t num_block4_nonzeroes = num_nonzeroes;
2790 for (size_t oc = round_down_po2(group_output_channels, 4); oc < round_down_po2(group_output_channels, 2); oc += 2) {
2791 for (size_t ic = 0; ic < group_input_channels; ic++) {
2792 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0.0f);
2793 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0.0f);
2794 num_nonzeroes += row0_nonzero + row1_nonzero;
2795 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero);
2796 }
2797 }
2798 const size_t num_block2_nonzeroes = num_nonzeroes;
2799 for (size_t oc = round_down_po2(group_output_channels, 2); oc < group_output_channels; oc++) {
2800 for (size_t ic = 0; ic < group_input_channels; ic++) {
2801 num_nonzeroes += (size_t) (kernel[oc * group_input_channels + ic] != 0.0f);
2802 }
2803 }
2804 params->num_nonzeroes = num_nonzeroes;
2805 params->num_nonzero_blocks2 = num_nonzero_blocks2;
2806 params->num_nonzero_blocks4 = num_nonzero_blocks4;
2807 params->num_block2_nonzeroes = num_block2_nonzeroes;
2808 params->num_block4_nonzeroes = num_block4_nonzeroes;
2809}
2810
2811void xnn_analyze_f16_spmm(
2812 size_t group_output_channels,
2813 size_t group_input_channels,
2814 const uint16_t* kernel,
2815 struct xnn_spmm_packing_params* params)
2816{
2817 assert(kernel != NULL);
2818 assert(params != NULL);
2819
2820 // Count number of non-zero values.
2821 size_t num_nonzeroes = 0;
2822 size_t num_nonzero_blocks2 = 0;
2823 size_t num_nonzero_blocks4 = 0;
2824 for (size_t oc = 0; oc < round_down_po2(group_output_channels, 4); oc += 4) {
2825 for (size_t ic = 0; ic < group_input_channels; ic++) {
2826 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0);
2827 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0);
2828 const size_t row2_nonzero = (size_t) (kernel[(oc + 2) * group_input_channels + ic] != 0);
2829 const size_t row3_nonzero = (size_t) (kernel[(oc + 3) * group_input_channels + ic] != 0);
2830 num_nonzeroes += row0_nonzero + row1_nonzero + row2_nonzero + row3_nonzero;
2831 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero) + (row2_nonzero | row3_nonzero);
2832 num_nonzero_blocks4 += (row0_nonzero | row1_nonzero | row2_nonzero | row3_nonzero);
2833 }
2834 }
2835 const size_t num_block4_nonzeroes = num_nonzeroes;
2836 for (size_t oc = round_down_po2(group_output_channels, 4); oc < round_down_po2(group_output_channels, 2); oc += 2) {
2837 for (size_t ic = 0; ic < group_input_channels; ic++) {
2838 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0);
2839 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0);
2840 num_nonzeroes += row0_nonzero + row1_nonzero;
2841 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero);
2842 }
2843 }
2844 const size_t num_block2_nonzeroes = num_nonzeroes;
2845 for (size_t oc = round_down_po2(group_output_channels, 2); oc < group_output_channels; oc++) {
2846 for (size_t ic = 0; ic < group_input_channels; ic++) {
2847 num_nonzeroes += (size_t) (kernel[oc * group_input_channels + ic] != 0);
2848 }
2849 }
2850 params->num_nonzeroes = num_nonzeroes;
2851 params->num_nonzero_blocks2 = num_nonzero_blocks2;
2852 params->num_nonzero_blocks4 = num_nonzero_blocks4;
2853 params->num_block2_nonzeroes = num_block2_nonzeroes;
2854 params->num_block4_nonzeroes = num_block4_nonzeroes;
2855}
2856
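// Converts a dense fp32 kernel into the sparse (SpMM) representation: for every
// block of output_channels_block_size output channels, nonzero_values receives
// the block's biases (zeros when bias is NULL) followed by each input-channel
// column that holds at least one nonzero; input_channel_diffs records the byte
// distance (channel delta times the element size) between consecutive nonzero
// input channels, plus a final wrap-around back to the first one;
// output_channel_nonzeros, which the caller must zero-initialize, counts the
// nonzero columns for each output-channel block; *first_input_channel is set to
// the first nonzero input channel. Returns xnn_status_unsupported_parameter if
// any scaled difference overflows int32_t.
//
// A minimal calling sketch, assuming a 1x1 output-channel blocking (M, K, kernel
// and bias are placeholder names; buffer sizes follow from xnn_analyze_f32_spmm):
//
//   struct xnn_spmm_packing_params stats;
//   xnn_analyze_f32_spmm(M, K, kernel, &stats);
//   float* values = malloc((M + stats.num_nonzeroes) * sizeof(float));
//   int32_t* diffs = malloc(stats.num_nonzeroes * sizeof(int32_t));
//   uint32_t* nnz_per_oc = calloc(M, sizeof(uint32_t));  // must start at zero
//   size_t first_ic;
//   xnn_pack_f32_spmm(M, /*output_channels_block_size=*/1, K, kernel, bias,
//                     diffs, nnz_per_oc, values, &first_ic);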
2857enum xnn_status xnn_pack_f32_spmm(
2858 size_t group_output_channels,
2859 size_t output_channels_block_size,
2860 size_t group_input_channels,
2861 const float* kernel,
2862 const float* bias,
2863 int32_t* input_channel_diffs,
2864 uint32_t* output_channel_nonzeros,
2865 float* nonzero_values,
2866 size_t* first_input_channel)
2867{
2868 size_t first_ic = 0, last_ic = 0;
2869 bool first_nonzero = true;
2870 for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
2871 if XNN_LIKELY(bias != NULL) {
2872 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2873 *nonzero_values++ = bias[ocb + oco];
2874 }
2875 } else {
2876 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2877 *nonzero_values++ = 0.0f;
2878 }
2879 }
2880 for (size_t ic = 0; ic < group_input_channels; ic++) {
2881 bool is_nonzero_block = false;
2882 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2883 is_nonzero_block |= (kernel[(ocb + oco) * group_input_channels + ic] != 0.0f);
2884 }
2885 if (is_nonzero_block) {
2886 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2887 *nonzero_values++ = kernel[(ocb + oco) * group_input_channels + ic];
2888 }
2889 if (first_nonzero) {
2890 first_ic = ic;
2891 } else {
2892 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(float);
2893 if (diff != (int64_t) (int32_t) diff) {
2894 xnn_log_error("failed to convert kernel to sparse representation: "
2895 "scaled difference in input channels exceeds int32_t range");
2896 return xnn_status_unsupported_parameter;
2897 }
2898 *input_channel_diffs++ = (int32_t) diff;
2899 }
2900 first_nonzero = false;
2901 last_ic = ic;
2902 *output_channel_nonzeros += 1;
2903 }
2904 }
2905 output_channel_nonzeros += 1;
2906 }
2907 for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
2908 if XNN_LIKELY(bias != NULL) {
2909 *nonzero_values++ = bias[oc];
2910 } else {
2911 *nonzero_values++ = 0.0f;
2912 }
2913 for (size_t ic = 0; ic < group_input_channels; ic++) {
2914 const float weight = kernel[oc * group_input_channels + ic];
2915 if (weight != 0.0f) {
2916 *nonzero_values++ = weight;
2917 if (first_nonzero) {
2918 first_ic = ic;
2919 } else {
2920 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(float);
2921 if (diff != (int64_t) (int32_t) diff) {
2922 xnn_log_error("failed to convert kernel to sparse representation: "
2923 "scaled difference in input channels exceeds int32_t range");
2924 return xnn_status_unsupported_parameter;
2925 }
2926 *input_channel_diffs++ = (int32_t) diff;
2927 }
2928 first_nonzero = false;
2929 last_ic = ic;
2930 *output_channel_nonzeros += 1;
2931 }
2932 }
2933 output_channel_nonzeros += 1;
2934 }
2935 // If there are any non-zero elements, we have to return to the initial input channel.
2936 if (!first_nonzero) {
2937 const int64_t diff = (int64_t) ((uint64_t) first_ic - (uint64_t) last_ic) * (int64_t) sizeof(float);
2938 if (diff != (int64_t) (int32_t) diff) {
2939 xnn_log_error("failed to convert kernel to sparse representation: "
2940 "scaled difference in input channels exceeds int32_t range");
2941 return xnn_status_unsupported_parameter;
2942 }
2943 *input_channel_diffs++ = (int32_t) diff;
2944 }
2945 *first_input_channel = first_ic;
2946 return xnn_status_success;
2947}
2948
2949
2950enum xnn_status xnn_pack_f32_to_f16_spmm(
2951 size_t group_output_channels,
2952 size_t output_channels_block_size,
2953 size_t group_input_channels,
2954 const float* kernel,
2955 const float* bias,
2956 int32_t* input_channel_diffs,
2957 uint32_t* output_channel_nonzeros,
2958 uint16_t* nonzero_values, // fp16 values
2959 size_t* first_input_channel)
2960{
2961 size_t first_ic = 0, last_ic = 0;
2962 bool first_nonzero = true;
2963 for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
2964 if XNN_LIKELY(bias != NULL) {
2965 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2966 *nonzero_values++ = fp16_ieee_from_fp32_value(bias[ocb + oco]);
2967 }
2968 } else {
2969 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2970 *nonzero_values++ = 0;
2971 }
2972 }
2973 for (size_t ic = 0; ic < group_input_channels; ic++) {
2974 bool is_nonzero_block = false;
2975 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2976 is_nonzero_block |= (kernel[(ocb + oco) * group_input_channels + ic] != 0.0f);
2977 }
2978 if (is_nonzero_block) {
2979 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2980 *nonzero_values++ = fp16_ieee_from_fp32_value(kernel[(ocb + oco) * group_input_channels + ic]);
2981 }
2982 if (first_nonzero) {
2983 first_ic = ic;
2984 } else {
2985 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
2986 if (diff != (int64_t) (int32_t) diff) {
2987 xnn_log_error("failed to convert kernel to sparse representation: "
2988 "scaled difference in input channels exceeds int32_t range");
2989 return xnn_status_unsupported_parameter;
2990 }
2991 *input_channel_diffs++ = (int32_t) diff;
2992 }
2993 first_nonzero = false;
2994 last_ic = ic;
2995 *output_channel_nonzeros += 1;
2996 }
2997 }
2998 output_channel_nonzeros += 1;
2999 }
3000 for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
3001 if XNN_LIKELY(bias != NULL) {
3002 *nonzero_values++ = fp16_ieee_from_fp32_value(bias[oc]);
3003 } else {
3004 *nonzero_values++ = 0;
3005 }
3006 for (size_t ic = 0; ic < group_input_channels; ic++) {
3007 const float weight = kernel[oc * group_input_channels + ic];
3008 if (weight != 0.0f) {
3009 *nonzero_values++ = fp16_ieee_from_fp32_value(weight);
3010 if (first_nonzero) {
3011 first_ic = ic;
3012 } else {
3013 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3014 if (diff != (int64_t) (int32_t) diff) {
3015 xnn_log_error("failed to convert kernel to sparse representation: "
3016 "scaled difference in input channels exceeds int32_t range");
3017 return xnn_status_unsupported_parameter;
3018 }
3019 *input_channel_diffs++ = (int32_t) diff;
3020 }
3021 first_nonzero = false;
3022 last_ic = ic;
3023 *output_channel_nonzeros += 1;
3024 }
3025 }
3026 output_channel_nonzeros += 1;
3027 }
3028 // If there are any non-zero elements, we have to return to the initial input channel.
3029 if (!first_nonzero) {
3030 const int64_t diff = (int64_t) ((uint64_t) first_ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3031 if (diff != (int64_t) (int32_t) diff) {
3032 xnn_log_error("failed to convert kernel to sparse representation: "
3033 "scaled difference in input channels exceeds int32_t range");
3034 return xnn_status_unsupported_parameter;
3035 }
3036 *input_channel_diffs++ = (int32_t) diff;
3037 }
3038 *first_input_channel = first_ic;
3039 return xnn_status_success;
3040}
3041
3042enum xnn_status xnn_pack_f16_spmm(
3043 size_t group_output_channels,
3044 size_t output_channels_block_size,
3045 size_t group_input_channels,
3046 const uint16_t* kernel, // fp16 values
3047 const uint16_t* bias, // fp16 values
3048 int32_t* input_channel_diffs,
3049 uint32_t* output_channel_nonzeros,
3050 uint16_t* nonzero_values, // fp16 values
3051 size_t* first_input_channel)
3052{
3053 size_t first_ic = 0, last_ic = 0;
3054 bool first_nonzero = true;
3055 for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
3056 if XNN_LIKELY(bias != NULL) {
3057 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3058 *nonzero_values++ = bias[ocb + oco];
3059 }
3060 } else {
3061 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3062 *nonzero_values++ = 0;
3063 }
3064 }
3065 for (size_t ic = 0; ic < group_input_channels; ic++) {
3066 bool is_nonzero_block = false;
3067 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3068 is_nonzero_block |= (kernel[(ocb + oco) * group_input_channels + ic] != 0);
3069 }
3070 if (is_nonzero_block) {
3071 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3072 *nonzero_values++ = kernel[(ocb + oco) * group_input_channels + ic];
3073 }
3074 if (first_nonzero) {
3075 first_ic = ic;
3076 } else {
3077 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3078 if (diff != (int64_t) (int32_t) diff) {
3079 xnn_log_error("failed to convert kernel to sparse representation: "
3080 "scaled difference in input channels exceeds int32_t range");
3081 return xnn_status_unsupported_parameter;
3082 }
3083 *input_channel_diffs++ = (int32_t) diff;
3084 }
3085 first_nonzero = false;
3086 last_ic = ic;
3087 *output_channel_nonzeros += 1;
3088 }
3089 }
3090 output_channel_nonzeros += 1;
3091 }
3092 for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
3093 if XNN_LIKELY(bias != NULL) {
3094 *nonzero_values++ = bias[oc];
3095 } else {
3096 *nonzero_values++ = 0;
3097 }
3098 for (size_t ic = 0; ic < group_input_channels; ic++) {
3099      const uint16_t weight = kernel[oc * group_input_channels + ic];
3100 if (weight != 0) {
3101 *nonzero_values++ = weight;
3102 if (first_nonzero) {
3103 first_ic = ic;
3104 } else {
3105 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3106 if (diff != (int64_t) (int32_t) diff) {
3107 xnn_log_error("failed to convert kernel to sparse representation: "
3108 "scaled difference in input channels exceeds int32_t range");
3109 return xnn_status_unsupported_parameter;
3110 }
3111 *input_channel_diffs++ = (int32_t) diff;
3112 }
3113 first_nonzero = false;
3114 last_ic = ic;
3115 *output_channel_nonzeros += 1;
3116 }
3117 }
3118 output_channel_nonzeros += 1;
3119 }
3120 // If there are any non-zero elements, we have to return to the initial input channel.
3121 if (!first_nonzero) {
3122 const int64_t diff = (int64_t) ((uint64_t) first_ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3123 if (diff != (int64_t) (int32_t) diff) {
3124 xnn_log_error("failed to convert kernel to sparse representation: "
3125 "scaled difference in input channels exceeds int32_t range");
3126 return xnn_status_unsupported_parameter;
3127 }
3128 *input_channel_diffs++ = (int32_t) diff;
3129 }
3130 *first_input_channel = first_ic;
3131 return xnn_status_success;
3132}
3133