// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdint.h>
#include <stddef.h>
#include <string.h>

#include <fp16.h>

#include <xnnpack/math.h>
#include <xnnpack/operator.h>
#include <xnnpack/pack.h>
#include <xnnpack/unaligned.h>

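// GEMM "goi" packing: source weights are laid out as [groups][output channels]
// [input channels]. Each group is packed as a sequence of nr-wide panels, each
// holding nr bias values followed by nr x kr tiles of weights along K. When
// b == NULL the bias slots are skipped, which assumes the caller hands in a
// zero-initialized buffer.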
void xnn_pack_f32_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

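      // Pack nr x kr tiles along K. With sr > 1, the channels within each
      // sr*kr block are rotated per output row (the kc_idx computation below)
      // so that shuffled microkernels read them back in natural order. K is
      // padded up to a multiple of sr*kr; padding slots are skipped, again
      // assuming a zero-initialized buffer.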
      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
      packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f32_to_f16_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
        }
      }
      packed_w += nr;

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

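// The quantized packers below fold zero-point corrections into the packed
// bias: QU8 adds kc * input_zero_point * kernel_zero_point up front, and both
// QU8 and QS8 subtract input_zero_point times the running sum of each row's
// kernel values as the weights are packed.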
void xnn_pack_qu8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
          packed_w = (int32_t*) packed_w + 1;
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, bzp);
          packed_w = (int32_t*) packed_w + 1;
        } while (--n != 0);
      }
      packed_w = (int32_t*) packed_w + (nr - nr_block_size);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          int32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (int32_t) kv;
              ((uint8_t*) packed_w)[kr_block_offset] = kv;
            }
          }
          unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
          packed_w = (uint8_t*) packed_w + kr;
        }
        packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_gemm_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (int32_t*) packed_w + 1;
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (int32_t*) packed_w + 1;
        } while (--n != 0);
      }
      packed_w = (int32_t*) packed_w + (nr - nr_block_size);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          uint32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (uint32_t) kv;
              ((int8_t*) packed_w)[kr_block_offset] = kv;
            }
          }
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
          packed_w = (int8_t*) packed_w + kr;
        }
        packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

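// Same as xnn_pack_qs8_gemm_goi_w, but weights are widened to int16 ("xw") for
// microkernels that consume pre-extended weights.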
void xnn_pack_qs8_gemm_xw_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          uint32_t ksum = 0;
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
              ksum += (uint32_t) kv;
              ((int16_t*) packed_w)[kr_block_offset] = (int16_t) kv;
            }
          }
          unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
          packed_w = (int16_t*) packed_w + kr;
        }
        packed_w = (int16_t*) packed_w + (nr - nr_block_size) * kr;
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc * kc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

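// GEMM "io" packing: identical packed layout to the goi variants above, but
// the source weights are [input channels][output channels], so elements are
// gathered with stride nc instead of contiguously per output channel.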
void xnn_pack_f32_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
          }
        }
        packed_w += kr;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            packed_w[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
          }
        }
        packed_w += kr;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_f32_to_f16_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
      }
    }
    packed_w += nr;

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
          }
        }
        packed_w += kr;
      }
      packed_w += (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_qu8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
        packed_w = (int32_t*) packed_w + 1;
      }
    } else {
      size_t n = nr_block_size;
      do {
        unaligned_store_s32(packed_w, bzp);
        packed_w = (int32_t*) packed_w + 1;
      } while (--n != 0);
    }
    packed_w = (int32_t*) packed_w + (nr - nr_block_size);

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        int32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
            ksum += (int32_t) kv;
            ((uint8_t*) packed_w)[kr_block_offset] = kv;
          }
        }
        unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
        packed_w = (uint8_t*) packed_w + kr;
      }
      packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
    }
  }
}

void xnn_pack_qs8_gemm_io_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
        packed_w = (int32_t*) packed_w + 1;
      }
    } else {
      size_t n = nr_block_size;
      do {
        unaligned_store_s32(packed_w, 0);
        packed_w = (int32_t*) packed_w + 1;
      } while (--n != 0);
    }
    packed_w = (int32_t*) packed_w + (nr - nr_block_size);

    for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
        uint32_t ksum = 0;
        for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
          const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
          if (kc_idx < kc) {
            const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
            ksum += (uint32_t) kv;
            ((int8_t*) packed_w)[kr_block_offset] = kv;
          }
        }
        unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
        packed_w = (int8_t*) packed_w + kr;
      }
      packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
    }
  }
}

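// CONV "goki" packing: weights are [groups][output channels][kernel spatial
// positions][input channels]; each of the ks spatial positions is packed as
// its own run of nr x kr tiles after the shared bias panel.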
void xnn_pack_f32_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
              }
            }
            packed_w += kr;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
      packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                packed_w[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
              }
            }
            packed_w += kr;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_f32_to_f16_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
              }
            }
            packed_w += kr;
          }
          packed_w += (nr - nr_block_size) * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qu8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, bzp);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            int32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
                ksum += (int32_t) kv;
                ((uint8_t*) packed_w)[kr_block_offset] = kv;
              }
            }
            unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
            packed_w = (uint8_t*) packed_w + kr;
          }
          packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

void xnn_pack_qs8_conv_goki_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
            uint32_t ksum = 0;
            for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
              const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
              if (kc_idx < kc) {
                const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
                ksum += (uint32_t) kv;
                ((int8_t*) packed_w)[kr_block_offset] = kv;
              }
            }
            unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
            packed_w = (int8_t*) packed_w + kr;
          }
          packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += ks * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  } while (--g != 0);
}

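// CONV "kgo" packing: weights are [kernel spatial positions][groups][output
// channels] with (apparently) a single input channel per group, so each
// spatial position contributes one value per output row, stored at stride kr
// within the nr x kr panel; the sr offset pattern mirrors the shuffled layout
// of the goki packers.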
void xnn_pack_f32_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
          }
          packed_w += nr * kr;
        }
      }
      packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            packed_w[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
          }
          packed_w += nr * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f32_to_f16_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  assert(nr >= sr);

  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
        }
      }
      packed_w += nr;

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            packed_w[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
          }
          packed_w += nr * kr;
        }
      }
      packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qu8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, bzp);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
            ((uint8_t*) packed_w)[nr_block_offset * kr] = kv;
            unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - (int32_t) kv * izp);
          }
          packed_w = (uint8_t*) packed_w + nr * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qs8_conv_kgo_w(
  size_t g,
  size_t nc,
  size_t ks,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);
      int32_t* packed_b = (int32_t*) packed_w;
      if XNN_LIKELY(b != NULL) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        }
      } else {
        size_t n = nr_block_size;
        do {
          unaligned_store_s32(packed_w, 0);
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
        } while (--n != 0);
      }
      packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));

      for (size_t ki = 0; ki < ks; ki++) {
        for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
          for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
            const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
            ((int8_t*) packed_w)[nr_block_offset * kr] = kv;
            unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - (uint32_t) kv * izp);
          }
          packed_w = (int8_t*) packed_w + nr * kr;
        }
      }
      packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
    }
    k += nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

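// DECONV "goki" packing: the kernel is split into sh * sw subconvolutions, one
// per (oy, ox) output phase of the strided deconvolution. The weights pointer
// of each subconvolution is recorded in subconv_params on the first group, and
// the kernel taps belonging to that phase are then packed like a regular goki
// convolution.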
void xnn_pack_f32_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  float* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = b[nr_block_start + nr_block_offset];
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_f32_to_f16_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  struct subconvolution_params* subconv_params,
  const void* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              packed_w[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
            }
          }
          packed_w += nr;
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      packed_w[kr_block_offset] = fp16_ieee_from_fp32_value(k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx]);
                    }
                  }
                  packed_w += kr;
                }
                packed_w += (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qs8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* subconv_params,
  const struct xnn_qs8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              unaligned_store_s32(packed_w, b[nr_block_start + nr_block_offset]);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              unaligned_store_s32(packed_w, 0);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  uint32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                      ksum += (uint32_t) kv;
                      ((int8_t*) packed_w)[kr_block_offset] = kv;
                    }
                  }
                  unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
                  packed_w = (int8_t*) packed_w + kr;
                }
                packed_w = (int8_t*) packed_w + (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

void xnn_pack_qu8_deconv_goki_w(
  size_t g,
  size_t nc,
  size_t kh,
  size_t kw,
  size_t kc,
  size_t sh,
  size_t sw,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  struct subconvolution_params* subconv_params,
  const struct xnn_qu8_packing_params* params)
{
  assert(nr >= sr);

  const size_t skr = sr * kr;
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t kzp = (int32_t) params->kernel_zero_point;
  for (size_t i = 0; i < g; i++) {
    for (size_t oy = 0; oy < sh; oy++) {
      for (size_t ox = 0; ox < sw; ox++) {
        if (i == 0) {
          (*subconv_params++).weights = packed_w;
        }
        const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
        for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
          const size_t nr_block_size = min(nc - nr_block_start, nr);
          int32_t* packed_b = (int32_t*) packed_w;
          if XNN_LIKELY(b != NULL) {
            for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
              unaligned_store_s32(packed_w, bzp + b[nr_block_start + nr_block_offset]);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            }
          } else {
            size_t n = nr_block_size;
            do {
              unaligned_store_s32(packed_w, bzp);
              packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
            } while (--n != 0);
          }
          packed_w = (void*) ((uintptr_t) packed_w + (nr - nr_block_size) * sizeof(int32_t));
          for (size_t ky = oy; ky < kh; ky += sh) {
            for (size_t kx = ox; kx < kw; kx += sw) {
              for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
                for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
                  int32_t ksum = 0;
                  for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
                    const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
                    if (kc_idx < kc) {
                      const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
                      ksum += (int32_t) kv;
                      ((uint8_t*) packed_w)[kr_block_offset] = kv;
                    }
                  }
                  unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
                  packed_w = (uint8_t*) packed_w + kr;
                }
                packed_w = (uint8_t*) packed_w + (nr - nr_block_size) * kr;
              }
            }
          }
        }
      }
    }
    k += kh * kw * kc * nc;
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nc;
    }
  }
}

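// DWCONV "ghw" packing: depthwise weights are [channels][height][width]. Each
// cr-wide channel block is packed as biases followed by one value per kernel
// tap (x outer, y inner), padded out to primary_tile taps; the padding taps
// are skipped, assuming a zero-initialized buffer.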
void xnn_pack_f32_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  float* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f16_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_f32_to_f16_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_qu8_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, boff + b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, boff);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

void xnn_pack_qs8_dwconv_ghw_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, 0);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
          unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

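// DWCONV "hwg" packing: same packed layout as the ghw variants, but source
// weights are [height][width][channels], so taps are gathered with stride c.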
1606void xnn_pack_f32_dwconv_hwg_w(
1607 size_t primary_tile,
1608 size_t h,
1609 size_t w,
1610 size_t c,
1611 size_t cr,
1612 const float* k,
1613 const float* b,
1614 float* packed_w,
1615 size_t extra_bytes,
1616 const void* params)
1617{
1618 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1619 const size_t cr_block_size = min(c - cr_block_start, cr);
1620 if XNN_LIKELY(b != NULL) {
1621 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1622 *packed_w++ = b[cr_block_start + cr_block_offset];
1623 }
1624 } else {
1625 size_t n = cr_block_size;
1626 do {
1627 *packed_w++ = 0.0f;
1628 } while (--n != 0);
1629 }
1630 packed_w += cr - cr_block_size;
1631 for (size_t x = 0; x < w; x++) {
1632 for (size_t y = 0; y < h; y++) {
1633 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1634 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1635 *packed_w++ = kv;
1636 }
1637 packed_w += cr - cr_block_size;
1638 }
1639 }
1640 packed_w += (primary_tile - (h * w)) * cr_block_size;
1641 packed_w = (float*) ((uintptr_t) packed_w + extra_bytes);
1642 }
1643}
1644
1645void xnn_pack_f16_dwconv_hwg_w(
1646 size_t primary_tile,
1647 size_t h,
1648 size_t w,
1649 size_t c,
1650 size_t cr,
1651 const uint16_t* k,
1652 const uint16_t* b,
1653 uint16_t* packed_w,
1654 size_t extra_bytes,
1655 const void* params)
1656{
1657 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1658 const size_t cr_block_size = min(c - cr_block_start, cr);
1659 if XNN_LIKELY(b != NULL) {
1660 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1661 *packed_w++ = b[cr_block_start + cr_block_offset];
1662 }
1663 } else {
1664 size_t n = cr_block_size;
1665 do {
1666 *packed_w++ = 0;
1667 } while (--n != 0);
1668 }
1669 packed_w += cr - cr_block_size;
1670 for (size_t x = 0; x < w; x++) {
1671 for (size_t y = 0; y < h; y++) {
1672 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1673 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1674 *packed_w++ = kv;
1675 }
1676 packed_w += cr - cr_block_size;
1677 }
1678 }
1679 packed_w += (primary_tile - (h * w)) * cr_block_size;
1680 packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
1681 }
1682}
1683
void xnn_pack_f32_to_f16_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const float* k,
  const float* b,
  uint16_t* packed_w,
  size_t extra_bytes,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
          *packed_w++ = kv;
        }
        packed_w += cr - cr_block_size;
      }
    }
    packed_w += (primary_tile - (h * w)) * cr_block_size;
    packed_w = (uint16_t*) ((uintptr_t) packed_w + extra_bytes);
  }
}

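// Asymmetric-quantization (QU8) variant. The packed per-channel bias starts at
// b[channel] + h*w*input_zero_point*kernel_zero_point and is then decremented by
// input_zero_point * kernel_value for every packed weight, folding the
// zero-point correction terms into the bias ahead of time.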
void xnn_pack_qu8_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const uint8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qu8_packing_params* params)
{
  const int32_t izp = (int32_t) params->input_zero_point;
  const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, boff + b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, boff);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
          *((uint8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(uint8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(uint8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

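// Symmetric-quantization (QS8) variant: the kernel zero point is implicitly zero,
// so only the input_zero_point * kernel_value correction is subtracted from the
// packed per-channel bias.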
void xnn_pack_qs8_dwconv_hwg_w(
  size_t primary_tile,
  size_t h,
  size_t w,
  size_t c,
  size_t cr,
  const int8_t* k,
  const int32_t* b,
  void* packed_w,
  size_t extra_bytes,
  const struct xnn_qs8_packing_params* params)
{
  const uint32_t izp = (uint32_t) params->input_zero_point;
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    int32_t* packed_b = (int32_t*) packed_w;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        unaligned_store_s32(packed_w, b[cr_block_start + cr_block_offset]);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      }
    } else {
      size_t n = cr_block_size;
      do {
        unaligned_store_s32(packed_w, 0);
        packed_w = (void*) ((uintptr_t) packed_w + sizeof(int32_t));
      } while (--n != 0);
    }
    packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int32_t));
    for (size_t x = 0; x < w; x++) {
      for (size_t y = 0; y < h; y++) {
        for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
          const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
          unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
          *((int8_t*) packed_w) = kv;
          packed_w = (void*) ((uintptr_t) packed_w + sizeof(int8_t));
        }
        packed_w = (void*) ((uintptr_t) packed_w + (cr - cr_block_size) * sizeof(int8_t));
      }
    }
    packed_w = (void*) ((uintptr_t) packed_w + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
    packed_w = (void*) ((uintptr_t) packed_w + extra_bytes);
  }
}

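// GEMMINC packing matches xnn_pack_f32_gemm_goi_w except that no bias is stored;
// GEMMINC micro-kernels are expected to read their initial accumulators from a
// separate buffer instead.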
void xnn_pack_f32_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const float* k,
  float* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

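// Half-precision counterpart of xnn_pack_f32_gemminc_goi_w.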
void xnn_pack_f16_gemminc_goi_w(
  size_t g,
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kr,
  size_t sr,
  const uint16_t* k,
  uint16_t* packed_w,
  const void* params)
{
  const size_t skr = sr * kr;
  do {
    for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
      const size_t nr_block_size = min(nc - nr_block_start, nr);

      for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
        for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
          for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
            const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
            if (kc_idx < kc) {
              packed_w[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
            }
          }
          packed_w += kr;
        }
        packed_w += (nr - nr_block_size) * kr;
      }
    }
    k += nc * kc;
  } while (--g != 0);
}

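// Packs weights for direct convolution. Each nr block stores nr bias values
// followed by nr output-channel values for every (kx, input channel, ky)
// combination; partial blocks are padded by replicating the last valid output
// channel rather than zero-filling.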
void xnn_pack_f32_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const float* k,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

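// Half-precision counterpart of xnn_pack_f32_dconv_oki_w.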
void xnn_pack_f16_dconv_oki_w(
  size_t nc,
  size_t kc,
  size_t nr,
  size_t kh,
  size_t kw,
  const uint16_t* k,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
    const size_t nr_block_size = min(nc - nr_block_start, nr);
    if XNN_LIKELY(b != NULL) {
      for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
        *packed_w++ = b[min(nr_block_offset, nr_block_size - 1)];
      }
    } else {
      size_t n = nr;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }

    for (size_t kx = 0; kx < kw; kx++) {
      for (size_t c = 0; c < kc; c++) {
        for (size_t ky = 0; ky < kh; ky++) {
          for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
            *packed_w++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
          }
        }
      }
    }
    if XNN_UNPREDICTABLE(b != NULL) {
      b += nr;
    }
  }
}

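// Packs weights for CHW depthwise-convolution micro-kernels from GHW source
// layout: one bias value (zero when bias is NULL) followed by the group's
// kernel_size contiguous weights, with no channel-block padding.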
void xnn_pack_f32_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

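// Half-precision counterpart of xnn_pack_f32_chw_dwconv_ghw_w.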
void xnn_pack_f16_chw_dwconv_ghw_w(
  size_t kernel_size,
  size_t groups,
  const uint16_t* kernel,
  const uint16_t* bias,
  uint16_t* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[g * kernel_size + i];
    }
  }
}

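// Same packed layout as xnn_pack_f32_chw_dwconv_ghw_w, but the source kernel is
// in HWG order, so each group's weights are gathered with a stride of groups.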
void xnn_pack_f32_chw_dwconv_hwg_w(
  size_t kernel_size,
  size_t groups,
  const float* kernel,
  const float* bias,
  float* packed_weights,
  const void* params)
{
  for (size_t g = 0; g < groups; g++) {
    if XNN_LIKELY(bias != NULL) {
      *packed_weights = *bias++;
    } else {
      *packed_weights = 0.0f;
    }
    packed_weights += 1;
    for (size_t i = 0; i < kernel_size; i++) {
      *packed_weights++ = kernel[i * groups + g];
    }
  }
}

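// Packs per-channel scale and bias for vmulcaddc micro-kernels as alternating
// blocks of cr scale values and cr bias values (zero-filled when b is NULL);
// the pointer is advanced past the unused lanes of partial blocks without
// initializing them.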
void xnn_pack_f32_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  float* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0.0f;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

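// Half-precision counterpart of xnn_pack_f32_vmulcaddc_w.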
void xnn_pack_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const uint16_t* s,
  const uint16_t* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = s[cr_block_start + cr_block_offset];
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = b[cr_block_start + cr_block_offset];
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

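// Same layout, converting fp32 scales and biases to fp16 while packing.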
void xnn_pack_f32_to_f16_vmulcaddc_w(
  size_t c,
  size_t cr,
  const float* s,
  const float* b,
  uint16_t* packed_w,
  const void* params)
{
  for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
    const size_t cr_block_size = min(c - cr_block_start, cr);
    for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
      *packed_w++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
    }
    packed_w += cr - cr_block_size;
    if XNN_LIKELY(b != NULL) {
      for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
        *packed_w++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
      }
    } else {
      size_t n = cr_block_size;
      do {
        *packed_w++ = 0;
      } while (--n != 0);
    }
    packed_w += cr - cr_block_size;
  }
}

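// PReLU packing is a plain copy of the c per-channel slopes.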
void xnn_pack_f32_prelu_w(
  size_t c,
  const float* s,
  float* packed_w)
{
  memcpy(packed_w, s, c * sizeof(float));
}

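// Half-precision counterpart of xnn_pack_f32_prelu_w.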
void xnn_pack_f16_prelu_w(
  size_t c,
  const uint16_t* s,
  uint16_t* packed_w)
{
  memcpy(packed_w, s, c * sizeof(uint16_t));
}

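// Converts fp32 slopes to fp16 while packing; note that the do-while loop
// requires c to be non-zero.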
void xnn_pack_f32_to_f16_prelu_w(
  size_t c,
  const float* s,
  uint16_t* packed_w)
{
  do {
    *packed_w++ = fp16_ieee_from_fp32_value(*s++);
  } while (--c != 0);
}
