1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
9#include <stdint.h>
10#include <stddef.h>
11#include <string.h>
12
13#include <fp16.h>
14
15#include <xnnpack/log.h>
16#include <xnnpack/math.h>
17#include <xnnpack/operator.h>
18#include <xnnpack/pack.h>
19#include <xnnpack/unaligned.h>
20
21
22void xnn_pack_f32_gemm_goi_w(
23 size_t g,
24 size_t nc,
25 size_t kc,
26 size_t nr,
27 size_t kr,
28 size_t sr,
29 const float* k,
30 const float* b,
31 float* packed_weights,
32 size_t extra_bytes,
33 const void* params)
34{
35 assert(g != 0);
36 assert(nr >= sr);
37 assert(k != NULL);
38 assert(packed_weights != NULL);
39
40 const size_t skr = sr * kr;
41 do {
42 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
43 const size_t nr_block_size = min(nc - nr_block_start, nr);
44 if XNN_LIKELY(b != NULL) {
45 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
46 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
47 }
48 }
49 packed_weights += nr;
50
51 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
52 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
53 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
54 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
55 if (kc_idx < kc) {
56 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
57 }
58 }
59 packed_weights += kr;
60 }
61 packed_weights += (nr - nr_block_size) * kr;
62 }
63 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
64 }
65 k += nc * kc;
66 if XNN_UNPREDICTABLE(b != NULL) {
67 b += nc;
68 }
69 } while (--g != 0);
70}
71
72void xnn_pack_f16_gemm_goi_w(
73 size_t g,
74 size_t nc,
75 size_t kc,
76 size_t nr,
77 size_t kr,
78 size_t sr,
79 const uint16_t* k,
80 const uint16_t* b,
81 uint16_t* packed_weights,
82 size_t extra_bytes,
83 const void* params)
84{
85 assert(g != 0);
86 assert(nr >= sr);
87 assert(k != NULL);
88 assert(packed_weights != NULL);
89
90 const size_t skr = sr * kr;
91 do {
92 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
93 const size_t nr_block_size = min(nc - nr_block_start, nr);
94 if XNN_LIKELY(b != NULL) {
95 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
96 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
97 }
98 }
99 packed_weights += nr;
100
101 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
102 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
103 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
104 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
105 if (kc_idx < kc) {
106 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
107 }
108 }
109 packed_weights += kr;
110 }
111 packed_weights += (nr - nr_block_size) * kr;
112 }
113 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
114 }
115 k += nc * kc;
116 if XNN_UNPREDICTABLE(b != NULL) {
117 b += nc;
118 }
119 } while (--g != 0);
120}
121
122void xnn_pack_f32_to_f16_gemm_goi_w(
123 size_t g,
124 size_t nc,
125 size_t kc,
126 size_t nr,
127 size_t kr,
128 size_t sr,
129 const float* k,
130 const float* b,
131 uint16_t* packed_weights,
132 size_t extra_bytes,
133 const void* params)
134{
135 assert(g != 0);
136 assert(nr >= sr);
137 assert(k != NULL);
138 assert(packed_weights != NULL);
139
140 const size_t skr = sr * kr;
141 do {
142 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
143 const size_t nr_block_size = min(nc - nr_block_start, nr);
144 if XNN_LIKELY(b != NULL) {
145 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
146 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
147 }
148 }
149 packed_weights += nr;
150
151 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
152 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
153 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
154 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
155 if (kc_idx < kc) {
156 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[(nr_block_start + nr_block_offset) * kc + kc_idx]);
157 }
158 }
159 packed_weights += kr;
160 }
161 packed_weights += (nr - nr_block_size) * kr;
162 }
163 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
164 }
165 k += nc * kc;
166 if XNN_UNPREDICTABLE(b != NULL) {
167 b += nc;
168 }
169 } while (--g != 0);
170}
171
172void xnn_pack_qu8_gemm_goi_w(
173 size_t g,
174 size_t nc,
175 size_t kc,
176 size_t nr,
177 size_t kr,
178 size_t sr,
179 const uint8_t* k,
180 const int32_t* b,
181 void* packed_weights,
182 size_t extra_bytes,
183 const struct xnn_qu8_packing_params* params)
184{
185 assert(g != 0);
186 assert(nr >= sr);
187 assert(k != NULL);
188 assert(packed_weights != NULL);
189
190 const size_t skr = sr * kr;
191 const int32_t izp = (int32_t) params->input_zero_point;
192 const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
193 do {
194 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
195 const size_t nr_block_size = min(nc - nr_block_start, nr);
196 int32_t* packed_b = (int32_t*) packed_weights;
197 if XNN_LIKELY(b != NULL) {
198 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
199 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
200 packed_weights = (int32_t*) packed_weights + 1;
201 }
202 } else {
203 size_t n = nr_block_size;
204 do {
205 unaligned_store_s32(packed_weights, bzp);
206 packed_weights = (int32_t*) packed_weights + 1;
207 } while (--n != 0);
208 }
209 packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
210
211 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
212 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
213 int32_t ksum = 0;
214 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
215 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
216 if (kc_idx < kc) {
217 const uint8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
218 ksum += (int32_t) kv;
219 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
220 }
221 }
222 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
223 packed_weights = (uint8_t*) packed_weights + kr;
224 }
225 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
226 }
227 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
228 }
229 k += nc * kc;
230 if XNN_UNPREDICTABLE(b != NULL) {
231 b += nc;
232 }
233 } while (--g != 0);
234}
235
236void xnn_pack_qs8_gemm_goi_w(
237 size_t g,
238 size_t nc,
239 size_t kc,
240 size_t nr,
241 size_t kr,
242 size_t sr,
243 const int8_t* k,
244 const int32_t* b,
245 void* packed_weights,
246 size_t extra_bytes,
247 const struct xnn_qs8_packing_params* params)
248{
249 assert(g != 0);
250 assert(nr >= sr);
251 assert(k != NULL);
252 assert(packed_weights != NULL);
253
254 const size_t skr = sr * kr;
255 const uint32_t izp = (uint32_t) params->input_zero_point;
256 do {
257 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
258 const size_t nr_block_size = min(nc - nr_block_start, nr);
259 int32_t* packed_b = (int32_t*) packed_weights;
260 if XNN_LIKELY(b != NULL) {
261 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
262 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
263 packed_weights = (int32_t*) packed_weights + 1;
264 }
265 } else {
266 size_t n = nr_block_size;
267 do {
268 unaligned_store_s32(packed_weights, 0);
269 packed_weights = (int32_t*) packed_weights + 1;
270 } while (--n != 0);
271 }
272 packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
273
274 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
275 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
276 uint32_t ksum = 0;
277 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
278 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
279 if (kc_idx < kc) {
280 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
281 ksum += (uint32_t) kv;
282 ((int8_t*) packed_weights)[kr_block_offset] = kv;
283 }
284 }
285 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
286 packed_weights = (int8_t*) packed_weights + kr;
287 }
288 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
289 }
290 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
291 }
292 k += nc * kc;
293 if XNN_UNPREDICTABLE(b != NULL) {
294 b += nc;
295 }
296 } while (--g != 0);
297}
298
299void xnn_pack_qs8_gemm_xw_goi_w(
300 size_t g,
301 size_t nc,
302 size_t kc,
303 size_t nr,
304 size_t kr,
305 size_t sr,
306 const int8_t* k,
307 const int32_t* b,
308 void* packed_weights,
309 size_t extra_bytes,
310 const struct xnn_qs8_packing_params* params)
311{
312 assert(g != 0);
313 assert(nr >= sr);
314 assert(k != NULL);
315 assert(packed_weights != NULL);
316
317 const size_t skr = sr * kr;
318 const uint32_t izp = (uint32_t) params->input_zero_point;
319 do {
320 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
321 const size_t nr_block_size = min(nc - nr_block_start, nr);
322 int32_t* packed_b = (int32_t*) packed_weights;
323 if XNN_LIKELY(b != NULL) {
324 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
325 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
326 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
327 }
328 } else {
329 size_t n = nr_block_size;
330 do {
331 unaligned_store_s32(packed_weights, 0);
332 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
333 } while (--n != 0);
334 }
335 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
336
337 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
338 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
339 uint32_t ksum = 0;
340 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
341 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
342 if (kc_idx < kc) {
343 const int8_t kv = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
344 ksum += (uint32_t) kv;
345 ((int16_t*) packed_weights)[kr_block_offset] = (int16_t) kv;
346 }
347 }
348 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
349 packed_weights = (int16_t*) packed_weights + kr;
350 }
351 packed_weights = (int16_t*) packed_weights + (nr - nr_block_size) * kr;
352 }
353 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
354 }
355 k += nc * kc;
356 if XNN_UNPREDICTABLE(b != NULL) {
357 b += nc;
358 }
359 } while (--g != 0);
360}
361
362void xnn_pack_f32_gemm_io_w(
363 size_t nc,
364 size_t kc,
365 size_t nr,
366 size_t kr,
367 size_t sr,
368 const float* k,
369 const float* b,
370 float* packed_weights,
371 const void* params)
372{
373 assert(nr >= sr);
374 assert(k != NULL);
375 assert(packed_weights != NULL);
376
377 const size_t skr = sr * kr;
378 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
379 const size_t nr_block_size = min(nc - nr_block_start, nr);
380 if XNN_LIKELY(b != NULL) {
381 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
382 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
383 }
384 }
385 packed_weights += nr;
386
387 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
388 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
389 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
390 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
391 if (kc_idx < kc) {
392 packed_weights[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
393 }
394 }
395 packed_weights += kr;
396 }
397 packed_weights += (nr - nr_block_size) * kr;
398 }
399 }
400}
401
402void xnn_pack_f16_gemm_io_w(
403 size_t nc,
404 size_t kc,
405 size_t nr,
406 size_t kr,
407 size_t sr,
408 const uint16_t* k,
409 const uint16_t* b,
410 uint16_t* packed_weights,
411 const void* params)
412{
413 assert(nr >= sr);
414 assert(k != NULL);
415 assert(packed_weights != NULL);
416
417 const size_t skr = sr * kr;
418 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
419 const size_t nr_block_size = min(nc - nr_block_start, nr);
420 if XNN_LIKELY(b != NULL) {
421 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
422 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
423 }
424 }
425 packed_weights += nr;
426
427 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
428 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
429 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
430 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
431 if (kc_idx < kc) {
432 packed_weights[kr_block_offset] = k[kc_idx * nc + nr_block_start + nr_block_offset];
433 }
434 }
435 packed_weights += kr;
436 }
437 packed_weights += (nr - nr_block_size) * kr;
438 }
439 }
440}
441
442void xnn_pack_f32_to_f16_gemm_io_w(
443 size_t nc,
444 size_t kc,
445 size_t nr,
446 size_t kr,
447 size_t sr,
448 const float* k,
449 const float* b,
450 uint16_t* packed_weights,
451 const void* params)
452{
453 assert(nr >= sr);
454 assert(k != NULL);
455 assert(packed_weights != NULL);
456
457 const size_t skr = sr * kr;
458 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
459 const size_t nr_block_size = min(nc - nr_block_start, nr);
460 if XNN_LIKELY(b != NULL) {
461 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
462 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
463 }
464 }
465 packed_weights += nr;
466
467 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
468 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
469 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
470 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
471 if (kc_idx < kc) {
472 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[kc_idx * nc + nr_block_start + nr_block_offset]);
473 }
474 }
475 packed_weights += kr;
476 }
477 packed_weights += (nr - nr_block_size) * kr;
478 }
479 }
480}
481
482void xnn_pack_qu8_gemm_io_w(
483 size_t nc,
484 size_t kc,
485 size_t nr,
486 size_t kr,
487 size_t sr,
488 const uint8_t* k,
489 const int32_t* b,
490 void* packed_weights,
491 const struct xnn_qu8_packing_params* params)
492{
493 assert(nr >= sr);
494 assert(k != NULL);
495 assert(packed_weights != NULL);
496
497 const size_t skr = sr * kr;
498 const int32_t izp = (int32_t) params->input_zero_point;
499 const int32_t bzp = (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
500 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
501 const size_t nr_block_size = min(nc - nr_block_start, nr);
502 int32_t* packed_b = (int32_t*) packed_weights;
503 if XNN_LIKELY(b != NULL) {
504 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
505 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
506 packed_weights = (int32_t*) packed_weights + 1;
507 }
508 } else {
509 size_t n = nr_block_size;
510 do {
511 unaligned_store_s32(packed_weights, bzp);
512 packed_weights = (int32_t*) packed_weights + 1;
513 } while (--n != 0);
514 }
515 packed_weights = (int32_t*) packed_weights + (nr - nr_block_size);
516
517 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
518 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
519 int32_t ksum = 0;
520 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
521 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
522 if (kc_idx < kc) {
523 const uint8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
524 ksum += (int32_t) kv;
525 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
526 }
527 }
528 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
529 packed_weights = (uint8_t*) packed_weights + kr;
530 }
531 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
532 }
533 }
534}
535
536void xnn_pack_qs8_gemm_io_w(
537 size_t nc,
538 size_t kc,
539 size_t nr,
540 size_t kr,
541 size_t sr,
542 const int8_t* k,
543 const int32_t* b,
544 void* packed_weights,
545 const struct xnn_qs8_packing_params* params)
546{
547 assert(nr >= sr);
548 assert(k != NULL);
549 assert(packed_weights != NULL);
550
551 const size_t skr = sr * kr;
552 const uint32_t izp = (uint32_t) params->input_zero_point;
553 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
554 const size_t nr_block_size = min(nc - nr_block_start, nr);
555 int32_t* packed_b = (int32_t*) packed_weights;
556 if XNN_LIKELY(b != NULL) {
557 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
558 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
559 packed_weights = (int32_t*) packed_weights + 1;
560 }
561 } else {
562 size_t n = nr_block_size;
563 do {
564 unaligned_store_s32(packed_weights, 0);
565 packed_weights = (int32_t*) packed_weights + 1;
566 } while (--n != 0);
567 }
568 packed_weights = (uint32_t*) packed_weights + (nr - nr_block_size);
569
570 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
571 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
572 uint32_t ksum = 0;
573 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
574 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
575 if (kc_idx < kc) {
576 const int8_t kv = k[kc_idx * nc + (nr_block_start + nr_block_offset)];
577 ksum += (uint32_t) kv;
578 ((int8_t*) packed_weights)[kr_block_offset] = kv;
579 }
580 }
581 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
582 packed_weights = (int8_t*) packed_weights + kr;
583 }
584 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
585 }
586 }
587}
588
589void xnn_pack_f32_conv_goki_w(
590 size_t g,
591 size_t nc,
592 size_t ks,
593 size_t kc,
594 size_t nr,
595 size_t kr,
596 size_t sr,
597 const float* k,
598 const float* b,
599 float* packed_weights,
600 size_t extra_bytes,
601 const void* params)
602{
603 assert(g != 0);
604 assert(nr >= sr);
605 assert(k != NULL);
606 assert(packed_weights != NULL);
607
608 const size_t skr = sr * kr;
609 do {
610 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
611 const size_t nr_block_size = min(nc - nr_block_start, nr);
612 if XNN_LIKELY(b != NULL) {
613 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
614 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
615 }
616 }
617 packed_weights += nr;
618
619 for (size_t ki = 0; ki < ks; ki++) {
620 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
621 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
622 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
623 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
624 if (kc_idx < kc) {
625 packed_weights[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
626 }
627 }
628 packed_weights += kr;
629 }
630 packed_weights += (nr - nr_block_size) * kr;
631 }
632 }
633 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
634 }
635 k += ks * kc * nc;
636 if XNN_UNPREDICTABLE(b != NULL) {
637 b += nc;
638 }
639 } while (--g != 0);
640}
641
642void xnn_pack_f16_conv_goki_w(
643 size_t g,
644 size_t nc,
645 size_t ks,
646 size_t kc,
647 size_t nr,
648 size_t kr,
649 size_t sr,
650 const uint16_t* k,
651 const uint16_t* b,
652 uint16_t* packed_weights,
653 size_t extra_bytes,
654 const void* params)
655{
656 assert(g != 0);
657 assert(nr >= sr);
658 assert(k != NULL);
659 assert(packed_weights != NULL);
660
661 const size_t skr = sr * kr;
662 do {
663 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
664 const size_t nr_block_size = min(nc - nr_block_start, nr);
665 if XNN_LIKELY(b != NULL) {
666 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
667 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
668 }
669 }
670 packed_weights += nr;
671
672 for (size_t ki = 0; ki < ks; ki++) {
673 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
674 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
675 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
676 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
677 if (kc_idx < kc) {
678 packed_weights[kr_block_offset] = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
679 }
680 }
681 packed_weights += kr;
682 }
683 packed_weights += (nr - nr_block_size) * kr;
684 }
685 }
686 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
687 }
688 k += ks * kc * nc;
689 if XNN_UNPREDICTABLE(b != NULL) {
690 b += nc;
691 }
692 } while (--g != 0);
693}
694
695void xnn_pack_f32_to_f16_conv_goki_w(
696 size_t g,
697 size_t nc,
698 size_t ks,
699 size_t kc,
700 size_t nr,
701 size_t kr,
702 size_t sr,
703 const float* k,
704 const float* b,
705 uint16_t* packed_weights,
706 size_t extra_bytes,
707 const void* params)
708{
709 assert(g != 0);
710 assert(nr >= sr);
711 assert(k != NULL);
712 assert(packed_weights != NULL);
713
714 const size_t skr = sr * kr;
715 do {
716 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
717 const size_t nr_block_size = min(nc - nr_block_start, nr);
718 if XNN_LIKELY(b != NULL) {
719 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
720 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
721 }
722 }
723 packed_weights += nr;
724
725 for (size_t ki = 0; ki < ks; ki++) {
726 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
727 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
728 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
729 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
730 if (kc_idx < kc) {
731 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx]);
732 }
733 }
734 packed_weights += kr;
735 }
736 packed_weights += (nr - nr_block_size) * kr;
737 }
738 }
739 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
740 }
741 k += ks * kc * nc;
742 if XNN_UNPREDICTABLE(b != NULL) {
743 b += nc;
744 }
745 } while (--g != 0);
746}
747
748void xnn_pack_qu8_conv_goki_w(
749 size_t g,
750 size_t nc,
751 size_t ks,
752 size_t kc,
753 size_t nr,
754 size_t kr,
755 size_t sr,
756 const uint8_t* k,
757 const int32_t* b,
758 void* packed_weights,
759 size_t extra_bytes,
760 const struct xnn_qu8_packing_params* params)
761{
762 assert(g != 0);
763 assert(nr >= sr);
764 assert(k != NULL);
765 assert(packed_weights != NULL);
766
767 const size_t skr = sr * kr;
768 const int32_t izp = (int32_t) params->input_zero_point;
769 const int32_t bzp = (int32_t) ks * (int32_t) kc * izp * (int32_t) params->kernel_zero_point;
770 do {
771 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
772 const size_t nr_block_size = min(nc - nr_block_start, nr);
773 int32_t* packed_b = (int32_t*) packed_weights;
774 if XNN_LIKELY(b != NULL) {
775 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
776 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
777 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
778 }
779 } else {
780 size_t n = nr_block_size;
781 do {
782 unaligned_store_s32(packed_weights, bzp);
783 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
784 } while (--n != 0);
785 }
786 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
787
788 for (size_t ki = 0; ki < ks; ki++) {
789 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
790 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
791 int32_t ksum = 0;
792 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
793 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
794 if (kc_idx < kc) {
795 const uint8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
796 ksum += (int32_t) kv;
797 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
798 }
799 }
800 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
801 packed_weights = (uint8_t*) packed_weights + kr;
802 }
803 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
804 }
805 }
806 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
807 }
808 k += ks * kc * nc;
809 if XNN_UNPREDICTABLE(b != NULL) {
810 b += nc;
811 }
812 } while (--g != 0);
813}
814
815void xnn_pack_qs8_conv_goki_w(
816 size_t g,
817 size_t nc,
818 size_t ks,
819 size_t kc,
820 size_t nr,
821 size_t kr,
822 size_t sr,
823 const int8_t* k,
824 const int32_t* b,
825 void* packed_weights,
826 size_t extra_bytes,
827 const struct xnn_qs8_packing_params* params)
828{
829 assert(g != 0);
830 assert(nr >= sr);
831 assert(k != NULL);
832 assert(packed_weights != NULL);
833
834 const size_t skr = sr * kr;
835 const uint32_t izp = (int32_t) params->input_zero_point;
836 do {
837 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
838 const size_t nr_block_size = min(nc - nr_block_start, nr);
839 int32_t* packed_b = (int32_t*) packed_weights;
840 if XNN_LIKELY(b != NULL) {
841 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
842 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
843 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
844 }
845 } else {
846 size_t n = nr_block_size;
847 do {
848 unaligned_store_s32(packed_weights, 0);
849 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
850 } while (--n != 0);
851 }
852 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
853
854 for (size_t ki = 0; ki < ks; ki++) {
855 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
856 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
857 uint32_t ksum = 0;
858 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
859 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
860 if (kc_idx < kc) {
861 const int8_t kv = k[((nr_block_start + nr_block_offset) * ks + ki) * kc + kc_idx];
862 ksum += (uint32_t) kv;
863 ((int8_t*) packed_weights)[kr_block_offset] = kv;
864 }
865 }
866 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
867 packed_weights = (int8_t*) packed_weights + kr;
868 }
869 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
870 }
871 }
872 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
873 }
874 k += ks * kc * nc;
875 if XNN_UNPREDICTABLE(b != NULL) {
876 b += nc;
877 }
878 } while (--g != 0);
879}
880
881void xnn_pack_f32_conv_kgo_w(
882 size_t g,
883 size_t nc,
884 size_t ks,
885 size_t nr,
886 size_t kr,
887 size_t sr,
888 const float* k,
889 const float* b,
890 float* packed_weights,
891 size_t extra_bytes,
892 const void* params)
893{
894 assert(g != 0);
895 assert(nr >= sr);
896 assert(k != NULL);
897 assert(packed_weights != NULL);
898
899 for (size_t i = 0; i < g; i++) {
900 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
901 const size_t nr_block_size = min(nc - nr_block_start, nr);
902 if XNN_LIKELY(b != NULL) {
903 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
904 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
905 }
906 }
907 packed_weights += nr;
908
909 for (size_t ki = 0; ki < ks; ki++) {
910 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
911 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
912 packed_weights[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
913 }
914 packed_weights += nr * kr;
915 }
916 }
917 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
918 }
919 k += nc;
920 if XNN_UNPREDICTABLE(b != NULL) {
921 b += nc;
922 }
923 }
924}
925
926void xnn_pack_f16_conv_kgo_w(
927 size_t g,
928 size_t nc,
929 size_t ks,
930 size_t nr,
931 size_t kr,
932 size_t sr,
933 const uint16_t* k,
934 const uint16_t* b,
935 uint16_t* packed_weights,
936 size_t extra_bytes,
937 const void* params)
938{
939 assert(g != 0);
940 assert(nr >= sr);
941 assert(k != NULL);
942 assert(packed_weights != NULL);
943
944 for (size_t i = 0; i < g; i++) {
945 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
946 const size_t nr_block_size = min(nc - nr_block_start, nr);
947 if XNN_LIKELY(b != NULL) {
948 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
949 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
950 }
951 }
952 packed_weights += nr;
953
954 for (size_t ki = 0; ki < ks; ki++) {
955 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
956 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
957 packed_weights[nr_block_offset * kr] = k[ki * g * nc + (nr_block_start + nr_block_offset)];
958 }
959 packed_weights += nr * kr;
960 }
961 }
962 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
963 }
964 k += nc;
965 if XNN_UNPREDICTABLE(b != NULL) {
966 b += nc;
967 }
968 }
969}
970
971void xnn_pack_f32_to_f16_conv_kgo_w(
972 size_t g,
973 size_t nc,
974 size_t ks,
975 size_t nr,
976 size_t kr,
977 size_t sr,
978 const float* k,
979 const float* b,
980 uint16_t* packed_weights,
981 size_t extra_bytes,
982 const void* params)
983{
984 assert(g != 0);
985 assert(nr >= sr);
986 assert(k != NULL);
987 assert(packed_weights != NULL);
988
989 for (size_t i = 0; i < g; i++) {
990 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
991 const size_t nr_block_size = min(nc - nr_block_start, nr);
992 if XNN_LIKELY(b != NULL) {
993 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
994 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
995 }
996 }
997 packed_weights += nr;
998
999 for (size_t ki = 0; ki < ks; ki++) {
1000 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1001 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1002 packed_weights[nr_block_offset * kr] = fp16_ieee_from_fp32_value(k[ki * g * nc + (nr_block_start + nr_block_offset)]);
1003 }
1004 packed_weights += nr * kr;
1005 }
1006 }
1007 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1008 }
1009 k += nc;
1010 if XNN_UNPREDICTABLE(b != NULL) {
1011 b += nc;
1012 }
1013 }
1014}
1015
1016void xnn_pack_qu8_conv_kgo_w(
1017 size_t g,
1018 size_t nc,
1019 size_t ks,
1020 size_t nr,
1021 size_t kr,
1022 size_t sr,
1023 const uint8_t* k,
1024 const int32_t* b,
1025 void* packed_weights,
1026 size_t extra_bytes,
1027 const struct xnn_qu8_packing_params* params)
1028{
1029 assert(g != 0);
1030 assert(nr >= sr);
1031 assert(k != NULL);
1032 assert(packed_weights != NULL);
1033
1034 const int32_t izp = (int32_t) params->input_zero_point;
1035 const int32_t bzp = (int32_t) ks * izp * (int32_t) params->kernel_zero_point;
1036 for (size_t i = 0; i < g; i++) {
1037 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1038 const size_t nr_block_size = min(nc - nr_block_start, nr);
1039 int32_t* packed_b = (int32_t*) packed_weights;
1040 if XNN_LIKELY(b != NULL) {
1041 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1042 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1043 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1044 }
1045 } else {
1046 size_t n = nr_block_size;
1047 do {
1048 unaligned_store_s32(packed_weights, bzp);
1049 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1050 } while (--n != 0);
1051 }
1052 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1053
1054 for (size_t ki = 0; ki < ks; ki++) {
1055 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1056 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1057 const uint8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1058 ((uint8_t*) packed_weights)[nr_block_offset * kr] = kv;
1059 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - (int32_t) kv * izp);
1060 }
1061 packed_weights = (uint8_t*) packed_weights + nr * kr;
1062 }
1063 }
1064 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1065 }
1066 k += nc;
1067 if XNN_UNPREDICTABLE(b != NULL) {
1068 b += nc;
1069 }
1070 }
1071}
1072
1073void xnn_pack_qs8_conv_kgo_w(
1074 size_t g,
1075 size_t nc,
1076 size_t ks,
1077 size_t nr,
1078 size_t kr,
1079 size_t sr,
1080 const int8_t* k,
1081 const int32_t* b,
1082 void* packed_weights,
1083 size_t extra_bytes,
1084 const struct xnn_qs8_packing_params* params)
1085{
1086 assert(g != 0);
1087 assert(nr >= sr);
1088 assert(k != NULL);
1089 assert(packed_weights != NULL);
1090
1091 const uint32_t izp = (uint32_t) params->input_zero_point;
1092 for (size_t i = 0; i < g; i++) {
1093 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1094 const size_t nr_block_size = min(nc - nr_block_start, nr);
1095 int32_t* packed_b = (int32_t*) packed_weights;
1096 if XNN_LIKELY(b != NULL) {
1097 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1098 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1099 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1100 }
1101 } else {
1102 size_t n = nr_block_size;
1103 do {
1104 unaligned_store_s32(packed_weights, 0);
1105 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1106 } while (--n != 0);
1107 }
1108 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1109
1110 for (size_t ki = 0; ki < ks; ki++) {
1111 for (size_t sr_block_offset = 0; sr_block_offset < sr; sr_block_offset++) {
1112 for (size_t nr_block_offset = (-sr_block_offset) & (sr - 1); nr_block_offset < nr_block_size; nr_block_offset += sr) {
1113 const int8_t kv = k[ki * g * nc + (nr_block_start + nr_block_offset)];
1114 ((int8_t*) packed_weights)[nr_block_offset * kr] = kv;
1115 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - (uint32_t) kv * izp);
1116 }
1117 packed_weights = (int8_t*) packed_weights + nr * kr;
1118 }
1119 }
1120 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1121 }
1122 k += nc;
1123 if XNN_UNPREDICTABLE(b != NULL) {
1124 b += nc;
1125 }
1126 }
1127}
1128
1129void xnn_pack_f32_deconv_goki_w(
1130 size_t g,
1131 size_t nc,
1132 size_t kh,
1133 size_t kw,
1134 size_t kc,
1135 size_t sh,
1136 size_t sw,
1137 size_t nr,
1138 size_t kr,
1139 size_t sr,
1140 const float* k,
1141 const float* b,
1142 float* packed_weights,
1143 struct subconvolution_params* subconv_params,
1144 const void* params)
1145{
1146 assert(g != 0);
1147 assert(nr >= sr);
1148 assert(k != NULL);
1149 assert(packed_weights != NULL);
1150
1151 const size_t skr = sr * kr;
1152 for (size_t i = 0; i < g; i++) {
1153 for (size_t oy = 0; oy < sh; oy++) {
1154 for (size_t ox = 0; ox < sw; ox++) {
1155 if (i == 0) {
1156 (*subconv_params++).weights = packed_weights;
1157 }
1158 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1159 const size_t nr_block_size = min(nc - nr_block_start, nr);
1160 if XNN_LIKELY(b != NULL) {
1161 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1162 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1163 }
1164 }
1165 packed_weights += nr;
1166 for (size_t ky = oy; ky < kh; ky += sh) {
1167 for (size_t kx = ox; kx < kw; kx += sw) {
1168 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1169 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1170 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1171 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1172 if (kc_idx < kc) {
1173 packed_weights[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1174 }
1175 }
1176 packed_weights += kr;
1177 }
1178 packed_weights += (nr - nr_block_size) * kr;
1179 }
1180 }
1181 }
1182 }
1183 }
1184 }
1185 k += kh * kw * kc * nc;
1186 if XNN_UNPREDICTABLE(b != NULL) {
1187 b += nc;
1188 }
1189 }
1190}
1191
1192void xnn_pack_f16_deconv_goki_w(
1193 size_t g,
1194 size_t nc,
1195 size_t kh,
1196 size_t kw,
1197 size_t kc,
1198 size_t sh,
1199 size_t sw,
1200 size_t nr,
1201 size_t kr,
1202 size_t sr,
1203 const uint16_t* k,
1204 const uint16_t* b,
1205 uint16_t* packed_weights,
1206 struct subconvolution_params* subconv_params,
1207 const void* params)
1208{
1209 assert(g != 0);
1210 assert(nr >= sr);
1211 assert(k != NULL);
1212 assert(packed_weights != NULL);
1213
1214 const size_t skr = sr * kr;
1215 for (size_t i = 0; i < g; i++) {
1216 for (size_t oy = 0; oy < sh; oy++) {
1217 for (size_t ox = 0; ox < sw; ox++) {
1218 if (i == 0) {
1219 (*subconv_params++).weights = packed_weights;
1220 }
1221 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1222 const size_t nr_block_size = min(nc - nr_block_start, nr);
1223 if XNN_LIKELY(b != NULL) {
1224 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1225 packed_weights[nr_block_offset] = b[nr_block_start + nr_block_offset];
1226 }
1227 }
1228 packed_weights += nr;
1229 for (size_t ky = oy; ky < kh; ky += sh) {
1230 for (size_t kx = ox; kx < kw; kx += sw) {
1231 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1232 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1233 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1234 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1235 if (kc_idx < kc) {
1236 packed_weights[kr_block_offset] = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1237 }
1238 }
1239 packed_weights += kr;
1240 }
1241 packed_weights += (nr - nr_block_size) * kr;
1242 }
1243 }
1244 }
1245 }
1246 }
1247 }
1248 k += kh * kw * kc * nc;
1249 if XNN_UNPREDICTABLE(b != NULL) {
1250 b += nc;
1251 }
1252 }
1253}
1254
1255void xnn_pack_f32_to_f16_deconv_goki_w(
1256 size_t g,
1257 size_t nc,
1258 size_t kh,
1259 size_t kw,
1260 size_t kc,
1261 size_t sh,
1262 size_t sw,
1263 size_t nr,
1264 size_t kr,
1265 size_t sr,
1266 const float* k,
1267 const float* b,
1268 uint16_t* packed_weights,
1269 struct subconvolution_params* subconv_params,
1270 const void* params)
1271{
1272 assert(g != 0);
1273 assert(nr >= sr);
1274 assert(k != NULL);
1275 assert(packed_weights != NULL);
1276
1277 const size_t skr = sr * kr;
1278 for (size_t i = 0; i < g; i++) {
1279 for (size_t oy = 0; oy < sh; oy++) {
1280 for (size_t ox = 0; ox < sw; ox++) {
1281 if (i == 0) {
1282 (*subconv_params++).weights = packed_weights;
1283 }
1284 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1285 const size_t nr_block_size = min(nc - nr_block_start, nr);
1286 if XNN_LIKELY(b != NULL) {
1287 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1288 packed_weights[nr_block_offset] = fp16_ieee_from_fp32_value(b[nr_block_start + nr_block_offset]);
1289 }
1290 }
1291 packed_weights += nr;
1292 for (size_t ky = oy; ky < kh; ky += sh) {
1293 for (size_t kx = ox; kx < kw; kx += sw) {
1294 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1295 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1296 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1297 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1298 if (kc_idx < kc) {
1299 packed_weights[kr_block_offset] = fp16_ieee_from_fp32_value(k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx]);
1300 }
1301 }
1302 packed_weights += kr;
1303 }
1304 packed_weights += (nr - nr_block_size) * kr;
1305 }
1306 }
1307 }
1308 }
1309 }
1310 }
1311 k += kh * kw * kc * nc;
1312 if XNN_UNPREDICTABLE(b != NULL) {
1313 b += nc;
1314 }
1315 }
1316}
1317
1318void xnn_pack_qs8_deconv_goki_w(
1319 size_t g,
1320 size_t nc,
1321 size_t kh,
1322 size_t kw,
1323 size_t kc,
1324 size_t sh,
1325 size_t sw,
1326 size_t nr,
1327 size_t kr,
1328 size_t sr,
1329 const int8_t* k,
1330 const int32_t* b,
1331 void* packed_weights,
1332 struct subconvolution_params* subconv_params,
1333 const struct xnn_qs8_packing_params* params)
1334{
1335 assert(g != 0);
1336 assert(nr >= sr);
1337 assert(k != NULL);
1338 assert(packed_weights != NULL);
1339
1340 const size_t skr = sr * kr;
1341 const uint32_t izp = (uint32_t) params->input_zero_point;
1342 for (size_t i = 0; i < g; i++) {
1343 for (size_t oy = 0; oy < sh; oy++) {
1344 for (size_t ox = 0; ox < sw; ox++) {
1345 if (i == 0) {
1346 (*subconv_params++).weights = packed_weights;
1347 }
1348 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1349 const size_t nr_block_size = min(nc - nr_block_start, nr);
1350 int32_t* packed_b = (int32_t*) packed_weights;
1351 if XNN_LIKELY(b != 0) {
1352 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1353 unaligned_store_s32(packed_weights, b[nr_block_start + nr_block_offset]);
1354 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1355 }
1356 } else {
1357 size_t n = nr_block_size;
1358 do {
1359 unaligned_store_s32(packed_weights, 0);
1360 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1361 } while (--n != 0);
1362 }
1363 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1364 for (size_t ky = oy; ky < kh; ky += sh) {
1365 for (size_t kx = ox; kx < kw; kx += sw) {
1366 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1367 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1368 uint32_t ksum = 0;
1369 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1370 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1371 if (kc_idx < kc) {
1372 const int8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1373 ksum += (uint32_t) kv;
1374 ((int8_t*) packed_weights)[kr_block_offset] = kv;
1375 }
1376 }
1377 unaligned_indexed_store_u32(packed_b, nr_block_offset, unaligned_indexed_load_u32(packed_b, nr_block_offset) - ksum * izp);
1378 packed_weights = (int8_t*) packed_weights + kr;
1379 }
1380 packed_weights = (int8_t*) packed_weights + (nr - nr_block_size) * kr;
1381 }
1382 }
1383 }
1384 }
1385 }
1386 }
1387 k += kh * kw * kc * nc;
1388 if XNN_UNPREDICTABLE(b != NULL) {
1389 b += nc;
1390 }
1391 }
1392}
1393
1394void xnn_pack_qu8_deconv_goki_w(
1395 size_t g,
1396 size_t nc,
1397 size_t kh,
1398 size_t kw,
1399 size_t kc,
1400 size_t sh,
1401 size_t sw,
1402 size_t nr,
1403 size_t kr,
1404 size_t sr,
1405 const uint8_t* k,
1406 const int32_t* b,
1407 void* packed_weights,
1408 struct subconvolution_params* subconv_params,
1409 const struct xnn_qu8_packing_params* params)
1410{
1411 assert(g != 0);
1412 assert(nr >= sr);
1413 assert(k != NULL);
1414 assert(packed_weights != NULL);
1415
1416 const size_t skr = sr * kr;
1417 const int32_t izp = (int32_t) params->input_zero_point;
1418 const int32_t kzp = (int32_t) params->kernel_zero_point;
1419 for (size_t i = 0; i < g; i++) {
1420 for (size_t oy = 0; oy < sh; oy++) {
1421 for (size_t ox = 0; ox < sw; ox++) {
1422 if (i == 0) {
1423 (*subconv_params++).weights = packed_weights;
1424 }
1425 const int32_t bzp = (int32_t) divide_round_up(kh - oy, sh) * (int32_t) divide_round_up(kw - ox, sw) * (int32_t) kc * izp * kzp;
1426 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
1427 const size_t nr_block_size = min(nc - nr_block_start, nr);
1428 int32_t* packed_b = (int32_t*) packed_weights;
1429 if XNN_LIKELY(b != 0) {
1430 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1431 unaligned_store_s32(packed_weights, bzp + b[nr_block_start + nr_block_offset]);
1432 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1433 }
1434 } else {
1435 size_t n = nr_block_size;
1436 do {
1437 unaligned_store_s32(packed_weights, bzp);
1438 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1439 } while (--n != 0);
1440 }
1441 packed_weights = (void*) ((uintptr_t) packed_weights + (nr - nr_block_size) * sizeof(int32_t));
1442 for (size_t ky = oy; ky < kh; ky += sh) {
1443 for (size_t kx = ox; kx < kw; kx += sw) {
1444 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
1445 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
1446 int32_t ksum = 0;
1447 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
1448 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
1449 if (kc_idx < kc) {
1450 const uint8_t kv = k[(((nr_block_start + nr_block_offset) * kh + ky) * kw + kx) * kc + kc_idx];
1451 ksum += (int32_t) kv;
1452 ((uint8_t*) packed_weights)[kr_block_offset] = kv;
1453 }
1454 }
1455 unaligned_indexed_store_s32(packed_b, nr_block_offset, unaligned_indexed_load_s32(packed_b, nr_block_offset) - ksum * izp);
1456 packed_weights = (uint8_t*) packed_weights + kr;
1457 }
1458 packed_weights = (uint8_t*) packed_weights + (nr - nr_block_size) * kr;
1459 }
1460 }
1461 }
1462 }
1463 }
1464 }
1465 k += kh * kw * kc * nc;
1466 if XNN_UNPREDICTABLE(b != NULL) {
1467 b += nc;
1468 }
1469 }
1470}
1471
1472void xnn_pack_f32_dwconv_ghw_w(
1473 size_t primary_tile,
1474 size_t h,
1475 size_t w,
1476 size_t c,
1477 size_t cr,
1478 const float* k,
1479 const float* b,
1480 float* packed_weights,
1481 size_t extra_bytes,
1482 const void* params)
1483{
1484 assert(primary_tile >= h * w);
1485 xnn_pack_f32_dwconv_multipass_ghw_w(
1486 primary_tile,
1487 /*middle_pass_tile=*/0,
1488 /*last_pass_tile=*/0,
1489 h,
1490 w,
1491 c,
1492 cr,
1493 cr,
1494 cr,
1495 k,
1496 b,
1497 packed_weights,
1498 extra_bytes,
1499 params);
1500}
1501
1502// Helper function to advance x and y indices.
1503inline static void advance_x_y(size_t h, size_t* x, size_t* y) {
1504 if (++*y == h) {
1505 *y = 0;
1506 ++*x;
1507 }
1508}
1509
1510void xnn_pack_f32_dwconv_multipass_ghw_w(
1511 size_t first_pass_tile,
1512 size_t middle_pass_tile,
1513 size_t last_pass_tile,
1514 size_t h,
1515 size_t w,
1516 size_t c,
1517 size_t channel_tile,
1518 size_t channel_subtile,
1519 size_t channel_round,
1520 const float* k,
1521 const float* b,
1522 float* packed_weights,
1523 size_t extra_bytes,
1524 const void* params)
1525{
1526 assert(k != NULL);
1527 assert(packed_weights != NULL);
1528 size_t kernel_size = h * w;
1529 if (middle_pass_tile == 0) {
1530 // Uni-pass DWCONV.
1531 assert(last_pass_tile == 0);
1532 } else {
1533 // Multi-pass DWCONV.
1534 assert(kernel_size > first_pass_tile);
1535 }
1536
1537 // Stores the x and y index that should be processed next.
1538 size_t processed_x = 0;
1539 size_t processed_y = 0;
1540 size_t x = 0;
1541 size_t y = 0;
1542 // First and middle pass packs in sizes of channel_tile to tiled_c, then in sizes of channel_subtile.
1543 const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
1544
1545 // Pack in blocks of channel_tile, then in blocks of channel_subtile.
1546 {
1547 size_t cr_block_start = 0;
1548 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1549 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1550 if XNN_LIKELY(b != NULL) {
1551 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1552 *packed_weights++ = b[cr_block_start + cr_block_offset];
1553 }
1554 } else {
1555 size_t n = cr_block_size;
1556 do {
1557 *packed_weights++ = 0.0f;
1558 } while (--n != 0);
1559 }
1560 packed_weights += channel_tile - cr_block_size;
1561
1562 x = 0;
1563 y = 0;
1564 // kernel_size can be less than the first_pass_tile, in this case, pack up
1565 // to the smaller of the two.
1566 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1567 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1568 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1569 *packed_weights++ = kv;
1570 }
1571 packed_weights += channel_tile - cr_block_size;
1572 advance_x_y(h, &x, &y);
1573 }
1574 // And make sure to skip weights if kernel_size < first_pass_tile.
1575 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
1576 }
1577
1578 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1579 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1580 if XNN_LIKELY(b != NULL) {
1581 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1582 *packed_weights++ = b[cr_block_start + cr_block_offset];
1583 }
1584 } else {
1585 size_t n = cr_block_size;
1586 do {
1587 *packed_weights++ = 0.0f;
1588 } while (--n != 0);
1589 }
1590 packed_weights += channel_subtile - cr_block_size;
1591
1592 x = 0;
1593 y = 0;
1594 // kernel_size can be less than the first_pass_tile, in this case, pack up
1595 // to the smaller of the two.
1596 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1597 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1598 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1599 *packed_weights++ = kv;
1600 }
1601 packed_weights += channel_subtile - cr_block_size;
1602 advance_x_y(h, &x, &y);
1603 }
1604 // And make sure to skip weights if kernel_size < first_pass_tile.
1605 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
1606 }
1607 }
1608
1609 if (kernel_size <= first_pass_tile) {
1610 return;
1611 }
1612
1613 kernel_size -= first_pass_tile;
1614
1615 processed_x = x;
1616 processed_y = y;
1617
1618 // Middle pass. (kernel_size / middle_pass_tile) blocks, within each block is
1619 // middle_pass_tile * cr weights.
1620 for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
1621 assert(kernel_size >= middle_pass_tile);
1622 size_t cr_block_start = 0;
1623 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1624 x = processed_x;
1625 y = processed_y;
1626 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1627 for (size_t j = 0; j < middle_pass_tile; j++) {
1628 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1629 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1630 *packed_weights++ = kv;
1631 }
1632 packed_weights += channel_tile - cr_block_size;
1633 advance_x_y(h, &x, &y);
1634 }
1635 }
1636 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1637 x = processed_x;
1638 y = processed_y;
1639 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1640 for (size_t j = 0; j < middle_pass_tile; j++) {
1641 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1642 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1643 *packed_weights++ = kv;
1644 }
1645 packed_weights += channel_subtile - cr_block_size;
1646 advance_x_y(h, &x, &y);
1647 }
1648 }
1649 processed_x = x;
1650 processed_y = y;
1651 }
1652
1653 // Last pass.
1654 {
1655 assert(kernel_size <= last_pass_tile);
1656 size_t cr_block_start = 0;
1657 for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
1658 // Last pass does not pack to rounded c, since it handles remainder.
1659 x = processed_x;
1660 y = processed_y;
1661 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1662 for (size_t i = 0; i < kernel_size; i++) {
1663 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1664 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1665 *packed_weights++ = kv;
1666 }
1667 packed_weights += channel_tile - cr_block_size;
1668 advance_x_y(h, &x, &y);
1669 }
1670 // Pad so that we can always read last_pass_tile weights in the last pass.
1671 packed_weights += (last_pass_tile - kernel_size) * channel_tile;
1672 // TODO(zhin): support extra bytes for channel_tile and subtile.
1673 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
1674 }
1675 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1676 // Last pass does not pack to rounded c, since it handles remainder.
1677 x = processed_x;
1678 y = processed_y;
1679 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1680 for (size_t i = 0; i < kernel_size; i++) {
1681 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1682 const float kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1683 *packed_weights++ = kv;
1684 }
1685 packed_weights += channel_subtile - cr_block_size;
1686 advance_x_y(h, &x, &y);
1687 }
1688 // Pad so that we can always read last_pass_tile weights in the last pass.
1689 packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
1690 // TODO(zhin): support extra bytes for channel_tile and subtile.
1691 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
1692 }
1693 }
1694}
1695
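// Packs fp16 depthwise-convolution weights laid out as [groups, height, width].
// For each block of cr channels: cr bias values (zeros when b is NULL), then one
// cr-wide row per kernel tap, with taps visited x-outer / y-inner; the tap count
// is padded up to primary_tile, and each channel block is followed by extra_bytes
// of padding.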
1696void xnn_pack_f16_dwconv_ghw_w(
1697 size_t primary_tile,
1698 size_t h,
1699 size_t w,
1700 size_t c,
1701 size_t cr,
1702 const uint16_t* k,
1703 const uint16_t* b,
1704 uint16_t* packed_weights,
1705 size_t extra_bytes,
1706 const void* params)
1707{
1708 assert(k != NULL);
1709 assert(packed_weights != NULL);
1710
1711 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1712 const size_t cr_block_size = min(c - cr_block_start, cr);
1713 if XNN_LIKELY(b != NULL) {
1714 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1715 *packed_weights++ = b[cr_block_start + cr_block_offset];
1716 }
1717 } else {
1718 size_t n = cr_block_size;
1719 do {
1720 *packed_weights++ = 0;
1721 } while (--n != 0);
1722 }
1723 packed_weights += cr - cr_block_size;
1724 for (size_t x = 0; x < w; x++) {
1725 for (size_t y = 0; y < h; y++) {
1726 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1727 const uint16_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1728 *packed_weights++ = kv;
1729 }
1730 packed_weights += cr - cr_block_size;
1731 }
1732 }
1733 packed_weights += (primary_tile - (h * w)) * cr_block_size;
1734 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1735 }
1736}
1737
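// Same layout as xnn_pack_f16_dwconv_ghw_w, but the fp32 bias and kernel values
// are converted to fp16 on the fly with fp16_ieee_from_fp32_value.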
1738void xnn_pack_f32_to_f16_dwconv_ghw_w(
1739 size_t primary_tile,
1740 size_t h,
1741 size_t w,
1742 size_t c,
1743 size_t cr,
1744 const float* k,
1745 const float* b,
1746 uint16_t* packed_weights,
1747 size_t extra_bytes,
1748 const void* params)
1749{
1750 assert(k != NULL);
1751 assert(packed_weights != NULL);
1752
1753 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1754 const size_t cr_block_size = min(c - cr_block_start, cr);
1755 if XNN_LIKELY(b != NULL) {
1756 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1757 *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
1758 }
1759 } else {
1760 size_t n = cr_block_size;
1761 do {
1762 *packed_weights++ = 0;
1763 } while (--n != 0);
1764 }
1765 packed_weights += cr - cr_block_size;
1766 for (size_t x = 0; x < w; x++) {
1767 for (size_t y = 0; y < h; y++) {
1768 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1769 const uint16_t kv = fp16_ieee_from_fp32_value(k[((cr_block_start + cr_block_offset) * h + y) * w + x]);
1770 *packed_weights++ = kv;
1771 }
1772 packed_weights += cr - cr_block_size;
1773 }
1774 }
1775 packed_weights += (primary_tile - (h * w)) * cr_block_size;
1776 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
1777 }
1778}
1779
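// QU8 depthwise packing in GHW layout. The bias slots are pre-adjusted with the
// weight-only terms of the zero-point expansion: each slot starts at
// b + h * w * input_zero_point * kernel_zero_point and has
// input_zero_point * kv subtracted for every packed kernel value kv, so only
// input-dependent corrections remain to be applied at run time. The raw uint8_t
// weights are stored unmodified after the bias block.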
1780void xnn_pack_qu8_dwconv_ghw_w(
1781 size_t primary_tile,
1782 size_t h,
1783 size_t w,
1784 size_t c,
1785 size_t cr,
1786 const uint8_t* k,
1787 const int32_t* b,
1788 void* packed_weights,
1789 size_t extra_bytes,
1790 const struct xnn_qu8_packing_params* params)
1791{
1792 assert(k != NULL);
1793 assert(packed_weights != NULL);
1794
1795 const int32_t izp = (int32_t) params->input_zero_point;
1796 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
1797 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1798 const size_t cr_block_size = min(c - cr_block_start, cr);
1799 int32_t* packed_b = (int32_t*) packed_weights;
1800 if XNN_LIKELY(b != NULL) {
1801 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1802 unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
1803 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1804 }
1805 } else {
1806 size_t n = cr_block_size;
1807 do {
1808 unaligned_store_s32(packed_weights, boff);
1809 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1810 } while (--n != 0);
1811 }
1812 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
1813 for (size_t x = 0; x < w; x++) {
1814 for (size_t y = 0; y < h; y++) {
1815 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1816 const uint8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1817 unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
1818 *((uint8_t*) packed_weights) = kv;
1819 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
1820 }
1821 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(uint8_t));
1822 }
1823 }
1824 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
1825 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1826 }
1827}
1828
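// QS8 depthwise packing in GHW layout. Like the QU8 variant, but the kernel is
// treated as symmetric (no kernel zero point): each bias slot is reduced by
// input_zero_point times the sum of that channel's weights, computed in uint32_t
// arithmetic so any overflow wraps. The raw int8_t weights follow the bias block.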
1829void xnn_pack_qs8_dwconv_ghw_w(
1830 size_t primary_tile,
1831 size_t h,
1832 size_t w,
1833 size_t c,
1834 size_t cr,
1835 const int8_t* k,
1836 const int32_t* b,
1837 void* packed_weights,
1838 size_t extra_bytes,
1839 const struct xnn_qs8_packing_params* params)
1840{
1841 assert(k != NULL);
1842 assert(packed_weights != NULL);
1843
1844 const uint32_t izp = (uint32_t) params->input_zero_point;
1845 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
1846 const size_t cr_block_size = min(c - cr_block_start, cr);
1847 int32_t* packed_b = (int32_t*) packed_weights;
1848 if XNN_LIKELY(b != NULL) {
1849 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1850 unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
1851 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1852 }
1853 } else {
1854 size_t n = cr_block_size;
1855 do {
1856 unaligned_store_s32(packed_weights, 0);
1857 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
1858 } while (--n != 0);
1859 }
1860 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
1861 for (size_t x = 0; x < w; x++) {
1862 for (size_t y = 0; y < h; y++) {
1863 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1864 const int8_t kv = k[((cr_block_start + cr_block_offset) * h + y) * w + x];
1865 unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
1866 *((int8_t*) packed_weights) = kv;
1867 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
1868 }
1869 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int8_t));
1870 }
1871 }
1872 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
1873 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
1874 }
1875}
1876
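// Single-pass HWG packing, implemented as a wrapper over the multipass packer:
// middle_pass_tile and last_pass_tile are zero, and channel_tile, channel_subtile
// and channel_round all equal cr, so the whole kernel is emitted in the first pass.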
1877void xnn_pack_f32_dwconv_hwg_w(
1878 size_t primary_tile,
1879 size_t h,
1880 size_t w,
1881 size_t c,
1882 size_t cr,
1883 const float* k,
1884 const float* b,
1885 float* packed_weights,
1886 size_t extra_bytes,
1887 const void* params)
1888{
1889 assert(primary_tile >= h * w);
1890 xnn_pack_f32_dwconv_multipass_hwg_w(
1891 primary_tile,
1892 /*middle_pass_tile=*/0,
1893 /*last_pass_tile=*/0,
1894 h,
1895 w,
1896 c,
1897 cr,
1898 cr,
1899 cr,
1900 k,
1901 b,
1902 packed_weights,
1903 extra_bytes,
1904 params);
1905}
1906
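// Multipass HWG packing: the kernel taps are split into a first pass of up to
// first_pass_tile taps, zero or more middle passes of middle_pass_tile taps, and
// a last pass of up to last_pass_tile taps. Within each pass, channels are packed
// in blocks of channel_tile up to tiled_c and in blocks of channel_subtile for the
// remainder. k is indexed as [height, width, channels], with y advancing fastest
// across taps.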
1907void xnn_pack_f32_dwconv_multipass_hwg_w(
1908 size_t first_pass_tile,
1909 size_t middle_pass_tile,
1910 size_t last_pass_tile,
1911 size_t h,
1912 size_t w,
1913 size_t c,
1914 size_t channel_tile,
1915 size_t channel_subtile,
1916 size_t channel_round,
1917 const float* k,
1918 const float* b,
1919 float* packed_weights,
1920 size_t extra_bytes,
1921 const void* params)
1922{
1923 assert(k != NULL);
1924 assert(packed_weights != NULL);
1925 size_t kernel_size = h * w;
1926 if (middle_pass_tile == 0) {
1927 // Uni-pass DWCONV.
1928 assert(last_pass_tile == 0);
1929 } else {
1930 // Multi-pass DWCONV.
1931 assert(kernel_size > first_pass_tile);
1932 }
1933
1934  // Stores the x and y indices that should be processed next.
1935 size_t processed_x = 0;
1936 size_t processed_y = 0;
1937 size_t x = 0;
1938 size_t y = 0;
1939  // The first and middle passes pack in blocks of channel_tile up to tiled_c, then in blocks of channel_subtile.
1940 const size_t tiled_c = round_down_po2(round_up_po2(c, channel_round), channel_tile);
1941
1942 // Pack in blocks of channel_tile, then in blocks of channel_subtile.
1943 {
1944 size_t cr_block_start = 0;
1945 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
1946 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
1947 if XNN_LIKELY(b != NULL) {
1948 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1949 *packed_weights++ = b[cr_block_start + cr_block_offset];
1950 }
1951 } else {
1952 size_t n = cr_block_size;
1953 do {
1954 *packed_weights++ = 0.0f;
1955 } while (--n != 0);
1956 }
1957 packed_weights += channel_tile - cr_block_size;
1958
1959 x = processed_x;
1960 y = processed_y;
1961      // kernel_size can be less than first_pass_tile; in this case, pack up
1962      // to the smaller of the two.
1963 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1964 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1965 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1966 *packed_weights++ = kv;
1967 }
1968 packed_weights += channel_tile - cr_block_size;
1969 if (++y == h) {
1970 y = 0;
1971 x++;
1972 }
1973 }
1974 // And make sure to skip weights if kernel_size < first_pass_tile.
1975 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
1976 }
1977 for (; cr_block_start < c; cr_block_start += channel_subtile) {
1978 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
1979 if XNN_LIKELY(b != NULL) {
1980 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1981 *packed_weights++ = b[cr_block_start + cr_block_offset];
1982 }
1983 } else {
1984 size_t n = cr_block_size;
1985 do {
1986 *packed_weights++ = 0.0f;
1987 } while (--n != 0);
1988 }
1989 packed_weights += channel_subtile - cr_block_size;
1990
1991 x = processed_x;
1992 y = processed_y;
1993      // kernel_size can be less than first_pass_tile; in this case, pack up
1994      // to the smaller of the two.
1995 for (size_t i = 0; i < min(first_pass_tile, kernel_size); i++) {
1996 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
1997 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
1998 *packed_weights++ = kv;
1999 }
2000 packed_weights += channel_subtile - cr_block_size;
2001 if (++y == h) {
2002 y = 0;
2003 x++;
2004 }
2005 }
2006 // And make sure to skip weights if kernel_size < first_pass_tile.
2007 packed_weights += doz(first_pass_tile, kernel_size) * cr_block_size;
2008 }
2009 }
2010
2011 if (kernel_size <= first_pass_tile) {
2012 return;
2013 }
2014
2015 kernel_size -= first_pass_tile;
2016
2017 processed_x = x;
2018 processed_y = y;
2019
2020  // Middle passes: (kernel_size / middle_pass_tile) blocks, each block holding
2021  // middle_pass_tile * cr weights.
2022 for (; kernel_size > last_pass_tile; kernel_size -= middle_pass_tile) {
2023 assert(kernel_size >= middle_pass_tile);
2024 size_t cr_block_start = 0;
2025 for (; cr_block_start < tiled_c; cr_block_start += channel_tile) {
2026 x = processed_x;
2027 y = processed_y;
2028 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2029 for (size_t j = 0; j < middle_pass_tile; j++) {
2030 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2031 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2032 *packed_weights++ = kv;
2033 }
2034 packed_weights += channel_tile - cr_block_size;
2035 if (++y == h) {
2036 y = 0;
2037 x++;
2038 }
2039 }
2040 }
2041 for (; cr_block_start < c; cr_block_start += channel_subtile) {
2042 x = processed_x;
2043 y = processed_y;
2044 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2045 for (size_t j = 0; j < middle_pass_tile; j++) {
2046 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2047 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2048 *packed_weights++ = kv;
2049 }
2050 packed_weights += channel_subtile - cr_block_size;
2051 if (++y == h) {
2052 y = 0;
2053 x++;
2054 }
2055 }
2056 }
2057 processed_x = x;
2058 processed_y = y;
2059 }
2060
2061 // Last pass.
2062 {
2063 assert(kernel_size <= last_pass_tile);
2064 size_t cr_block_start = 0;
2065 for (; cr_block_start < round_down_po2(c, channel_tile); cr_block_start += channel_tile) {
2066      // Last pass does not pack to rounded c, since it handles remainder.
2067 x = processed_x;
2068 y = processed_y;
2069 const size_t cr_block_size = min(c - cr_block_start, channel_tile);
2070 for (size_t i = 0; i < kernel_size; i++) {
2071 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2072 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2073 *packed_weights++ = kv;
2074 }
2075 packed_weights += channel_tile - cr_block_size;
2076 if (++y == h) {
2077 y = 0;
2078 x++;
2079 }
2080 }
2081 // Pad so that we can always read last_pass_tile weights in the last pass.
2082 packed_weights += (last_pass_tile - kernel_size) * channel_tile;
2083 // TODO(zhin): support extra bytes for channel_tile and subtile.
2084 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
2085 }
2086 for (; cr_block_start < c; cr_block_start += channel_subtile) {
2087 x = processed_x;
2088 y = processed_y;
2089 const size_t cr_block_size = min(c - cr_block_start, channel_subtile);
2090 for (size_t i = 0; i < kernel_size; i++) {
2091 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2092 const float kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2093 *packed_weights++ = kv;
2094 }
2095 packed_weights += channel_subtile - cr_block_size;
2096 if (++y == h) {
2097 y = 0;
2098 x++;
2099 }
2100 }
2101 // Pad so that we can always read last_pass_tile weights in the last pass.
2102 packed_weights += (last_pass_tile - kernel_size) * channel_subtile;
2103 // TODO(zhin): support extra bytes for channel_tile and subtile.
2104 packed_weights = (float*) ((uintptr_t) packed_weights + extra_bytes);
2105 }
2106 }
2107}
2108
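// HWG counterpart of xnn_pack_f16_dwconv_ghw_w: identical packed layout, but the
// kernel is indexed as [height, width, channels].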
2109void xnn_pack_f16_dwconv_hwg_w(
2110 size_t primary_tile,
2111 size_t h,
2112 size_t w,
2113 size_t c,
2114 size_t cr,
2115 const uint16_t* k,
2116 const uint16_t* b,
2117 uint16_t* packed_weights,
2118 size_t extra_bytes,
2119 const void* params)
2120{
2121 assert(k != NULL);
2122 assert(packed_weights != NULL);
2123
2124 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2125 const size_t cr_block_size = min(c - cr_block_start, cr);
2126 if XNN_LIKELY(b != NULL) {
2127 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2128 *packed_weights++ = b[cr_block_start + cr_block_offset];
2129 }
2130 } else {
2131 size_t n = cr_block_size;
2132 do {
2133 *packed_weights++ = 0;
2134 } while (--n != 0);
2135 }
2136 packed_weights += cr - cr_block_size;
2137 for (size_t x = 0; x < w; x++) {
2138 for (size_t y = 0; y < h; y++) {
2139 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2140 const uint16_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2141 *packed_weights++ = kv;
2142 }
2143 packed_weights += cr - cr_block_size;
2144 }
2145 }
2146 packed_weights += (primary_tile - (h * w)) * cr_block_size;
2147 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
2148 }
2149}
2150
2151void xnn_pack_f32_to_f16_dwconv_hwg_w(
2152 size_t primary_tile,
2153 size_t h,
2154 size_t w,
2155 size_t c,
2156 size_t cr,
2157 const float* k,
2158 const float* b,
2159 uint16_t* packed_weights,
2160 size_t extra_bytes,
2161 const void* params)
2162{
2163 assert(k != NULL);
2164 assert(packed_weights != NULL);
2165
2166 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2167 const size_t cr_block_size = min(c - cr_block_start, cr);
2168 if XNN_LIKELY(b != NULL) {
2169 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2170 *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2171 }
2172 } else {
2173 size_t n = cr_block_size;
2174 do {
2175 *packed_weights++ = 0;
2176 } while (--n != 0);
2177 }
2178 packed_weights += cr - cr_block_size;
2179 for (size_t x = 0; x < w; x++) {
2180 for (size_t y = 0; y < h; y++) {
2181 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2182 const uint16_t kv = fp16_ieee_from_fp32_value(k[(y * w + x) * c + (cr_block_start + cr_block_offset)]);
2183 *packed_weights++ = kv;
2184 }
2185 packed_weights += cr - cr_block_size;
2186 }
2187 }
2188 packed_weights += (primary_tile - (h * w)) * cr_block_size;
2189 packed_weights = (uint16_t*) ((uintptr_t) packed_weights + extra_bytes);
2190 }
2191}
2192
2193void xnn_pack_qu8_dwconv_hwg_w(
2194 size_t primary_tile,
2195 size_t h,
2196 size_t w,
2197 size_t c,
2198 size_t cr,
2199 const uint8_t* k,
2200 const int32_t* b,
2201 void* packed_weights,
2202 size_t extra_bytes,
2203 const struct xnn_qu8_packing_params* params)
2204{
2205 assert(k != NULL);
2206 assert(packed_weights != NULL);
2207
2208 const int32_t izp = (int32_t) params->input_zero_point;
2209 const int32_t boff = (int32_t) h * (int32_t) w * izp * (int32_t) params->kernel_zero_point;
2210 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2211 const size_t cr_block_size = min(c - cr_block_start, cr);
2212 int32_t* packed_b = (int32_t*) packed_weights;
2213 if XNN_LIKELY(b != NULL) {
2214 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2215 unaligned_store_s32(packed_weights, boff + b[cr_block_start + cr_block_offset]);
2216 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2217 }
2218 } else {
2219 size_t n = cr_block_size;
2220 do {
2221 unaligned_store_s32(packed_weights, boff);
2222 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2223 } while (--n != 0);
2224 }
2225 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
2226 for (size_t x = 0; x < w; x++) {
2227 for (size_t y = 0; y < h; y++) {
2228 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2229 const uint8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2230 unaligned_indexed_store_s32(packed_b, cr_block_offset, unaligned_indexed_load_s32(packed_b, cr_block_offset) - (int32_t) kv * izp);
2231 *((uint8_t*) packed_weights) = kv;
2232 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(uint8_t));
2233 }
2234 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(uint8_t));
2235 }
2236 }
2237 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(uint8_t));
2238 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
2239 }
2240}
2241
2242void xnn_pack_qs8_dwconv_hwg_w(
2243 size_t primary_tile,
2244 size_t h,
2245 size_t w,
2246 size_t c,
2247 size_t cr,
2248 const int8_t* k,
2249 const int32_t* b,
2250 void* packed_weights,
2251 size_t extra_bytes,
2252 const struct xnn_qs8_packing_params* params)
2253{
2254 assert(k != NULL);
2255 assert(packed_weights != NULL);
2256
2257  const uint32_t izp = (uint32_t) params->input_zero_point;
2258 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2259 const size_t cr_block_size = min(c - cr_block_start, cr);
2260 int32_t* packed_b = (int32_t*) packed_weights;
2261 if XNN_LIKELY(b != NULL) {
2262 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2263 unaligned_store_s32(packed_weights, b[cr_block_start + cr_block_offset]);
2264 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2265 }
2266 } else {
2267 size_t n = cr_block_size;
2268 do {
2269 unaligned_store_s32(packed_weights, 0);
2270 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int32_t));
2271 } while (--n != 0);
2272 }
2273 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int32_t));
2274 for (size_t x = 0; x < w; x++) {
2275 for (size_t y = 0; y < h; y++) {
2276 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2277 const int8_t kv = k[(y * w + x) * c + (cr_block_start + cr_block_offset)];
2278 unaligned_indexed_store_u32(packed_b, cr_block_offset, unaligned_indexed_load_u32(packed_b, cr_block_offset) - (uint32_t) kv * izp);
2279 *((int8_t*) packed_weights) = kv;
2280 packed_weights = (void*) ((uintptr_t) packed_weights + sizeof(int8_t));
2281 }
2282 packed_weights = (void*) ((uintptr_t) packed_weights + (cr - cr_block_size) * sizeof(int8_t));
2283 }
2284 }
2285 packed_weights = (void*) ((uintptr_t) packed_weights + (primary_tile - (h * w)) * cr_block_size * sizeof(int8_t));
2286 packed_weights = (void*) ((uintptr_t) packed_weights + extra_bytes);
2287 }
2288}
2289
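// Same packing as xnn_pack_f32_gemm_goi_w, minus the per-block bias segment and
// the extra_bytes padding; GEMMINC microkernels read their initial accumulators
// from a separate buffer instead.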
2290void xnn_pack_f32_gemminc_goi_w(
2291 size_t g,
2292 size_t nc,
2293 size_t kc,
2294 size_t nr,
2295 size_t kr,
2296 size_t sr,
2297 const float* k,
2298 float* packed_weights,
2299 const void* params)
2300{
2301 assert(g != 0);
2302 assert(nr >= sr);
2303 assert(k != NULL);
2304 assert(packed_weights != NULL);
2305
2306 const size_t skr = sr * kr;
2307 do {
2308 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2309 const size_t nr_block_size = min(nc - nr_block_start, nr);
2310
2311 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
2312 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
2313 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
2314 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
2315 if (kc_idx < kc) {
2316 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
2317 }
2318 }
2319 packed_weights += kr;
2320 }
2321 packed_weights += (nr - nr_block_size) * kr;
2322 }
2323 }
2324 k += nc * kc;
2325 } while (--g != 0);
2326}
2327
2328void xnn_pack_f16_gemminc_goi_w(
2329 size_t g,
2330 size_t nc,
2331 size_t kc,
2332 size_t nr,
2333 size_t kr,
2334 size_t sr,
2335 const uint16_t* k,
2336 uint16_t* packed_weights,
2337 const void* params)
2338{
2339 assert(g != 0);
2340 assert(nr >= sr);
2341 assert(k != NULL);
2342 assert(packed_weights != NULL);
2343
2344 const size_t skr = sr * kr;
2345 do {
2346 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2347 const size_t nr_block_size = min(nc - nr_block_start, nr);
2348
2349 for (size_t kr_block_start = 0; kr_block_start < round_up_po2(kc, skr); kr_block_start += kr) {
2350 for (size_t nr_block_offset = 0; nr_block_offset < nr_block_size; nr_block_offset++) {
2351 for (size_t kr_block_offset = 0; kr_block_offset < kr; kr_block_offset++) {
2352 const size_t kc_idx = round_down_po2(kr_block_start, skr) + ((kr_block_start + kr_block_offset + nr_block_offset * kr) & (skr - 1));
2353 if (kc_idx < kc) {
2354 packed_weights[kr_block_offset] = k[(nr_block_start + nr_block_offset) * kc + kc_idx];
2355 }
2356 }
2357 packed_weights += kr;
2358 }
2359 packed_weights += (nr - nr_block_size) * kr;
2360 }
2361 }
2362 k += nc * kc;
2363 } while (--g != 0);
2364}
2365
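// Packs weights for direct convolution (DConv) in OKI order. Each block covers nr
// output channels: nr bias values come first (partial blocks replicate the last
// valid channel instead of zero-padding), followed by the weights ordered by
// kernel x, input channel, kernel y, with the same replication across the nr lanes.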
2366void xnn_pack_f32_dconv_oki_w(
2367 size_t nc,
2368 size_t kc,
2369 size_t nr,
2370 size_t kh,
2371 size_t kw,
2372 const float* k,
2373 const float* b,
2374 float* packed_weights,
2375 const void* params)
2376{
2377 assert(k != NULL);
2378 assert(packed_weights != NULL);
2379
2380 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2381 const size_t nr_block_size = min(nc - nr_block_start, nr);
2382 if XNN_LIKELY(b != NULL) {
2383 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2384 *packed_weights++ = b[min(nr_block_offset, nr_block_size - 1)];
2385 }
2386 } else {
2387 size_t n = nr;
2388 do {
2389 *packed_weights++ = 0.0f;
2390 } while (--n != 0);
2391 }
2392
2393 for (size_t kx = 0; kx < kw; kx++) {
2394 for (size_t c = 0; c < kc; c++) {
2395 for (size_t ky = 0; ky < kh; ky++) {
2396 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2397 *packed_weights++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
2398 }
2399 }
2400 }
2401 }
2402 if XNN_UNPREDICTABLE(b != NULL) {
2403 b += nr;
2404 }
2405 }
2406}
2407
2408void xnn_pack_f32_to_f16_dconv_oki_w(
2409 size_t nc,
2410 size_t kc,
2411 size_t nr,
2412 size_t kh,
2413 size_t kw,
2414 const float* k,
2415 const float* b,
2416 uint16_t* packed_weights,
2417 const void* params)
2418{
2419 assert(k != NULL);
2420 assert(packed_weights != NULL);
2421
2422 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2423 const size_t nr_block_size = min(nc - nr_block_start, nr);
2424 if XNN_LIKELY(b != NULL) {
2425 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2426 *packed_weights++ = fp16_ieee_from_fp32_value(b[min(nr_block_offset, nr_block_size - 1)]);
2427 }
2428 } else {
2429 size_t n = nr;
2430 do {
2431 *packed_weights++ = 0;
2432 } while (--n != 0);
2433 }
2434
2435 for (size_t kx = 0; kx < kw; kx++) {
2436 for (size_t c = 0; c < kc; c++) {
2437 for (size_t ky = 0; ky < kh; ky++) {
2438 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2439 *packed_weights++ = fp16_ieee_from_fp32_value(k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c]);
2440 }
2441 }
2442 }
2443 }
2444 if XNN_UNPREDICTABLE(b != NULL) {
2445 b += nr;
2446 }
2447 }
2448}
2449
2450void xnn_pack_f16_dconv_oki_w(
2451 size_t nc,
2452 size_t kc,
2453 size_t nr,
2454 size_t kh,
2455 size_t kw,
2456 const uint16_t* k,
2457 const uint16_t* b,
2458 uint16_t* packed_weights,
2459 const void* params)
2460{
2461 assert(k != NULL);
2462 assert(packed_weights != NULL);
2463
2464 for (size_t nr_block_start = 0; nr_block_start < nc; nr_block_start += nr) {
2465 const size_t nr_block_size = min(nc - nr_block_start, nr);
2466 if XNN_LIKELY(b != NULL) {
2467 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2468 *packed_weights++ = b[min(nr_block_offset, nr_block_size - 1)];
2469 }
2470 } else {
2471 size_t n = nr;
2472 do {
2473 *packed_weights++ = 0;
2474 } while (--n != 0);
2475 }
2476
2477 for (size_t kx = 0; kx < kw; kx++) {
2478 for (size_t c = 0; c < kc; c++) {
2479 for (size_t ky = 0; ky < kh; ky++) {
2480 for (size_t nr_block_offset = 0; nr_block_offset < nr; nr_block_offset++) {
2481 *packed_weights++ = k[(((nr_block_start + min(nr_block_offset, nr_block_size - 1)) * kh + ky) * kw + kx) * kc + c];
2482 }
2483 }
2484 }
2485 }
2486 if XNN_UNPREDICTABLE(b != NULL) {
2487 b += nr;
2488 }
2489 }
2490}
2491
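// CHW depthwise packing: for each group, a single bias value (zero when b is NULL)
// followed by that group's kernel_size weights; there is no channel tiling or padding.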
2492void xnn_pack_f32_chw_dwconv_ghw_w(
2493 size_t kernel_size,
2494 size_t groups,
2495 const float* k,
2496 const float* b,
2497 float* packed_weights,
2498 const void* params)
2499{
2500 assert(k != NULL);
2501 assert(packed_weights != NULL);
2502
2503 for (size_t g = 0; g < groups; g++) {
2504 if XNN_LIKELY(b != NULL) {
2505 *packed_weights = *b++;
2506 } else {
2507 *packed_weights = 0.0f;
2508 }
2509 packed_weights += 1;
2510 for (size_t i = 0; i < kernel_size; i++) {
2511 *packed_weights++ = k[g * kernel_size + i];
2512 }
2513 }
2514}
2515
2516void xnn_pack_f32_to_f16_chw_dwconv_ghw_w(
2517 size_t kernel_size,
2518 size_t groups,
2519 const float* k,
2520 const float* b,
2521 uint16_t* packed_weights,
2522 const void* params)
2523{
2524 assert(k != NULL);
2525 assert(packed_weights != NULL);
2526
2527 for (size_t g = 0; g < groups; g++) {
2528 if XNN_LIKELY(b != NULL) {
2529 *packed_weights = fp16_ieee_from_fp32_value(*b++);
2530 } else {
2531 *packed_weights = 0;
2532 }
2533 packed_weights += 1;
2534 for (size_t i = 0; i < kernel_size; i++) {
2535 *packed_weights++ = fp16_ieee_from_fp32_value(k[g * kernel_size + i]);
2536 }
2537 }
2538}
2539
2540void xnn_pack_f16_chw_dwconv_ghw_w(
2541 size_t kernel_size,
2542 size_t groups,
2543 const uint16_t* k,
2544 const uint16_t* b,
2545 uint16_t* packed_weights,
2546 const void* params)
2547{
2548 assert(k != NULL);
2549 assert(packed_weights != NULL);
2550
2551 for (size_t g = 0; g < groups; g++) {
2552 if XNN_LIKELY(b != NULL) {
2553 *packed_weights = *b++;
2554 } else {
2555 *packed_weights = 0;
2556 }
2557 packed_weights += 1;
2558 for (size_t i = 0; i < kernel_size; i++) {
2559 *packed_weights++ = k[g * kernel_size + i];
2560 }
2561 }
2562}
2563
2564void xnn_pack_f32_chw_dwconv_hwg_w(
2565 size_t kernel_size,
2566 size_t groups,
2567 const float* k,
2568 const float* b,
2569 float* packed_weights,
2570 const void* params)
2571{
2572 assert(k != NULL);
2573 assert(packed_weights != NULL);
2574
2575 for (size_t g = 0; g < groups; g++) {
2576 if XNN_LIKELY(b != NULL) {
2577 *packed_weights = *b++;
2578 } else {
2579 *packed_weights = 0.0f;
2580 }
2581 packed_weights += 1;
2582 for (size_t i = 0; i < kernel_size; i++) {
2583 *packed_weights++ = k[i * groups + g];
2584 }
2585 }
2586}
2587
2588void xnn_pack_f16_chw_dwconv_hwg_w(
2589 size_t kernel_size,
2590 size_t groups,
2591 const uint16_t* k,
2592 const uint16_t* b,
2593 uint16_t* packed_weights,
2594 const void* params)
2595{
2596 assert(k != NULL);
2597 assert(packed_weights != NULL);
2598
2599 for (size_t g = 0; g < groups; g++) {
2600 if XNN_LIKELY(b != NULL) {
2601 *packed_weights = *b++;
2602 } else {
2603 *packed_weights = 0;
2604 }
2605 packed_weights += 1;
2606 for (size_t i = 0; i < kernel_size; i++) {
2607 *packed_weights++ = k[i * groups + g];
2608 }
2609 }
2610}
2611
2612void xnn_pack_f32_to_f16_chw_dwconv_hwg_w(
2613 size_t kernel_size,
2614 size_t groups,
2615 const float* k,
2616 const float* b,
2617 uint16_t* packed_weights,
2618 const void* params)
2619{
2620 assert(k != NULL);
2621 assert(packed_weights != NULL);
2622
2623 for (size_t g = 0; g < groups; g++) {
2624 if XNN_LIKELY(b != NULL) {
2625 *packed_weights = fp16_ieee_from_fp32_value(*b++);
2626 } else {
2627 *packed_weights = 0;
2628 }
2629 packed_weights += 1;
2630 for (size_t i = 0; i < kernel_size; i++) {
2631 *packed_weights++ = fp16_ieee_from_fp32_value(k[i * groups + g]);
2632 }
2633 }
2634}
2635
2636
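// Packs per-channel scale and bias for VMULCADDC: for each block of cr channels,
// cr scale values followed by cr bias values (zeros when b is NULL), with partial
// blocks padded out to cr.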
2637void xnn_pack_f32_vmulcaddc_w(
2638 size_t c,
2639 size_t cr,
2640 const float* s,
2641 const float* b,
2642 float* packed_weights,
2643 const void* params)
2644{
2645 assert(s != NULL);
2646 assert(packed_weights != NULL);
2647
2648 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2649 const size_t cr_block_size = min(c - cr_block_start, cr);
2650 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2651 *packed_weights++ = s[cr_block_start + cr_block_offset];
2652 }
2653 packed_weights += cr - cr_block_size;
2654 if XNN_LIKELY(b != NULL) {
2655 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2656 *packed_weights++ = b[cr_block_start + cr_block_offset];
2657 }
2658 } else {
2659 size_t n = cr_block_size;
2660 do {
2661 *packed_weights++ = 0.0f;
2662 } while (--n != 0);
2663 }
2664 packed_weights += cr - cr_block_size;
2665 }
2666}
2667
2668void xnn_pack_f16_vmulcaddc_w(
2669 size_t c,
2670 size_t cr,
2671 const uint16_t* s,
2672 const uint16_t* b,
2673 uint16_t* packed_weights,
2674 const void* params)
2675{
2676 assert(s != NULL);
2677 assert(packed_weights != NULL);
2678
2679 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2680 const size_t cr_block_size = min(c - cr_block_start, cr);
2681 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2682 *packed_weights++ = s[cr_block_start + cr_block_offset];
2683 }
2684 packed_weights += cr - cr_block_size;
2685 if XNN_LIKELY(b != NULL) {
2686 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2687 *packed_weights++ = b[cr_block_start + cr_block_offset];
2688 }
2689 } else {
2690 size_t n = cr_block_size;
2691 do {
2692 *packed_weights++ = 0;
2693 } while (--n != 0);
2694 }
2695 packed_weights += cr - cr_block_size;
2696 }
2697}
2698
2699void xnn_pack_f32_to_f16_vmulcaddc_w(
2700 size_t c,
2701 size_t cr,
2702 const float* s,
2703 const float* b,
2704 uint16_t* packed_weights,
2705 const void* params)
2706{
2707 assert(s != NULL);
2708 assert(packed_weights != NULL);
2709
2710 for (size_t cr_block_start = 0; cr_block_start < c; cr_block_start += cr) {
2711 const size_t cr_block_size = min(c - cr_block_start, cr);
2712 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2713 *packed_weights++ = fp16_ieee_from_fp32_value(s[cr_block_start + cr_block_offset]);
2714 }
2715 packed_weights += cr - cr_block_size;
2716 if XNN_LIKELY(b != NULL) {
2717 for (size_t cr_block_offset = 0; cr_block_offset < cr_block_size; cr_block_offset++) {
2718 *packed_weights++ = fp16_ieee_from_fp32_value(b[cr_block_start + cr_block_offset]);
2719 }
2720 } else {
2721 size_t n = cr_block_size;
2722 do {
2723 *packed_weights++ = 0;
2724 } while (--n != 0);
2725 }
2726 packed_weights += cr - cr_block_size;
2727 }
2728}
2729
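// PReLU slopes need no reordering: packing is a plain copy, with fp32-to-fp16
// conversion in the f32-to-f16 variant.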
2730void xnn_pack_f32_prelu_w(
2731 size_t c,
2732 const float* s,
2733 float* packed_weights)
2734{
2735 assert(s != NULL);
2736 assert(packed_weights != NULL);
2737
2738 memcpy(packed_weights, s, c * sizeof(float));
2739}
2740
2741void xnn_pack_f16_prelu_w(
2742 size_t c,
2743 const uint16_t* s,
2744 uint16_t* packed_weights)
2745{
2746 assert(s != NULL);
2747 assert(packed_weights != NULL);
2748
2749 memcpy(packed_weights, s, c * sizeof(uint16_t));
2750}
2751
2752void xnn_pack_f32_to_f16_prelu_w(
2753 size_t c,
2754 const float* s,
2755 uint16_t* packed_weights)
2756{
2757 assert(s != NULL);
2758 assert(packed_weights != NULL);
2759
2760 do {
2761 *packed_weights++ = fp16_ieee_from_fp32_value(*s++);
2762 } while (--c != 0);
2763}
2764
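// Scans a dense [group_output_channels, group_input_channels] fp32 kernel and
// reports its nonzero statistics: the total nonzero count, the number of 2- and
// 4-row column blocks containing at least one nonzero, and the nonzero counts
// restricted to the rows covered by those blockings. Callers can compare these
// counts to choose an output-channel blocking for the sparse packing below.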
2765void xnn_analyze_f32_spmm(
2766 size_t group_output_channels,
2767 size_t group_input_channels,
2768 const float* kernel,
2769 struct xnn_spmm_packing_params* params)
2770{
2771 assert(kernel != NULL);
2772 assert(params != NULL);
2773
2774 // Count number of non-zero values.
2775 size_t num_nonzeroes = 0;
2776 size_t num_nonzero_blocks2 = 0;
2777 size_t num_nonzero_blocks4 = 0;
2778 for (size_t oc = 0; oc < round_down_po2(group_output_channels, 4); oc += 4) {
2779 for (size_t ic = 0; ic < group_input_channels; ic++) {
2780 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0.0f);
2781 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0.0f);
2782 const size_t row2_nonzero = (size_t) (kernel[(oc + 2) * group_input_channels + ic] != 0.0f);
2783 const size_t row3_nonzero = (size_t) (kernel[(oc + 3) * group_input_channels + ic] != 0.0f);
2784 num_nonzeroes += row0_nonzero + row1_nonzero + row2_nonzero + row3_nonzero;
2785 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero) + (row2_nonzero | row3_nonzero);
2786 num_nonzero_blocks4 += (row0_nonzero | row1_nonzero | row2_nonzero | row3_nonzero);
2787 }
2788 }
2789 const size_t num_block4_nonzeroes = num_nonzeroes;
2790 for (size_t oc = round_down_po2(group_output_channels, 4); oc < round_down_po2(group_output_channels, 2); oc += 2) {
2791 for (size_t ic = 0; ic < group_input_channels; ic++) {
2792 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0.0f);
2793 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0.0f);
2794 num_nonzeroes += row0_nonzero + row1_nonzero;
2795 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero);
2796 }
2797 }
2798 const size_t num_block2_nonzeroes = num_nonzeroes;
2799 for (size_t oc = round_down_po2(group_output_channels, 2); oc < group_output_channels; oc++) {
2800 for (size_t ic = 0; ic < group_input_channels; ic++) {
2801 num_nonzeroes += (size_t) (kernel[oc * group_input_channels + ic] != 0.0f);
2802 }
2803 }
2804 params->num_nonzeroes = num_nonzeroes;
2805 params->num_nonzero_blocks2 = num_nonzero_blocks2;
2806 params->num_nonzero_blocks4 = num_nonzero_blocks4;
2807 params->num_block2_nonzeroes = num_block2_nonzeroes;
2808 params->num_block4_nonzeroes = num_block4_nonzeroes;
2809}
2810
2811void xnn_analyze_f16_spmm(
2812 size_t group_output_channels,
2813 size_t group_input_channels,
2814 const uint16_t* kernel,
2815 struct xnn_spmm_packing_params* params)
2816{
2817 assert(kernel != NULL);
2818 assert(params != NULL);
2819
2820 // Count number of non-zero values.
2821 size_t num_nonzeroes = 0;
2822 size_t num_nonzero_blocks2 = 0;
2823 size_t num_nonzero_blocks4 = 0;
2824 for (size_t oc = 0; oc < round_down_po2(group_output_channels, 4); oc += 4) {
2825 for (size_t ic = 0; ic < group_input_channels; ic++) {
2826 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0);
2827 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0);
2828 const size_t row2_nonzero = (size_t) (kernel[(oc + 2) * group_input_channels + ic] != 0);
2829 const size_t row3_nonzero = (size_t) (kernel[(oc + 3) * group_input_channels + ic] != 0);
2830 num_nonzeroes += row0_nonzero + row1_nonzero + row2_nonzero + row3_nonzero;
2831 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero) + (row2_nonzero | row3_nonzero);
2832 num_nonzero_blocks4 += (row0_nonzero | row1_nonzero | row2_nonzero | row3_nonzero);
2833 }
2834 }
2835 const size_t num_block4_nonzeroes = num_nonzeroes;
2836 for (size_t oc = round_down_po2(group_output_channels, 4); oc < round_down_po2(group_output_channels, 2); oc += 2) {
2837 for (size_t ic = 0; ic < group_input_channels; ic++) {
2838 const size_t row0_nonzero = (size_t) (kernel[oc * group_input_channels + ic] != 0);
2839 const size_t row1_nonzero = (size_t) (kernel[(oc + 1) * group_input_channels + ic] != 0);
2840 num_nonzeroes += row0_nonzero + row1_nonzero;
2841 num_nonzero_blocks2 += (row0_nonzero | row1_nonzero);
2842 }
2843 }
2844 const size_t num_block2_nonzeroes = num_nonzeroes;
2845 for (size_t oc = round_down_po2(group_output_channels, 2); oc < group_output_channels; oc++) {
2846 for (size_t ic = 0; ic < group_input_channels; ic++) {
2847 num_nonzeroes += (size_t) (kernel[oc * group_input_channels + ic] != 0);
2848 }
2849 }
2850 params->num_nonzeroes = num_nonzeroes;
2851 params->num_nonzero_blocks2 = num_nonzero_blocks2;
2852 params->num_nonzero_blocks4 = num_nonzero_blocks4;
2853 params->num_block2_nonzeroes = num_block2_nonzeroes;
2854 params->num_block4_nonzeroes = num_block4_nonzeroes;
2855}
2856
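// Converts a dense fp32 kernel into the sparse (SpMM) representation: for every
// block of output_channels_block_size output channels, nonzero_values receives
// the block's biases (zeros when bias is NULL) followed by each input-channel
// column that holds at least one nonzero; input_channel_diffs records the byte
// distance (channel delta times the element size) between consecutive nonzero
// input channels, plus a final wrap-around back to the first one;
// output_channel_nonzeros, which the caller must zero-initialize, counts the
// nonzero columns for each output-channel block; *first_input_channel is set to
// the first nonzero input channel. Returns xnn_status_unsupported_parameter if
// any scaled difference overflows int32_t.
//
// A minimal calling sketch, assuming a 1x1 output-channel blocking (M, K, kernel
// and bias are placeholder names; buffer sizes follow from xnn_analyze_f32_spmm):
//
//   struct xnn_spmm_packing_params stats;
//   xnn_analyze_f32_spmm(M, K, kernel, &stats);
//   float* values = malloc((M + stats.num_nonzeroes) * sizeof(float));
//   int32_t* diffs = malloc(stats.num_nonzeroes * sizeof(int32_t));
//   uint32_t* nnz_per_oc = calloc(M, sizeof(uint32_t));  // must start at zero
//   size_t first_ic;
//   xnn_pack_f32_spmm(M, /*output_channels_block_size=*/1, K, kernel, bias,
//                     diffs, nnz_per_oc, values, &first_ic);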
2857enum xnn_status xnn_pack_f32_spmm(
2858 size_t group_output_channels,
2859 size_t output_channels_block_size,
2860 size_t group_input_channels,
2861 const float* kernel,
2862 const float* bias,
2863 int32_t* input_channel_diffs,
2864 uint32_t* output_channel_nonzeros,
2865 float* nonzero_values,
2866 size_t* first_input_channel)
2867{
2868 size_t first_ic = 0, last_ic = 0;
2869 bool first_nonzero = true;
2870 for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
2871 if XNN_LIKELY(bias != NULL) {
2872 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2873 *nonzero_values++ = bias[ocb + oco];
2874 }
2875 } else {
2876 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2877 *nonzero_values++ = 0.0f;
2878 }
2879 }
2880 for (size_t ic = 0; ic < group_input_channels; ic++) {
2881 bool is_nonzero_block = false;
2882 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2883 is_nonzero_block |= (kernel[(ocb + oco) * group_input_channels + ic] != 0.0f);
2884 }
2885 if (is_nonzero_block) {
2886 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2887 *nonzero_values++ = kernel[(ocb + oco) * group_input_channels + ic];
2888 }
2889 if (first_nonzero) {
2890 first_ic = ic;
2891 } else {
2892 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(float);
2893 if (diff != (int64_t) (int32_t) diff) {
2894 xnn_log_error("failed to convert kernel to sparse representation: "
2895 "scaled difference in input channels exceeds int32_t range");
2896 return xnn_status_unsupported_parameter;
2897 }
2898 *input_channel_diffs++ = (int32_t) diff;
2899 }
2900 first_nonzero = false;
2901 last_ic = ic;
2902 *output_channel_nonzeros += 1;
2903 }
2904 }
2905 output_channel_nonzeros += 1;
2906 }
2907 for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
2908 if XNN_LIKELY(bias != NULL) {
2909 *nonzero_values++ = bias[oc];
2910 } else {
2911 *nonzero_values++ = 0.0f;
2912 }
2913 for (size_t ic = 0; ic < group_input_channels; ic++) {
2914 const float weight = kernel[oc * group_input_channels + ic];
2915 if (weight != 0.0f) {
2916 *nonzero_values++ = weight;
2917 if (first_nonzero) {
2918 first_ic = ic;
2919 } else {
2920 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(float);
2921 if (diff != (int64_t) (int32_t) diff) {
2922 xnn_log_error("failed to convert kernel to sparse representation: "
2923 "scaled difference in input channels exceeds int32_t range");
2924 return xnn_status_unsupported_parameter;
2925 }
2926 *input_channel_diffs++ = (int32_t) diff;
2927 }
2928 first_nonzero = false;
2929 last_ic = ic;
2930 *output_channel_nonzeros += 1;
2931 }
2932 }
2933 output_channel_nonzeros += 1;
2934 }
2935 // If there are any non-zero elements, we have to return to the initial input channel.
2936 if (!first_nonzero) {
2937 const int64_t diff = (int64_t) ((uint64_t) first_ic - (uint64_t) last_ic) * (int64_t) sizeof(float);
2938 if (diff != (int64_t) (int32_t) diff) {
2939 xnn_log_error("failed to convert kernel to sparse representation: "
2940 "scaled difference in input channels exceeds int32_t range");
2941 return xnn_status_unsupported_parameter;
2942 }
2943 *input_channel_diffs++ = (int32_t) diff;
2944 }
2945 *first_input_channel = first_ic;
2946 return xnn_status_success;
2947}
2948
2949
2950enum xnn_status xnn_pack_f32_to_f16_spmm(
2951 size_t group_output_channels,
2952 size_t output_channels_block_size,
2953 size_t group_input_channels,
2954 const float* kernel,
2955 const float* bias,
2956 int32_t* input_channel_diffs,
2957 uint32_t* output_channel_nonzeros,
2958 uint16_t* nonzero_values, // fp16 values
2959 size_t* first_input_channel)
2960{
2961 size_t first_ic = 0, last_ic = 0;
2962 bool first_nonzero = true;
2963 for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
2964 if XNN_LIKELY(bias != NULL) {
2965 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2966 *nonzero_values++ = fp16_ieee_from_fp32_value(bias[ocb + oco]);
2967 }
2968 } else {
2969 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2970 *nonzero_values++ = 0;
2971 }
2972 }
2973 for (size_t ic = 0; ic < group_input_channels; ic++) {
2974 bool is_nonzero_block = false;
2975 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2976 is_nonzero_block |= (kernel[(ocb + oco) * group_input_channels + ic] != 0.0f);
2977 }
2978 if (is_nonzero_block) {
2979 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
2980 *nonzero_values++ = fp16_ieee_from_fp32_value(kernel[(ocb + oco) * group_input_channels + ic]);
2981 }
2982 if (first_nonzero) {
2983 first_ic = ic;
2984 } else {
2985 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
2986 if (diff != (int64_t) (int32_t) diff) {
2987 xnn_log_error("failed to convert kernel to sparse representation: "
2988 "scaled difference in input channels exceeds int32_t range");
2989 return xnn_status_unsupported_parameter;
2990 }
2991 *input_channel_diffs++ = (int32_t) diff;
2992 }
2993 first_nonzero = false;
2994 last_ic = ic;
2995 *output_channel_nonzeros += 1;
2996 }
2997 }
2998 output_channel_nonzeros += 1;
2999 }
3000 for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
3001 if XNN_LIKELY(bias != NULL) {
3002 *nonzero_values++ = fp16_ieee_from_fp32_value(bias[oc]);
3003 } else {
3004 *nonzero_values++ = 0;
3005 }
3006 for (size_t ic = 0; ic < group_input_channels; ic++) {
3007 const float weight = kernel[oc * group_input_channels + ic];
3008 if (weight != 0.0f) {
3009 *nonzero_values++ = fp16_ieee_from_fp32_value(weight);
3010 if (first_nonzero) {
3011 first_ic = ic;
3012 } else {
3013 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3014 if (diff != (int64_t) (int32_t) diff) {
3015 xnn_log_error("failed to convert kernel to sparse representation: "
3016 "scaled difference in input channels exceeds int32_t range");
3017 return xnn_status_unsupported_parameter;
3018 }
3019 *input_channel_diffs++ = (int32_t) diff;
3020 }
3021 first_nonzero = false;
3022 last_ic = ic;
3023 *output_channel_nonzeros += 1;
3024 }
3025 }
3026 output_channel_nonzeros += 1;
3027 }
3028 // If there are any non-zero elements, we have to return to the initial input channel.
3029 if (!first_nonzero) {
3030 const int64_t diff = (int64_t) ((uint64_t) first_ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3031 if (diff != (int64_t) (int32_t) diff) {
3032 xnn_log_error("failed to convert kernel to sparse representation: "
3033 "scaled difference in input channels exceeds int32_t range");
3034 return xnn_status_unsupported_parameter;
3035 }
3036 *input_channel_diffs++ = (int32_t) diff;
3037 }
3038 *first_input_channel = first_ic;
3039 return xnn_status_success;
3040}
3041
3042enum xnn_status xnn_pack_f16_spmm(
3043 size_t group_output_channels,
3044 size_t output_channels_block_size,
3045 size_t group_input_channels,
3046 const uint16_t* kernel, // fp16 values
3047 const uint16_t* bias, // fp16 values
3048 int32_t* input_channel_diffs,
3049 uint32_t* output_channel_nonzeros,
3050 uint16_t* nonzero_values, // fp16 values
3051 size_t* first_input_channel)
3052{
3053 size_t first_ic = 0, last_ic = 0;
3054 bool first_nonzero = true;
3055 for (size_t ocb = 0; ocb < round_down_po2(group_output_channels, output_channels_block_size); ocb += output_channels_block_size) {
3056 if XNN_LIKELY(bias != NULL) {
3057 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3058 *nonzero_values++ = bias[ocb + oco];
3059 }
3060 } else {
3061 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3062 *nonzero_values++ = 0;
3063 }
3064 }
3065 for (size_t ic = 0; ic < group_input_channels; ic++) {
3066 bool is_nonzero_block = false;
3067 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3068 is_nonzero_block |= (kernel[(ocb + oco) * group_input_channels + ic] != 0);
3069 }
3070 if (is_nonzero_block) {
3071 for (size_t oco = 0; oco < output_channels_block_size; oco++) {
3072 *nonzero_values++ = kernel[(ocb + oco) * group_input_channels + ic];
3073 }
3074 if (first_nonzero) {
3075 first_ic = ic;
3076 } else {
3077 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3078 if (diff != (int64_t) (int32_t) diff) {
3079 xnn_log_error("failed to convert kernel to sparse representation: "
3080 "scaled difference in input channels exceeds int32_t range");
3081 return xnn_status_unsupported_parameter;
3082 }
3083 *input_channel_diffs++ = (int32_t) diff;
3084 }
3085 first_nonzero = false;
3086 last_ic = ic;
3087 *output_channel_nonzeros += 1;
3088 }
3089 }
3090 output_channel_nonzeros += 1;
3091 }
3092 for (size_t oc = round_down_po2(group_output_channels, output_channels_block_size); oc < group_output_channels; oc++) {
3093 if XNN_LIKELY(bias != NULL) {
3094 *nonzero_values++ = bias[oc];
3095 } else {
3096 *nonzero_values++ = 0;
3097 }
3098 for (size_t ic = 0; ic < group_input_channels; ic++) {
3099      const uint16_t weight = kernel[oc * group_input_channels + ic];
3100 if (weight != 0) {
3101 *nonzero_values++ = weight;
3102 if (first_nonzero) {
3103 first_ic = ic;
3104 } else {
3105 const int64_t diff = (int64_t) ((uint64_t) ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3106 if (diff != (int64_t) (int32_t) diff) {
3107 xnn_log_error("failed to convert kernel to sparse representation: "
3108 "scaled difference in input channels exceeds int32_t range");
3109 return xnn_status_unsupported_parameter;
3110 }
3111 *input_channel_diffs++ = (int32_t) diff;
3112 }
3113 first_nonzero = false;
3114 last_ic = ic;
3115 *output_channel_nonzeros += 1;
3116 }
3117 }
3118 output_channel_nonzeros += 1;
3119 }
3120 // If there are any non-zero elements, we have to return to the initial input channel.
3121 if (!first_nonzero) {
3122 const int64_t diff = (int64_t) ((uint64_t) first_ic - (uint64_t) last_ic) * (int64_t) sizeof(uint16_t);
3123 if (diff != (int64_t) (int32_t) diff) {
3124 xnn_log_error("failed to convert kernel to sparse representation: "
3125 "scaled difference in input channels exceeds int32_t range");
3126 return xnn_status_unsupported_parameter;
3127 }
3128 *input_channel_diffs++ = (int32_t) diff;
3129 }
3130 *first_input_channel = first_ic;
3131 return xnn_status_success;
3132}
3133