1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *xe_lp_conv_nhwc_fwd_first_x8s8x_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2020-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
21R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_scales.h" )==""\n"
23R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
24R"==(#define KDHW_SIZE (KH * KW * KD) )==""\n"
25R"==(#define CX1_OFFSET (sizeof(C00) / sizeof(int)) )==""\n"
26R"==(#if (PW % 4 == 0) && (SRC_SLM_SIZE % 4 == 0) )==""\n"
27R"==(#define WRITE_SLM_BLOCK(p, data) block_write(p, data) )==""\n"
28R"==(#define WRITE_SLM_BLOCK_SHORT(p, data) block_write_us(p, data) )==""\n"
29R"==(#else )==""\n"
30R"==(#define WRITE_SLM_BLOCK(p, data) block_write_emu(p, data) )==""\n"
31R"==(#define WRITE_SLM_BLOCK_SHORT(p, data) block_write_us_emu(p, data) )==""\n"
32R"==(#endif )==""\n"
33R"==(#define GET_INT_BLOCK(src_slm, slm_index, src_global) \ )==""\n"
34R"==(uchar4 res = 0; \ )==""\n"
35R"==(for (int j = 0; j < IC; j++) { \ )==""\n"
36R"==(((uchar *)&res)[j] = src_global[j]; \ )==""\n"
37R"==(} \ )==""\n"
38R"==(src_slm[slm_index] = as_int(res); \ )==""\n"
39R"==(src_global += IC; )==""\n"
40R"==(#define BLOCK_READ_SRC(data, idx) \ )==""\n"
41R"==(data = intel_sub_group_block_read8((__global uint *)&src[idx]); )==""\n"
42R"==(#define BLOCK_READ_WHT(data, idx) \ )==""\n"
43R"==(data = as_int(intel_sub_group_block_read((__global uint *)&wei[idx])); )==""\n"
44R"==(#define BLOCK_READ_WHT8(data, idx) \ )==""\n"
45R"==(data = as_int8(intel_sub_group_block_read8((__global uint *)&wei[idx])); )==""\n"
46R"==(#define BLOCK_READ_BIA(data, idx) \ )==""\n"
47R"==(data = as_float4(intel_sub_group_block_read4((__global uint *)&bias[idx])); )==""\n"
48R"==(#if SCALES_PER_OC )==""\n"
49R"==(#define SCALE_VEC4 scales.s01230123 )==""\n"
50R"==(#define SCALE scales )==""\n"
51R"==(#elif SCALES_COMMON )==""\n"
52R"==(#define SCALE_VEC4 runtime_scales[0] )==""\n"
53R"==(#define SCALE runtime_scales[0] )==""\n"
54R"==(#else )==""\n"
55R"==(#define SCALE_VEC4 1 )==""\n"
56R"==(#define SCALE 1 )==""\n"
57R"==(#endif )==""\n"
58R"==(#define OC_PADD8 ((OC % 8) ? (OC / 8 + 1) * 8 : OC) )==""\n"
59R"==(#if DST_DT_S8 || DST_DT_U8 )==""\n"
60R"==(#define OC_BLOCK_READ_BOUND 4 )==""\n"
61R"==(#define OC_BLOCK_WRITE_BOUND 16 )==""\n"
62R"==(#else )==""\n"
63R"==(#define OC_BLOCK_READ_BOUND 1 )==""\n"
64R"==(#define OC_BLOCK_WRITE_BOUND 4 )==""\n"
65R"==(#endif )==""\n"
66R"==(inline DST_DATA4_T read_oc_block4(const __global DST_DATA_T *dst, int off) { )==""\n"
67R"==(const int local_id = get_sub_group_local_id(); )==""\n"
68R"==(#if OC % OC_BLOCK != 0 )==""\n"
69R"==(int tail = OC - off; )==""\n"
70R"==(if (tail < OC_BLOCK) { )==""\n"
71R"==(return (DST_DATA4_T)( )==""\n"
72R"==(local_id < tail - 8 * 0 ? dst[0 * 8 + local_id] : 0, )==""\n"
73R"==(local_id < tail - 8 * 1 ? dst[1 * 8 + local_id] : 0, )==""\n"
74R"==(local_id < tail - 8 * 2 ? dst[2 * 8 + local_id] : 0, )==""\n"
75R"==(local_id < tail - 8 * 3 ? dst[3 * 8 + local_id] : 0); )==""\n"
76R"==(} )==""\n"
77R"==(#endif )==""\n"
78R"==(#if OC % OC_BLOCK_READ_BOUND != 0 )==""\n"
79R"==(return (DST_DATA4_T)(dst[0 * 8 + local_id], dst[1 * 8 + local_id], )==""\n"
80R"==(dst[2 * 8 + local_id], dst[3 * 8 + local_id]); )==""\n"
81R"==(#else )==""\n"
82R"==(return BLOCK_READ_DST4(dst); )==""\n"
83R"==(#endif )==""\n"
84R"==(} )==""\n"
85R"==(inline void write_oc_block4( )==""\n"
86R"==(__global DST_DATA_T *dst, int off, DST_DATA4_T value) { )==""\n"
87R"==(const int local_id = get_sub_group_local_id(); )==""\n"
88R"==(#if OC % OC_BLOCK != 0 )==""\n"
89R"==(int tail = OC - off; )==""\n"
90R"==(if (tail < OC_BLOCK) { )==""\n"
91R"==(if (local_id < tail) dst[0 * 8 + local_id] = value.s0; )==""\n"
92R"==(if (local_id < tail - 8 * 1) dst[1 * 8 + local_id] = value.s1; )==""\n"
93R"==(if (local_id < tail - 8 * 2) dst[2 * 8 + local_id] = value.s2; )==""\n"
94R"==(if (local_id < tail - 8 * 3) dst[3 * 8 + local_id] = value.s3; )==""\n"
95R"==(return; )==""\n"
96R"==(} )==""\n"
97R"==(#endif )==""\n"
98R"==(#if OC % OC_BLOCK_WRITE_BOUND != 0 )==""\n"
99R"==(dst[0 * 8 + local_id] = value.s0; )==""\n"
100R"==(dst[1 * 8 + local_id] = value.s1; )==""\n"
101R"==(dst[2 * 8 + local_id] = value.s2; )==""\n"
102R"==(dst[3 * 8 + local_id] = value.s3; )==""\n"
103R"==(return; )==""\n"
104R"==(#else )==""\n"
105R"==(BLOCK_WRITE_DST4(dst, value); )==""\n"
106R"==(return; )==""\n"
107R"==(#endif )==""\n"
108R"==(} )==""\n"
109R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
110R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) __kernel void )==""\n"
111R"==(conv_nhwc_fwd_first_x8s8x(const __global uchar *src, const __global char *wei, )==""\n"
112R"==(const __global float *bias, __global DST_DATA_T *dst POST_OP_ARGS, )==""\n"
113R"==(const __global float *runtime_scales, )==""\n"
114R"==(const __global int *src_compensation, const __global int *src_zpoints, )==""\n"
115R"==(const __global int *dst_compensation) { )==""\n"
116R"==(const int group_oc = get_group_id(0) * OC_GROUP; )==""\n"
117R"==(const int group_mb = get_group_id(2) * MB_GROUP; )==""\n"
118R"==(const int group_sp = get_group_id(1) * SP_GROUP; )==""\n"
119R"==(const int sub_group_id = get_sub_group_id(); )==""\n"
120R"==(const int sub_local_id = get_sub_group_local_id(); )==""\n"
121R"==(const int oc = (sub_group_id % OC_GROUP); )==""\n"
122R"==(const int sp = (sub_group_id / OC_GROUP); )==""\n"
123R"==(const int g = (group_oc + oc) / OC_NCHUNK; )==""\n"
124R"==(const int group_ic = IC_NCHUNK * g; )==""\n"
125R"==(const int god = group_sp / (OW_PADDED * OH); )==""\n"
126R"==(const int gohw = group_sp % (OW_PADDED * OH); )==""\n"
127R"==(const int goh = gohw / OW_PADDED; )==""\n"
128R"==(const int gow = OW_BLOCK * (gohw % OW_PADDED); )==""\n"
129R"==(const int gid = god * SD; )==""\n"
130R"==(const int gih = goh * SH; )==""\n"
131R"==(const int giw = gow * SW; )==""\n"
132R"==(const int local_ow = OW_BLOCK * sp; )==""\n"
133R"==(const int local_iw = local_ow * SW; )==""\n"
134R"==(const int od = god; )==""\n"
135R"==(const int ow = gow + local_ow; )==""\n"
136R"==(const int oh = goh; )==""\n"
137R"==(const int id = gid - PD; )==""\n"
138R"==(const int iw = giw + local_iw - PW; )==""\n"
139R"==(const int ih = gih - PH; )==""\n"
140R"==(__local uint S_slice[SRC_SLM_SIZE * KH * KD]; )==""\n"
141R"==(__local uint *S_part = S_slice + (sp * SW * OW_BLOCK + PW); )==""\n"
142R"==(__local SRC_MMAD_DATA_T *S_work = S_slice + (sp * SW * OW_BLOCK); )==""\n"
143R"==(#if DST_NHWC )==""\n"
144R"==(dst += group_mb * OD * OH * OW * G * OC; )==""\n"
145R"==(dst += (OW * OH * od + OW * oh + ow) * G * OC; )==""\n"
146R"==(dst += OC_BLOCK * (group_oc + oc); )==""\n"
147R"==(#else )==""\n"
148R"==(dst += OC_BLOCK * OD * OH * OW * MB_BLOCK * (group_oc + oc); )==""\n"
149R"==(dst += OC_BLOCK * OD * OH * OW * OC_NCHUNK * G * MB_BLOCK )==""\n"
150R"==(* (group_mb / MB_BLOCK); )==""\n"
151R"==(dst += OC_BLOCK * (group_mb % MB_BLOCK); )==""\n"
152R"==(dst += OC_BLOCK * MB_BLOCK * (OW * OH * od + OW * oh + ow); )==""\n"
153R"==(#endif )==""\n"
154R"==(src += (group_mb >= MB ? MB - 1 : group_mb) * ID * IH * IW * G * IC; )==""\n"
155R"==(src += (IW * IH * id + IW * ih + iw + PW) * G * IC; )==""\n"
156R"==(wei += 4 * KDHW_SIZE * OC_BLOCK * (group_oc + oc); )==""\n"
157R"==(/* WORK WITH SLM */ )==""\n"
158R"==(const bool left_tail = iw < 0; )==""\n"
159R"==(const bool left_nozero_tail = sub_group_id == 0 && iw >= 0; )==""\n"
160R"==(const bool right_tail = (iw + PW + SLM_TAIL >= IW) && (iw + PW < IW); )==""\n"
161R"==(const bool empty = (iw + PW >= IW); )==""\n"
162R"==(const bool right_nozero_tail )==""\n"
163R"==(= sp == (LWS_1 - 1) && (iw + PW + SLM_TAIL < IW); )==""\n"
164R"==(barrier(CLK_LOCAL_MEM_FENCE); )==""\n"
165R"==(/* KD */ )==""\n"
166R"==(#if KD > 1 )==""\n"
167R"==(for (int kd = 0; kd < KD; kd++) { )==""\n"
168R"==(if (kd * (1 + DD) + id < 0 || kd * (1 + DD) + id >= ID) { )==""\n"
169R"==(S_part += SRC_SLM_SIZE * KH; )==""\n"
170R"==(src += IC * IW * IH * (1 + DD); )==""\n"
171R"==(continue; )==""\n"
172R"==(} )==""\n"
173R"==(#endif )==""\n"
174R"==(/* KH */ )==""\n"
175R"==(#if KH > 1 )==""\n"
176R"==(for (int kh = 0; kh < KH; kh++) { )==""\n"
177R"==(if (kh * (1 + DH) + ih < 0 || kh * (1 + DH) + ih >= IH) { )==""\n"
178R"==(S_part += SRC_SLM_SIZE; )==""\n"
179R"==(src += IC * IW * (1 + DH); )==""\n"
180R"==(continue; )==""\n"
181R"==(} )==""\n"
182R"==(#endif )==""\n"
183R"==(/* KW */ )==""\n"
184R"==(/* left tail */ )==""\n"
185R"==(#if PW > 0 )==""\n"
186R"==(if (left_tail) { )==""\n"
187R"==(for (int i = -PW; i < 0; i++) { )==""\n"
188R"==(S_part[i] = 0; )==""\n"
189R"==(} )==""\n"
190R"==(} )==""\n"
191R"==(#endif )==""\n"
192R"==(/* right tail */ )==""\n"
193R"==(#if ZERO_TAIL > 0 )==""\n"
194R"==(if (right_tail) { )==""\n"
195R"==(for (int i = SLM_TAIL; )==""\n"
196R"==(i < SW * OW_BLOCK + (KW - 1) * (1 + DW) - PW; i++) { )==""\n"
197R"==(S_part[i] = 0; )==""\n"
198R"==(} )==""\n"
199R"==(} )==""\n"
200R"==(#if SLM_NCHUNK < OW_NCHUNK )==""\n"
201R"==(if (empty) { )==""\n"
202R"==(for (int i = 0; i < SW * OW_BLOCK + (KW - 1) * (1 + DW) - PW; )==""\n"
203R"==(i++) { )==""\n"
204R"==(WRITE_SLM_BLOCK(S_part + i * 8, 0); )==""\n"
205R"==(} )==""\n"
206R"==(} )==""\n"
207R"==(#endif )==""\n"
208R"==(#endif )==""\n"
209R"==(#if SLM_NCHUNK < OW_NCHUNK )==""\n"
210R"==(if (iw + PW < IW) { )==""\n"
211R"==(#endif )==""\n"
212R"==(#if OW_NCHUNK > LWS_1 )==""\n"
213R"==(/* Copy tails in case of multigroups */ )==""\n"
214R"==(if (ow < OW) { )==""\n"
215R"==(#if PW > 0 )==""\n"
216R"==(if (left_nozero_tail) { )==""\n"
217R"==(__global uchar *s = (__global uchar *)src; )==""\n"
218R"==(for (int i = -PW; i < 0; i++) { )==""\n"
219R"==(GET_INT_BLOCK(S_part, i, s); )==""\n"
220R"==(} )==""\n"
221R"==(} )==""\n"
222R"==(#endif )==""\n"
223R"==(if (right_nozero_tail) { )==""\n"
224R"==(__global uchar *s = (__global uchar *)src; )==""\n"
225R"==(for (int i = SW * OW_BLOCK; )==""\n"
226R"==(i < SW * OW_BLOCK + (KW - 1) * (1 + DW) - PW; )==""\n"
227R"==(i++) { )==""\n"
228R"==(GET_INT_BLOCK(S_part, i, s); )==""\n"
229R"==(} )==""\n"
230R"==(} )==""\n"
231R"==(#endif )==""\n"
232R"==(#if SLM_TAIL != OW_BLOCK * SW )==""\n"
233R"==(/* Copy last block to SLM */ )==""\n"
234R"==(if (right_tail) { )==""\n"
235R"==(__global uchar *s = (__global uchar *)src; )==""\n"
236R"==(__attribute__((opencl_unroll_hint)) )==""\n"
237R"==(for (int i = 0; i < SLM_TAIL; i++) { )==""\n"
238R"==(GET_INT_BLOCK(S_part, i, s); )==""\n"
239R"==(} )==""\n"
240R"==(} else { )==""\n"
241R"==(#endif )==""\n"
242R"==(#if (SW * OW_BLOCK) % 8 == 0 )==""\n"
243R"==(/* Copy block to SLM */ )==""\n"
244R"==(#if IC == 4 )==""\n"
245R"==(__attribute__((opencl_unroll_hint)) )==""\n"
246R"==(for (int i = 0; i < SW * OW_BLOCK; i += 8) { )==""\n"
247R"==(WRITE_SLM_BLOCK(S_part + i, )==""\n"
248R"==(intel_sub_group_block_read((const __global )==""\n"
249R"==(uint *)(&src[i * IC]))); )==""\n"
250R"==(} )==""\n"
251R"==(#else )==""\n"
252R"==(__global uchar *s = (__global uchar *)src; )==""\n"
253R"==(__attribute__((opencl_unroll_hint)) )==""\n"
254R"==(for (int i = 0; i < SW * OW_BLOCK; i += 8) { )==""\n"
255R"==(const int local_id = get_sub_group_local_id(); )==""\n"
256R"==(uchar4 res = 0; )==""\n"
257R"==(for (int j = 0; j < IC; j++) { )==""\n"
258R"==(res[j] = s[local_id * IC + j]; )==""\n"
259R"==(} )==""\n"
260R"==(s += IC * 8; )==""\n"
261R"==(WRITE_SLM_BLOCK(S_part + i, as_uint(res)); )==""\n"
262R"==(} )==""\n"
263R"==(#endif )==""\n"
264R"==(#else )==""\n"
265R"==(#if IC == 4 )==""\n"
266R"==(__attribute__((opencl_unroll_hint)) )==""\n"
267R"==(for (int i = 0; i < SW * OW_BLOCK; i += 4) { )==""\n"
268R"==(WRITE_SLM_BLOCK_SHORT(S_part + i, )==""\n"
269R"==(intel_sub_group_block_read_us( )==""\n"
270R"==((const __global ushort *)(&src[i * IC]))); )==""\n"
271R"==(} )==""\n"
272R"==(#else )==""\n"
273R"==(__global uchar *s = (__global uchar *)src; )==""\n"
274R"==(__attribute__((opencl_unroll_hint)) )==""\n"
275R"==(for (int i = 0; i < SW * OW_BLOCK; i += 4) { )==""\n"
276R"==(const int local_id = get_sub_group_local_id(); )==""\n"
277R"==(uchar2 res = 0; )==""\n"
278R"==(#if IC == 3 )==""\n"
279R"==(res[0] = local_id % 2 ? s[local_id * IC / 2 + 1] )==""\n"
280R"==(: s[0 + local_id * IC / 2]; )==""\n"
281R"==(res[1] = local_id % 2 ?: s[1 + local_id * IC / 2]; )==""\n"
282R"==(#elif IC == 2 )==""\n"
283R"==(res[0] = local_id % 2 ?: s[0 + local_id * IC / 2]; )==""\n"
284R"==(res[1] = local_id % 2 ?: s[1 + local_id * IC / 2]; )==""\n"
285R"==(#else )==""\n"
286R"==(res[0] = local_id % 2 ?: s[0 + local_id * IC / 2]; )==""\n"
287R"==(#endif )==""\n"
288R"==(s += IC * 4; )==""\n"
289R"==(WRITE_SLM_BLOCK_SHORT(S_part + i, as_ushort(res)); )==""\n"
290R"==(} )==""\n"
291R"==(#endif )==""\n"
292R"==(#endif )==""\n"
293R"==(#if SLM_TAIL != OW_BLOCK * SW )==""\n"
294R"==(} )==""\n"
295R"==(#endif )==""\n"
296R"==(#if OW_NCHUNK > LWS_1 )==""\n"
297R"==(} )==""\n"
298R"==(#endif )==""\n"
299R"==(#if SLM_NCHUNK < OW_NCHUNK )==""\n"
300R"==(} )==""\n"
301R"==(#endif )==""\n"
302R"==(#if KH > 1 )==""\n"
303R"==(S_part += SRC_SLM_SIZE; )==""\n"
304R"==(src += IC * IW * (1 + DH); )==""\n"
305R"==(} )==""\n"
306R"==(S_part -= SRC_SLM_SIZE * KH; )==""\n"
307R"==(src -= IC * KH * IW * (1 + DH); )==""\n"
308R"==(#endif )==""\n"
309R"==(#if KD > 1 )==""\n"
310R"==(S_part += SRC_SLM_SIZE * KH; )==""\n"
311R"==(src += IC * IW * IH * (1 + DD); )==""\n"
312R"==(} )==""\n"
313R"==(#endif )==""\n"
314R"==(barrier(CLK_LOCAL_MEM_FENCE); )==""\n"
315R"==(SRC_MMAD_DATA8_T S; )==""\n"
316R"==(int8 W0 = 0, W1 = 0, W2 = 0, W3 = 0; )==""\n"
317R"==(int W00 = 0, W10 = 0, W20 = 0, W30 = 0; )==""\n"
318R"==(int8 C00 = 0; )==""\n"
319R"==(int8 C10 = 0; )==""\n"
320R"==(int8 C20 = 0; )==""\n"
321R"==(int8 C30 = 0; )==""\n"
322R"==(#if OW_BLOCK == 12 )==""\n"
323R"==(SRC_MMAD_DATA4_T SS; )==""\n"
324R"==(int4 C01 = 0; )==""\n"
325R"==(int4 C11 = 0; )==""\n"
326R"==(int4 C21 = 0; )==""\n"
327R"==(int4 C31 = 0; )==""\n"
328R"==(#endif )==""\n"
329R"==(#if OW_BLOCK == 16 )==""\n"
330R"==(int8 C01 = 0; )==""\n"
331R"==(int8 C11 = 0; )==""\n"
332R"==(int8 C21 = 0; )==""\n"
333R"==(int8 C31 = 0; )==""\n"
334R"==(#endif )==""\n"
335R"==(for (int i = 0; i < KDHW_SIZE - KDHW_SIZE % 8; i += 8) { )==""\n"
336R"==(const int ihw = (i + sub_local_id) % (KW * KH); )==""\n"
337R"==(const int filter_iw = (ihw % KW) * (1 + DW); )==""\n"
338R"==(const int filter_ih = ihw / KW; )==""\n"
339R"==(const int filter_id = (i + sub_local_id) / (KH * KW); )==""\n"
340R"==(const int filter = (filter_ih * (1 + DH) + ih >= 0) )==""\n"
341R"==(&& (filter_ih * (1 + DH) + ih < IH) )==""\n"
342R"==(&& (filter_id * (1 + DD) + id >= 0 )==""\n"
343R"==(&& filter_id * (1 + DD) + id < ID); )==""\n"
344R"==(BLOCK_READ_WHT8(W0, 0); )==""\n"
345R"==(#if OC_PADD8 * 4 > OC_BLOCK )==""\n"
346R"==(BLOCK_READ_WHT8(W1, KDHW_SIZE * OC_BLOCK); )==""\n"
347R"==(#endif )==""\n"
348R"==(#if OC_PADD8 * 4 > OC_BLOCK * 2 )==""\n"
349R"==(BLOCK_READ_WHT8(W2, 2 * KDHW_SIZE * OC_BLOCK); )==""\n"
350R"==(#endif )==""\n"
351R"==(#if OC_PADD8 * 4 > OC_BLOCK * 3 )==""\n"
352R"==(BLOCK_READ_WHT8(W3, 3 * KDHW_SIZE * OC_BLOCK); )==""\n"
353R"==(#endif )==""\n"
354R"==(if (filter) { )==""\n"
355R"==(S.s0 = S_work[SW * 0 + SRC_SLM_SIZE * KH * filter_id )==""\n"
356R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
357R"==(S.s1 = S_work[SW * 1 + SRC_SLM_SIZE * KH * filter_id )==""\n"
358R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
359R"==(S.s2 = S_work[SW * 2 + SRC_SLM_SIZE * KH * filter_id )==""\n"
360R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
361R"==(S.s3 = S_work[SW * 3 + SRC_SLM_SIZE * KH * filter_id )==""\n"
362R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
363R"==(S.s4 = S_work[SW * 4 + SRC_SLM_SIZE * KH * filter_id )==""\n"
364R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
365R"==(S.s5 = S_work[SW * 5 + SRC_SLM_SIZE * KH * filter_id )==""\n"
366R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
367R"==(S.s6 = S_work[SW * 6 + SRC_SLM_SIZE * KH * filter_id )==""\n"
368R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
369R"==(S.s7 = S_work[SW * 7 + SRC_SLM_SIZE * KH * filter_id )==""\n"
370R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
371R"==(#if OW_BLOCK == 12 )==""\n"
372R"==(SS.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
373R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
374R"==(SS.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
375R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
376R"==(SS.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
377R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
378R"==(SS.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
379R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
380R"==(#endif )==""\n"
381R"==(} else { )==""\n"
382R"==(S = 0; )==""\n"
383R"==(#if OW_BLOCK == 12 )==""\n"
384R"==(SS = 0; )==""\n"
385R"==(#endif )==""\n"
386R"==(} )==""\n"
387R"==(C00 = mmad8x8(S, W0, C00); )==""\n"
388R"==(C10 = mmad8x8(S, W1, C10); )==""\n"
389R"==(C20 = mmad8x8(S, W2, C20); )==""\n"
390R"==(C30 = mmad8x8(S, W3, C30); )==""\n"
391R"==(#if OW_BLOCK == 12 )==""\n"
392R"==(C01 = mmad8x4(SS, W0, C01); )==""\n"
393R"==(C11 = mmad8x4(SS, W1, C11); )==""\n"
394R"==(C21 = mmad8x4(SS, W2, C21); )==""\n"
395R"==(C31 = mmad8x4(SS, W3, C31); )==""\n"
396R"==(#endif )==""\n"
397R"==(#if OW_BLOCK == 16 )==""\n"
398R"==(if (filter) { )==""\n"
399R"==(S.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
400R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
401R"==(S.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
402R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
403R"==(S.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
404R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
405R"==(S.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
406R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
407R"==(S.s4 = S_work[SW * 12 + SRC_SLM_SIZE * KH * filter_id )==""\n"
408R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
409R"==(S.s5 = S_work[SW * 13 + SRC_SLM_SIZE * KH * filter_id )==""\n"
410R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
411R"==(S.s6 = S_work[SW * 14 + SRC_SLM_SIZE * KH * filter_id )==""\n"
412R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
413R"==(S.s7 = S_work[SW * 15 + SRC_SLM_SIZE * KH * filter_id )==""\n"
414R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
415R"==(} else { )==""\n"
416R"==(S = 0; )==""\n"
417R"==(} )==""\n"
418R"==(C01 = mmad8x8(S, W0, C01); )==""\n"
419R"==(C11 = mmad8x8(S, W1, C11); )==""\n"
420R"==(C21 = mmad8x8(S, W2, C21); )==""\n"
421R"==(C31 = mmad8x8(S, W3, C31); )==""\n"
422R"==(#endif )==""\n"
423R"==(wei += OC_BLOCK * 8; )==""\n"
424R"==(} )==""\n"
425R"==(for (int i = KDHW_SIZE - KDHW_SIZE % 8; i < KDHW_SIZE; i++) { )==""\n"
426R"==(const int ihw = (i) % (KW * KH); )==""\n"
427R"==(const int filter_iw = (ihw % KW) * (1 + DW); )==""\n"
428R"==(const int filter_ih = ihw / KW; )==""\n"
429R"==(const int filter_id = (i) / (KH * KW); )==""\n"
430R"==(const int filter = (filter_ih * (1 + DH) + ih >= 0) )==""\n"
431R"==(&& (filter_ih * (1 + DH) + ih < IH) )==""\n"
432R"==(&& (filter_id * (1 + DD) + id >= 0 )==""\n"
433R"==(&& filter_id * (1 + DD) + id < ID); )==""\n"
434R"==(if (filter) { )==""\n"
435R"==(BLOCK_READ_WHT(W00, 0); )==""\n"
436R"==(#if OC_PADD8 * 4 > OC_BLOCK )==""\n"
437R"==(BLOCK_READ_WHT(W10, KDHW_SIZE * OC_BLOCK); )==""\n"
438R"==(#endif )==""\n"
439R"==(#if OC_PADD8 * 4 > OC_BLOCK * 2 )==""\n"
440R"==(BLOCK_READ_WHT(W20, 2 * KDHW_SIZE * OC_BLOCK); )==""\n"
441R"==(#endif )==""\n"
442R"==(#if OC_PADD8 * 4 > OC_BLOCK * 3 )==""\n"
443R"==(BLOCK_READ_WHT(W30, 3 * KDHW_SIZE * OC_BLOCK); )==""\n"
444R"==(#endif )==""\n"
445R"==(S.s0 = S_work[SW * 0 + SRC_SLM_SIZE * KH * filter_id )==""\n"
446R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
447R"==(S.s1 = S_work[SW * 1 + SRC_SLM_SIZE * KH * filter_id )==""\n"
448R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
449R"==(S.s2 = S_work[SW * 2 + SRC_SLM_SIZE * KH * filter_id )==""\n"
450R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
451R"==(S.s3 = S_work[SW * 3 + SRC_SLM_SIZE * KH * filter_id )==""\n"
452R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
453R"==(S.s4 = S_work[SW * 4 + SRC_SLM_SIZE * KH * filter_id )==""\n"
454R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
455R"==(S.s5 = S_work[SW * 5 + SRC_SLM_SIZE * KH * filter_id )==""\n"
456R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
457R"==(S.s6 = S_work[SW * 6 + SRC_SLM_SIZE * KH * filter_id )==""\n"
458R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
459R"==(S.s7 = S_work[SW * 7 + SRC_SLM_SIZE * KH * filter_id )==""\n"
460R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
461R"==(#if OW_BLOCK == 12 )==""\n"
462R"==(SS.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
463R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
464R"==(SS.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
465R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
466R"==(SS.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
467R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
468R"==(SS.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
469R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
470R"==(#endif )==""\n"
471R"==(C00.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W00), C00.s0); )==""\n"
472R"==(C00.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W00), C00.s1); )==""\n"
473R"==(C00.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W00), C00.s2); )==""\n"
474R"==(C00.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W00), C00.s3); )==""\n"
475R"==(C00.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W00), C00.s4); )==""\n"
476R"==(C00.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W00), C00.s5); )==""\n"
477R"==(C00.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W00), C00.s6); )==""\n"
478R"==(C00.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W00), C00.s7); )==""\n"
479R"==(C10.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W10), C10.s0); )==""\n"
480R"==(C10.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W10), C10.s1); )==""\n"
481R"==(C10.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W10), C10.s2); )==""\n"
482R"==(C10.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W10), C10.s3); )==""\n"
483R"==(C10.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W10), C10.s4); )==""\n"
484R"==(C10.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W10), C10.s5); )==""\n"
485R"==(C10.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W10), C10.s6); )==""\n"
486R"==(C10.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W10), C10.s7); )==""\n"
487R"==(C20.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W20), C20.s0); )==""\n"
488R"==(C20.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W20), C20.s1); )==""\n"
489R"==(C20.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W20), C20.s2); )==""\n"
490R"==(C20.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W20), C20.s3); )==""\n"
491R"==(C20.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W20), C20.s4); )==""\n"
492R"==(C20.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W20), C20.s5); )==""\n"
493R"==(C20.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W20), C20.s6); )==""\n"
494R"==(C20.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W20), C20.s7); )==""\n"
495R"==(C30.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W30), C30.s0); )==""\n"
496R"==(C30.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W30), C30.s1); )==""\n"
497R"==(C30.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W30), C30.s2); )==""\n"
498R"==(C30.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W30), C30.s3); )==""\n"
499R"==(C30.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W30), C30.s4); )==""\n"
500R"==(C30.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W30), C30.s5); )==""\n"
501R"==(C30.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W30), C30.s6); )==""\n"
502R"==(C30.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W30), C30.s7); )==""\n"
503R"==(#if OW_BLOCK == 12 )==""\n"
504R"==(C01.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W00), C01.s0); )==""\n"
505R"==(C01.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W00), C01.s1); )==""\n"
506R"==(C01.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W00), C01.s2); )==""\n"
507R"==(C01.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W00), C01.s3); )==""\n"
508R"==(C11.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W10), C11.s0); )==""\n"
509R"==(C11.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W10), C11.s1); )==""\n"
510R"==(C11.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W10), C11.s2); )==""\n"
511R"==(C11.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W10), C11.s3); )==""\n"
512R"==(C21.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W20), C21.s0); )==""\n"
513R"==(C21.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W20), C21.s1); )==""\n"
514R"==(C21.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W20), C21.s2); )==""\n"
515R"==(C21.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W20), C21.s3); )==""\n"
516R"==(C31.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W30), C31.s0); )==""\n"
517R"==(C31.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W30), C31.s1); )==""\n"
518R"==(C31.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W30), C31.s2); )==""\n"
519R"==(C31.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W30), C31.s3); )==""\n"
520R"==(#endif )==""\n"
521R"==(#if OW_BLOCK == 16 )==""\n"
522R"==(S.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
523R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
524R"==(S.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
525R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
526R"==(S.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
527R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
528R"==(S.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
529R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
530R"==(S.s4 = S_work[SW * 12 + SRC_SLM_SIZE * KH * filter_id )==""\n"
531R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
532R"==(S.s5 = S_work[SW * 13 + SRC_SLM_SIZE * KH * filter_id )==""\n"
533R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
534R"==(S.s6 = S_work[SW * 14 + SRC_SLM_SIZE * KH * filter_id )==""\n"
535R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
536R"==(S.s7 = S_work[SW * 15 + SRC_SLM_SIZE * KH * filter_id )==""\n"
537R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
538R"==(C01.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W00), C01.s0); )==""\n"
539R"==(C01.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W00), C01.s1); )==""\n"
540R"==(C01.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W00), C01.s2); )==""\n"
541R"==(C01.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W00), C01.s3); )==""\n"
542R"==(C01.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W00), C01.s4); )==""\n"
543R"==(C01.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W00), C01.s5); )==""\n"
544R"==(C01.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W00), C01.s6); )==""\n"
545R"==(C01.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W00), C01.s7); )==""\n"
546R"==(C11.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W10), C11.s0); )==""\n"
547R"==(C11.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W10), C11.s1); )==""\n"
548R"==(C11.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W10), C11.s2); )==""\n"
549R"==(C11.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W10), C11.s3); )==""\n"
550R"==(C11.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W10), C11.s4); )==""\n"
551R"==(C11.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W10), C11.s5); )==""\n"
552R"==(C11.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W10), C11.s6); )==""\n"
553R"==(C11.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W10), C11.s7); )==""\n"
554R"==(C21.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W20), C21.s0); )==""\n"
555R"==(C21.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W20), C21.s1); )==""\n"
556R"==(C21.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W20), C21.s2); )==""\n"
557R"==(C21.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W20), C21.s3); )==""\n"
558R"==(C21.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W20), C21.s4); )==""\n"
559R"==(C21.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W20), C21.s5); )==""\n"
560R"==(C21.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W20), C21.s6); )==""\n"
561R"==(C21.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W20), C21.s7); )==""\n"
562R"==(C31.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W30), C31.s0); )==""\n"
563R"==(C31.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W30), C31.s1); )==""\n"
564R"==(C31.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W30), C31.s2); )==""\n"
565R"==(C31.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W30), C31.s3); )==""\n"
566R"==(C31.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W30), C31.s4); )==""\n"
567R"==(C31.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W30), C31.s5); )==""\n"
568R"==(C31.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W30), C31.s6); )==""\n"
569R"==(C31.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W30), C31.s7); )==""\n"
570R"==(#endif )==""\n"
571R"==(} )==""\n"
572R"==(wei += OC_BLOCK; )==""\n"
573R"==(} )==""\n"
574R"==(DST_DATA16_T R1, R2, R3, R4; )==""\n"
575R"==(#if SCALES_PER_OC )==""\n"
576R"==(float4 scales; )==""\n"
577R"==(block_read_scales( )==""\n"
578R"==(&scales, (group_oc + oc) * OC_BLOCK, sub_local_id, runtime_scales); )==""\n"
579R"==(#endif )==""\n"
580R"==(#if WITH_BIAS )==""\n"
581R"==(float4 bia; )==""\n"
582R"==(BLOCK_READ_BIA(bia, (group_oc + oc) * OC_BLOCK); )==""\n"
583R"==(#define QUANTIZE_ADD_BIAS() tmp = SCALE * fma(tmp, (float4)1, bia); )==""\n"
584R"==(#define QUANTIZE_ADD_BIAS_4() \ )==""\n"
585R"==(tmp0 = ((float8)SCALE_VEC4) * fma(tmp0, (float8)1, bia.s01230123); \ )==""\n"
586R"==(tmp1 = ((float8)SCALE_VEC4) * fma(tmp1, (float8)1, bia.s01230123); )==""\n"
587R"==(#else )==""\n"
588R"==(#define QUANTIZE_ADD_BIAS() tmp *= SCALE; )==""\n"
589R"==(#define QUANTIZE_ADD_BIAS_4() \ )==""\n"
590R"==(tmp0 *= SCALE_VEC4; \ )==""\n"
591R"==(tmp1 *= SCALE_VEC4; )==""\n"
592R"==(#endif )==""\n"
593R"==(#if WITH_POST_OP )==""\n"
594R"==(#define APPLY_POST_OPS_COMMON(nelems, accum, sum) \ )==""\n"
595R"==(for (int didx = 0; didx < nelems; ++didx) { \ )==""\n"
596R"==(float tmp_i = accum[didx]; \ )==""\n"
597R"==(SUM_DATA_T d_i = sum[didx]; \ )==""\n"
598R"==(const int po_mb = group_mb % MB; \ )==""\n"
599R"==(const int po_oc \ )==""\n"
600R"==(= (oc * OC_BLOCK + ((didx * SUB_GROUP_SIZE) % OC_BLOCK) \ )==""\n"
601R"==(+ sub_local_id) \ )==""\n"
602R"==(% (OC * G); \ )==""\n"
603R"==(APPLY_POST_OPS_SERIAL_BINARY_2D( \ )==""\n"
604R"==(tmp_i, float, d_i, SUM_DATA_T, po_mb, 1, po_oc, 1); \ )==""\n"
605R"==(accum[didx] = tmp_i; \ )==""\n"
606R"==(} )==""\n"
607R"==(#if DST_NHWC )==""\n"
608R"==(#define DO_POST_OP() \ )==""\n"
609R"==(do { \ )==""\n"
610R"==(SUM_DATA4_T d; \ )==""\n"
611R"==(if (WITH_SUM) \ )==""\n"
612R"==(d = AS_SUM_DATA4_T( \ )==""\n"
613R"==(read_oc_block4(dst, (group_oc + oc) * OC_BLOCK)); \ )==""\n"
614R"==(APPLY_POST_OPS_COMMON(4, tmp, d); \ )==""\n"
615R"==(} while (0) )==""\n"
616R"==(#define DO_POST_OP_4() \ )==""\n"
617R"==({ \ )==""\n"
618R"==(SUM_DATA16_T d; \ )==""\n"
619R"==(if (WITH_SUM) \ )==""\n"
620R"==(d = AS_SUM_DATA16_T( \ )==""\n"
621R"==((DST_DATA16_T)(read_oc_block4(dst + G * OC * 0, \ )==""\n"
622R"==((group_oc + oc) * OC_BLOCK), \ )==""\n"
623R"==(read_oc_block4(dst + G * OC * 1, \ )==""\n"
624R"==((group_oc + oc) * OC_BLOCK), \ )==""\n"
625R"==(read_oc_block4(dst + G * OC * 2, \ )==""\n"
626R"==((group_oc + oc) * OC_BLOCK), \ )==""\n"
627R"==(read_oc_block4(dst + G * OC * 3, \ )==""\n"
628R"==((group_oc + oc) * OC_BLOCK))); \ )==""\n"
629R"==(float16 tmp_x16 = (float16)(tmp0, tmp1); \ )==""\n"
630R"==(APPLY_POST_OPS_COMMON(16, tmp_x16, d); \ )==""\n"
631R"==(tmp0 = tmp_x16.s01234567; \ )==""\n"
632R"==(tmp1 = tmp_x16.s89abcdef; \ )==""\n"
633R"==(} )==""\n"
634R"==(#else )==""\n"
635R"==(#define DO_POST_OP() \ )==""\n"
636R"==({ \ )==""\n"
637R"==(SUM_DATA4_T d; \ )==""\n"
638R"==(if (WITH_SUM) d = AS_SUM_DATA4_T(BLOCK_READ_DST4(dst)); \ )==""\n"
639R"==(APPLY_POST_OPS_COMMON(4, tmp, d); \ )==""\n"
640R"==(} )==""\n"
641R"==(#define DO_POST_OP_4() \ )==""\n"
642R"==({ \ )==""\n"
643R"==(SUM_DATA16_T d; \ )==""\n"
644R"==(if (WITH_SUM) d = AS_SUM_DATA16_T(BLOCK_READ_DST16(dst)); \ )==""\n"
645R"==(float16 tmp_x16 = (float16)(tmp0, tmp1); \ )==""\n"
646R"==(APPLY_POST_OPS_COMMON(16, tmp_x16, d); \ )==""\n"
647R"==(tmp0 = tmp_x16.s01234567; \ )==""\n"
648R"==(tmp1 = tmp_x16.s89abcdef; \ )==""\n"
649R"==(} )==""\n"
650R"==(#endif )==""\n"
651R"==(#else )==""\n"
652R"==(#define DO_POST_OP() ; )==""\n"
653R"==(#define DO_POST_OP_4() ; )==""\n"
654R"==(#endif )==""\n"
655R"==(#define PACK(C0, C1, C2, C3, idx) \ )==""\n"
656R"==(do { \ )==""\n"
657R"==(tmp.s0 = ((int *)&C0)[idx]; \ )==""\n"
658R"==(tmp.s1 = ((int *)&C1)[idx]; \ )==""\n"
659R"==(tmp.s2 = ((int *)&C2)[idx]; \ )==""\n"
660R"==(tmp.s3 = ((int *)&C3)[idx]; \ )==""\n"
661R"==(} while (0) )==""\n"
662R"==(#define PACK_4(C0, C1, C2, C3, idx) \ )==""\n"
663R"==(do { \ )==""\n"
664R"==(tmp0.s0 = ((int *)&C0)[idx]; \ )==""\n"
665R"==(tmp0.s1 = ((int *)&C1)[idx]; \ )==""\n"
666R"==(tmp0.s2 = ((int *)&C2)[idx]; \ )==""\n"
667R"==(tmp0.s3 = ((int *)&C3)[idx]; \ )==""\n"
668R"==(\ )==""\n"
669R"==(tmp0.s4 = ((int *)&C0)[idx + 1]; \ )==""\n"
670R"==(tmp0.s5 = ((int *)&C1)[idx + 1]; \ )==""\n"
671R"==(tmp0.s6 = ((int *)&C2)[idx + 1]; \ )==""\n"
672R"==(tmp0.s7 = ((int *)&C3)[idx + 1]; \ )==""\n"
673R"==(\ )==""\n"
674R"==(tmp1.s0 = ((int *)&C0)[idx + 2]; \ )==""\n"
675R"==(tmp1.s1 = ((int *)&C1)[idx + 2]; \ )==""\n"
676R"==(tmp1.s2 = ((int *)&C2)[idx + 2]; \ )==""\n"
677R"==(tmp1.s3 = ((int *)&C3)[idx + 2]; \ )==""\n"
678R"==(\ )==""\n"
679R"==(tmp1.s4 = ((int *)&C0)[idx + 3]; \ )==""\n"
680R"==(tmp1.s5 = ((int *)&C1)[idx + 3]; \ )==""\n"
681R"==(tmp1.s6 = ((int *)&C2)[idx + 3]; \ )==""\n"
682R"==(tmp1.s7 = ((int *)&C3)[idx + 3]; \ )==""\n"
683R"==(} while (0) )==""\n"
684R"==(#define CONVERT_PACK() \ )==""\n"
685R"==(do { \ )==""\n"
686R"==(tmp_cvt = CONVERT_DST_DATA4_T(tmp); \ )==""\n"
687R"==(} while (0) )==""\n"
688R"==(#define CONVERT_PACK_4() \ )==""\n"
689R"==(do { \ )==""\n"
690R"==(R.s01234567 = CONVERT_DST_DATA8_T(tmp0); \ )==""\n"
691R"==(R.s89abcdef = CONVERT_DST_DATA8_T(tmp1); \ )==""\n"
692R"==(} while (0) )==""\n"
693R"==(#if DST_NHWC )==""\n"
694R"==(#define STORE_DST(C0, C1, C2, C3, i) \ )==""\n"
695R"==(do { \ )==""\n"
696R"==(PACK(C0, C1, C2, C3, i); \ )==""\n"
697R"==(QUANTIZE_ADD_BIAS(); \ )==""\n"
698R"==(DO_POST_OP(); \ )==""\n"
699R"==(CONVERT_PACK(); \ )==""\n"
700R"==(write_oc_block4(dst, (group_oc + oc) * OC_BLOCK, tmp_cvt); \ )==""\n"
701R"==(dst += OC; \ )==""\n"
702R"==(} while (0) )==""\n"
703R"==(#define STORE_DST_4(C0, C1, C2, C3, i) \ )==""\n"
704R"==(do { \ )==""\n"
705R"==(PACK_4(C0, C1, C2, C3, i); \ )==""\n"
706R"==(QUANTIZE_ADD_BIAS_4(); \ )==""\n"
707R"==(DO_POST_OP_4(); \ )==""\n"
708R"==(CONVERT_PACK_4(); \ )==""\n"
709R"==(write_oc_block4(dst, (group_oc + oc) * OC_BLOCK, R.s0123); \ )==""\n"
710R"==(write_oc_block4(dst + OC * 1, (group_oc + oc) * OC_BLOCK, R.s4567); \ )==""\n"
711R"==(write_oc_block4(dst + OC * 2, (group_oc + oc) * OC_BLOCK, R.s89ab); \ )==""\n"
712R"==(write_oc_block4(dst + OC * 3, (group_oc + oc) * OC_BLOCK, R.scdef); \ )==""\n"
713R"==(dst += 4 * OC; \ )==""\n"
714R"==(} while (0) )==""\n"
715R"==(#else )==""\n"
716R"==(#define STORE_DST(C0, C1, C2, C3, i) \ )==""\n"
717R"==(do { \ )==""\n"
718R"==(PACK(C0, C1, C2, C3, i); \ )==""\n"
719R"==(if (MB % MB_BLOCK == 0 || group_mb < MB) { \ )==""\n"
720R"==(QUANTIZE_ADD_BIAS(); \ )==""\n"
721R"==(DO_POST_OP(); \ )==""\n"
722R"==(CONVERT_PACK(); \ )==""\n"
723R"==(} else { \ )==""\n"
724R"==(tmp_cvt = 0; \ )==""\n"
725R"==(} \ )==""\n"
726R"==(BLOCK_WRITE_DST4(dst, tmp_cvt); \ )==""\n"
727R"==(dst += OC_BLOCK * MB_BLOCK; \ )==""\n"
728R"==(} while (0) )==""\n"
729R"==(#define STORE_DST_4(C0, C1, C2, C3, i) \ )==""\n"
730R"==(do { \ )==""\n"
731R"==(PACK_4(C0, C1, C2, C3, i); \ )==""\n"
732R"==(QUANTIZE_ADD_BIAS_4(); \ )==""\n"
733R"==(DO_POST_OP_4(); \ )==""\n"
734R"==(CONVERT_PACK_4(); \ )==""\n"
735R"==(BLOCK_WRITE_DST16(dst, R); \ )==""\n"
736R"==(dst += 4 * OC_BLOCK; \ )==""\n"
737R"==(} while (0) )==""\n"
738R"==(#endif )==""\n"
739R"==(if (ow < OW) { )==""\n"
740R"==(float4 tmp; )==""\n"
741R"==(DST_DATA4_T tmp_cvt; )==""\n"
742R"==(float8 tmp0, tmp1; )==""\n"
743R"==(DST_DATA16_T R; )==""\n"
744R"==(#if OW_TAIL )==""\n"
745R"==(if (ow + OW_BLOCK < OW) { )==""\n"
746R"==(#endif )==""\n"
747R"==(#if !DST_NHWC && MB_BLOCK == 32 )==""\n"
748R"==(STORE_DST(C00, C10, C20, C30, 0); )==""\n"
749R"==(STORE_DST(C00, C10, C20, C30, 1); )==""\n"
750R"==(STORE_DST(C00, C10, C20, C30, 2); )==""\n"
751R"==(STORE_DST(C00, C10, C20, C30, 3); )==""\n"
752R"==(STORE_DST(C00, C10, C20, C30, 4); )==""\n"
753R"==(STORE_DST(C00, C10, C20, C30, 5); )==""\n"
754R"==(STORE_DST(C00, C10, C20, C30, 6); )==""\n"
755R"==(STORE_DST(C00, C10, C20, C30, 7); )==""\n"
756R"==(#if OW_BLOCK >= 12 )==""\n"
757R"==(STORE_DST(C01, C11, C21, C31, 0); )==""\n"
758R"==(STORE_DST(C01, C11, C21, C31, 1); )==""\n"
759R"==(STORE_DST(C01, C11, C21, C31, 2); )==""\n"
760R"==(STORE_DST(C01, C11, C21, C31, 3); )==""\n"
761R"==(#endif )==""\n"
762R"==(#if OW_BLOCK == 16 )==""\n"
763R"==(STORE_DST(C01, C11, C21, C31, 4); )==""\n"
764R"==(STORE_DST(C01, C11, C21, C31, 5); )==""\n"
765R"==(STORE_DST(C01, C11, C21, C31, 6); )==""\n"
766R"==(STORE_DST(C01, C11, C21, C31, 7); )==""\n"
767R"==(#endif )==""\n"
768R"==(#else )==""\n"
769R"==(STORE_DST_4(C00, C10, C20, C30, 0); )==""\n"
770R"==(STORE_DST_4(C00, C10, C20, C30, 4); )==""\n"
771R"==(#if OW_BLOCK >= 12 )==""\n"
772R"==(STORE_DST_4(C01, C11, C21, C31, 0); )==""\n"
773R"==(#endif )==""\n"
774R"==(#if OW_BLOCK >= 16 )==""\n"
775R"==(STORE_DST_4(C01, C11, C21, C31, 4); )==""\n"
776R"==(#endif )==""\n"
777R"==(#endif )==""\n"
778R"==(#if OW_TAIL )==""\n"
779R"==(} else { )==""\n"
780R"==(#if OW_TAIL < 4 )==""\n"
781R"==(for (int i = 0; i < OW_TAIL; i++) { )==""\n"
782R"==(STORE_DST(C00, C10, C20, C30, i); )==""\n"
783R"==(} )==""\n"
784R"==(#else )==""\n"
785R"==(#if !DST_NHWC && MB_BLOCK == 32 )==""\n"
786R"==(STORE_DST(C00, C10, C20, C30, 0); )==""\n"
787R"==(STORE_DST(C00, C10, C20, C30, 1); )==""\n"
788R"==(STORE_DST(C00, C10, C20, C30, 2); )==""\n"
789R"==(STORE_DST(C00, C10, C20, C30, 3); )==""\n"
790R"==(#else )==""\n"
791R"==(STORE_DST_4(C00, C10, C20, C30, 0); )==""\n"
792R"==(#endif )==""\n"
793R"==(#endif )==""\n"
794R"==(#if OW_TAIL > 4 )==""\n"
795R"==(#if OW_TAIL < 8 )==""\n"
796R"==(for (int i = 4; i < OW_TAIL; i++) { )==""\n"
797R"==(STORE_DST(C00, C10, C20, C30, i); )==""\n"
798R"==(} )==""\n"
799R"==(#else )==""\n"
800R"==(#if !DST_NHWC && MB_BLOCK == 32 )==""\n"
801R"==(STORE_DST(C00, C10, C20, C30, 4); )==""\n"
802R"==(STORE_DST(C00, C10, C20, C30, 5); )==""\n"
803R"==(STORE_DST(C00, C10, C20, C30, 6); )==""\n"
804R"==(STORE_DST(C00, C10, C20, C30, 7); )==""\n"
805R"==(#else )==""\n"
806R"==(STORE_DST_4(C00, C10, C20, C30, 4); )==""\n"
807R"==(#endif )==""\n"
808R"==(#endif )==""\n"
809R"==(#if OW_TAIL > 8 )==""\n"
810R"==(#if OW_TAIL < 12 )==""\n"
811R"==(for (int i = 8; i < OW_TAIL; i++) { )==""\n"
812R"==(STORE_DST(C01, C11, C21, C31, i - CX1_OFFSET); )==""\n"
813R"==(} )==""\n"
814R"==(#else )==""\n"
815R"==(#if !DST_NHWC && MB_BLOCK == 32 )==""\n"
816R"==(STORE_DST(C01, C11, C21, C31, 0); )==""\n"
817R"==(STORE_DST(C01, C11, C21, C31, 1); )==""\n"
818R"==(STORE_DST(C01, C11, C21, C31, 2); )==""\n"
819R"==(STORE_DST(C01, C11, C21, C31, 3); )==""\n"
820R"==(#else )==""\n"
821R"==(STORE_DST_4(C01, C11, C21, C31, 0); )==""\n"
822R"==(#endif )==""\n"
823R"==(#endif )==""\n"
824R"==(#if OW_TAIL > 12 )==""\n"
825R"==(#if OW_TAIL < 16 )==""\n"
826R"==(for (int i = 12; i < OW_TAIL; i++) { )==""\n"
827R"==(STORE_DST(C01, C11, C21, C31, i - CX1_OFFSET); )==""\n"
828R"==(} )==""\n"
829R"==(#else )==""\n"
830R"==(#if !DST_NHWC && MB_BLOCK == 32 )==""\n"
831R"==(STORE_DST(C01, C11, C21, C31, 4); )==""\n"
832R"==(STORE_DST(C01, C11, C21, C31, 5); )==""\n"
833R"==(STORE_DST(C01, C11, C21, C31, 6); )==""\n"
834R"==(STORE_DST(C01, C11, C21, C31, 7); )==""\n"
835R"==(#else )==""\n"
836R"==(STORE_DST_4(C01, C11, C21, C31, 4); )==""\n"
837R"==(#endif )==""\n"
838R"==(#endif )==""\n"
839R"==(#endif )==""\n"
840R"==(#endif )==""\n"
841R"==(#endif )==""\n"
842R"==(} )==""\n"
843R"==(#endif )==""\n"
844R"==(} )==""\n"
845R"==(} )==""\n"
846R"==()==";
847}
848}
849}
850}