1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *xe_lp_conv_fwd_data_first_x8s8x_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#undef cl_intel_subgroup_matrix_multiply_accumulate )==""\n"
21R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
23R"==(#include "gpu/ocl/ocl_scales.h" )==""\n"
24R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
25R"==(#include "gpu/ocl/ocl_zero_points.h" )==""\n"
26R"==(#define KDHW_SIZE (KH * KW * KD) )==""\n"
27R"==(#if (PW % 4 == 0) && (SRC_SLM_SIZE % 4 == 0) )==""\n"
28R"==(#define WRITE_SLM_BLOCK(p, data) block_write(p, data) )==""\n"
29R"==(#define WRITE_SLM_BLOCK_SHORT(p, data) block_write_us(p, data) )==""\n"
30R"==(#else )==""\n"
31R"==(#define WRITE_SLM_BLOCK(p, data) block_write_emu(p, data) )==""\n"
32R"==(#define WRITE_SLM_BLOCK_SHORT(p, data) block_write_us_emu(p, data) )==""\n"
33R"==(#endif )==""\n"
34R"==(#define GET_INT_BLOCK(SRC_SLM, SLM_INDEX, SRC_GLOBAL, GLOBAL_INDEX) \ )==""\n"
35R"==(uchar4 res = 0; \ )==""\n"
36R"==(for (int j = 0; j < IC; j++) { \ )==""\n"
37R"==(res[j] = SRC_GLOBAL[GLOBAL_INDEX + j * IH * IW * ID]; \ )==""\n"
38R"==(} \ )==""\n"
39R"==(SRC_SLM[SLM_INDEX] = as_int(res); )==""\n"
40R"==(#define BLOCK_READ_SRC(data, idx) \ )==""\n"
41R"==(data = intel_sub_group_block_read8((__global uint *)&src[idx]); )==""\n"
42R"==(#define BLOCK_READ_WHT(data, idx) \ )==""\n"
43R"==(data = as_int(intel_sub_group_block_read((__global uint *)&wei[idx])); )==""\n"
44R"==(#define BLOCK_READ_WHT8(data, idx) \ )==""\n"
45R"==(data = as_int8(intel_sub_group_block_read8((__global uint *)&wei[idx])); )==""\n"
46R"==(#define BLOCK_READ_BIA(data, idx) \ )==""\n"
47R"==(data = as_float4(intel_sub_group_block_read4((__global uint *)&bias[idx])); )==""\n"
48R"==(#if SCALES_PER_OC )==""\n"
49R"==(#define SCALE_VEC4 scales.s01230123 )==""\n"
50R"==(#define SCALE scales )==""\n"
51R"==(#elif SCALES_COMMON )==""\n"
52R"==(#define SCALE_VEC4 runtime_scales[0] )==""\n"
53R"==(#define SCALE runtime_scales[0] )==""\n"
54R"==(#else )==""\n"
55R"==(#define SCALE_VEC4 1 )==""\n"
56R"==(#define SCALE 1 )==""\n"
57R"==(#endif )==""\n"
58R"==(#define OC_PADD8 ((OC % 8) ? (OC / 8 + 1) * 8 : OC) )==""\n"
59R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
60R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) __kernel void )==""\n"
61R"==(conv_fwd_first_x8s8x(const __global uchar *src, const __global char *wei, )==""\n"
62R"==(const __global float *bias, __global DST_DATA_T *dst POST_OP_ARGS, )==""\n"
63R"==(const __global float *runtime_scales, )==""\n"
64R"==(const __global int *src_compensation, const __global int *src_zpoints, )==""\n"
65R"==(const __global int *dst_compensation) { )==""\n"
66R"==(const int group_oc = get_group_id(0) * OC_GROUP; )==""\n"
67R"==(const int group_sp = get_group_id(1) * SP_GROUP; )==""\n"
68R"==(const int group_mb = get_group_id(2) * MB_GROUP; )==""\n"
69R"==(const int is_mb_padding = MB % MB_BLOCK != 0 && group_mb >= MB; )==""\n"
70R"==(const int sub_group_id = get_sub_group_id(); )==""\n"
71R"==(const int sub_local_id = get_sub_group_local_id(); )==""\n"
72R"==(const int oc = (sub_group_id % OC_GROUP); )==""\n"
73R"==(const int sp = (sub_group_id / OC_GROUP); )==""\n"
74R"==(const int g = (group_oc + oc) / OC_NCHUNK; )==""\n"
75R"==(const int group_ic = IC_NCHUNK * g; )==""\n"
76R"==(const int god = group_sp / (OW_PADDED * OH); )==""\n"
77R"==(const int gohw = group_sp % (OW_PADDED * OH); )==""\n"
78R"==(const int goh = gohw / OW_PADDED; )==""\n"
79R"==(const int gow = OW_BLOCK * (gohw % OW_PADDED); )==""\n"
80R"==(const int gid = god * SD; )==""\n"
81R"==(const int gih = goh * SH; )==""\n"
82R"==(const int giw = gow * SW; )==""\n"
83R"==(const int local_ow = OW_BLOCK * sp; )==""\n"
84R"==(const int local_iw = local_ow * SW; )==""\n"
85R"==(const int od = god; )==""\n"
86R"==(const int ow = gow + local_ow; )==""\n"
87R"==(const int oh = goh; )==""\n"
88R"==(const int id = gid - PD; )==""\n"
89R"==(const int iw = giw + local_iw - PW; )==""\n"
90R"==(const int ih = gih - PH; )==""\n"
91R"==(__local uint S_slice[SRC_SLM_SIZE * KH * KD]; )==""\n"
92R"==(__local uint *S_part = S_slice + (sp * SW * OW_BLOCK + PW); )==""\n"
93R"==(__local SRC_MMAD_DATA_T *S_work = S_slice + (sp * SW * OW_BLOCK); )==""\n"
94R"==(dst += OC_BLOCK * OD * OH * OW * MB_BLOCK * (group_oc + oc); )==""\n"
95R"==(dst += OC_BLOCK * OD * OH * OW * OC_NCHUNK * G * MB_BLOCK )==""\n"
96R"==(* (group_mb / MB_BLOCK); )==""\n"
97R"==(dst += OC_BLOCK * (group_mb % MB_BLOCK); )==""\n"
98R"==(dst += OC_BLOCK * MB_BLOCK * (OW * OH * od + OW * oh + ow); )==""\n"
99R"==(#if NCHW == 1 )==""\n"
100R"==(src += IC * ID * IH * IW * G * group_mb; )==""\n"
101R"==(src += (IW * IH * id + IW * ih + iw + PW); )==""\n"
102R"==(#else )==""\n"
103R"==(src += IC_BLOCK * ID * IH * IW * G * group_mb; )==""\n"
104R"==(src += IC_BLOCK * (IW * IH * id + IW * ih + iw + PW); )==""\n"
105R"==(#endif )==""\n"
106R"==(wei += 4 * KDHW_SIZE * OC_BLOCK * (group_oc + oc); )==""\n"
107R"==(/* WORK WITH SLM */ )==""\n"
108R"==(const bool left_tail = iw < 0 || iw >= IW; )==""\n"
109R"==(const bool left_nozero_tail = sub_group_id == 0 && iw >= 0; )==""\n"
110R"==(const bool right_tail = (iw + PW + SLM_TAIL >= IW) && (iw + PW < IW); )==""\n"
111R"==(const bool empty = (iw + PW >= IW); )==""\n"
112R"==(const bool right_nozero_tail )==""\n"
113R"==(= sp == (LWS_1 - 1) && (iw + PW + SLM_TAIL < IW); )==""\n"
114R"==(/* KD */ )==""\n"
115R"==(#if KD > 1 )==""\n"
116R"==(for (int kd = 0; kd < KD; kd++) { )==""\n"
117R"==(if (kd * (1 + DD) + id < 0 || kd * (1 + DD) + id >= ID) { )==""\n"
118R"==(S_part += SRC_SLM_SIZE * KH; )==""\n"
119R"==(src += IC_BLOCK * IW * IH * (1 + DD); )==""\n"
120R"==(continue; )==""\n"
121R"==(} )==""\n"
122R"==(#endif )==""\n"
123R"==(/* KH */ )==""\n"
124R"==(#if KH > 1 )==""\n"
125R"==(for (int kh = 0; kh < KH; kh++) { )==""\n"
126R"==(if (kh * (1 + DH) + ih < 0 || kh * (1 + DH) + ih >= IH) { )==""\n"
127R"==(S_part += SRC_SLM_SIZE; )==""\n"
128R"==(src += IC_BLOCK * IW * (1 + DH); )==""\n"
129R"==(continue; )==""\n"
130R"==(} )==""\n"
131R"==(#endif )==""\n"
132R"==(/* KW */ )==""\n"
133R"==(/* left tail */ )==""\n"
134R"==(if (is_mb_padding) { goto end_conv_calculate; } )==""\n"
135R"==(#if PW > 0 || OW != OWX )==""\n"
136R"==(if (left_tail) { )==""\n"
137R"==(for (int i = -PW; i < 0; i += 8) { )==""\n"
138R"==(if (i + sub_local_id < 0) S_part[i + sub_local_id] = 0; )==""\n"
139R"==(} )==""\n"
140R"==(} )==""\n"
141R"==(#endif )==""\n"
142R"==(/* right tail */ )==""\n"
143R"==(#if ZERO_TAIL > 0 )==""\n"
144R"==(if (right_tail) { )==""\n"
145R"==(for (int i = SLM_TAIL; i < SW * OW_BLOCK + (KW - 1) * (1 + DW); )==""\n"
146R"==(i++) { )==""\n"
147R"==(S_part[i] = 0; )==""\n"
148R"==(} )==""\n"
149R"==(} )==""\n"
150R"==(#if SLM_NCHUNK < OW_NCHUNK )==""\n"
151R"==(if (empty) { )==""\n"
152R"==(for (int i = 0; i < SW * OW_BLOCK + (KW - 1) * (1 + DW); i++) { )==""\n"
153R"==(S_part[i] = 0; )==""\n"
154R"==(} )==""\n"
155R"==(} )==""\n"
156R"==(#endif )==""\n"
157R"==(#endif )==""\n"
158R"==(#if SLM_NCHUNK < OW_NCHUNK || OW != OWX )==""\n"
159R"==(if (iw + PW < IW) { )==""\n"
160R"==(#endif )==""\n"
161R"==(#if OW_NCHUNK > LWS_1 )==""\n"
162R"==(/* Copy tails in case of multigroups */ )==""\n"
163R"==(if (ow < OW) { )==""\n"
164R"==(#if PW > 0 )==""\n"
165R"==(if (left_nozero_tail) { )==""\n"
166R"==(for (int i = -PW; i < 0; i += 8) { )==""\n"
167R"==(if (i + sub_local_id < 0) { )==""\n"
168R"==(#if NCHW == 1 )==""\n"
169R"==(GET_INT_BLOCK(S_part, i + sub_local_id, src, )==""\n"
170R"==(i + sub_local_id); )==""\n"
171R"==(#else )==""\n"
172R"==(S_part[i + sub_local_id] = (( )==""\n"
173R"==(__global uint *)src)[i + sub_local_id]; )==""\n"
174R"==(#endif )==""\n"
175R"==(} )==""\n"
176R"==(} )==""\n"
177R"==(} )==""\n"
178R"==(#endif )==""\n"
179R"==(if (right_nozero_tail) { )==""\n"
180R"==(for (int i = SW * OW_BLOCK; )==""\n"
181R"==(i < SW * OW_BLOCK + (KW - 1) * (1 + DW) - PW; )==""\n"
182R"==(i += 8) { )==""\n"
183R"==(if (i + sub_local_id < SW * OW_BLOCK )==""\n"
184R"==(+ (KW - 1) * (1 + DW) - PW) { )==""\n"
185R"==(#if NCHW == 1 )==""\n"
186R"==(GET_INT_BLOCK(S_part, i + sub_local_id, src, )==""\n"
187R"==(i + sub_local_id); )==""\n"
188R"==(#else )==""\n"
189R"==(S_part[i + sub_local_id] = (( )==""\n"
190R"==(__global uint *)src)[i + sub_local_id]; )==""\n"
191R"==(#endif )==""\n"
192R"==(} )==""\n"
193R"==(} )==""\n"
194R"==(} )==""\n"
195R"==(#endif )==""\n"
196R"==(#if SLM_TAIL != OW_BLOCK * SW )==""\n"
197R"==(/* Copy last block to SLM */ )==""\n"
198R"==(if (right_tail) { )==""\n"
199R"==(__attribute__((opencl_unroll_hint)) for (int i = 0; )==""\n"
200R"==(i < SLM_TAIL; )==""\n"
201R"==(i += 8) { )==""\n"
202R"==(if (i + sub_local_id < SLM_TAIL) { )==""\n"
203R"==(#if NCHW == 1 )==""\n"
204R"==(GET_INT_BLOCK(S_part, i + sub_local_id, src, )==""\n"
205R"==(i + sub_local_id); )==""\n"
206R"==(#else )==""\n"
207R"==(S_part[i + sub_local_id] = (( )==""\n"
208R"==(__global uint *)src)[i + sub_local_id]; )==""\n"
209R"==(#endif )==""\n"
210R"==(} )==""\n"
211R"==(} )==""\n"
212R"==(} else { )==""\n"
213R"==(#endif )==""\n"
214R"==(#if (SW * OW_BLOCK) % 8 == 0 )==""\n"
215R"==(/* Copy block to SLM */ )==""\n"
216R"==(__attribute__(( )==""\n"
217R"==(opencl_unroll_hint)) for (int i = 0; )==""\n"
218R"==(i < SW * OW_BLOCK; )==""\n"
219R"==(i += 8) { )==""\n"
220R"==(#if NCHW == 1 )==""\n"
221R"==(uchar4 res = 0; )==""\n"
222R"==(for (int j = 0; j < IC; j++) { )==""\n"
223R"==(if ((ulong)&src[(i + j * IH * IW * ID)] % 4 )==""\n"
224R"==(== 0) )==""\n"
225R"==(res[j] = intel_sub_group_block_read_uc( )==""\n"
226R"==(src + i + j * IH * IW * ID); )==""\n"
227R"==(else )==""\n"
228R"==(res[j] = src[i + j * IH * IW * ID )==""\n"
229R"==(+ get_sub_group_local_id()]; )==""\n"
230R"==(} )==""\n"
231R"==(WRITE_SLM_BLOCK(S_part + i, as_int(res)); )==""\n"
232R"==(#else )==""\n"
233R"==(WRITE_SLM_BLOCK(S_part + i, )==""\n"
234R"==(intel_sub_group_block_read(( )==""\n"
235R"==(const __global uint )==""\n"
236R"==(*)(&src[i * IC_BLOCK]))); )==""\n"
237R"==(#endif )==""\n"
238R"==(} )==""\n"
239R"==(#elif (SW * OW_BLOCK) % 4 == 0 && NCHW == 0 )==""\n"
240R"==(__attribute__((opencl_unroll_hint)) for (int i = 0; i < SW * OW_BLOCK; )==""\n"
241R"==(i += 4) { )==""\n"
242R"==(WRITE_SLM_BLOCK_SHORT(S_part + i, )==""\n"
243R"==(intel_sub_group_block_read_us( )==""\n"
244R"==((const __global ushort *)(&src[i * IC_BLOCK]))); )==""\n"
245R"==(} )==""\n"
246R"==(#else )==""\n"
247R"==(__attribute__((opencl_unroll_hint)) for (int i = 0; i < SW * OW_BLOCK; )==""\n"
248R"==(i++) { )==""\n"
249R"==(GET_INT_BLOCK(S_part, i, src, i); )==""\n"
250R"==(} )==""\n"
251R"==(#endif )==""\n"
252R"==(#if SLM_TAIL != OW_BLOCK * SW )==""\n"
253R"==(} )==""\n"
254R"==(#endif )==""\n"
255R"==(#if OW_NCHUNK > LWS_1 )==""\n"
256R"==(} )==""\n"
257R"==(#endif )==""\n"
258R"==(#if SLM_NCHUNK < OW_NCHUNK || OW != OWX )==""\n"
259R"==(} )==""\n"
260R"==(#endif )==""\n"
261R"==(/* right tail */ )==""\n"
262R"==(#if ZERO_TAIL > 0 )==""\n"
263R"==(if (right_tail) { )==""\n"
264R"==(for (int i = SLM_TAIL; i < SW * OW_BLOCK + (KW - 1) * (1 + DW); )==""\n"
265R"==(i += 8) { )==""\n"
266R"==(if (i + sub_local_id < SW * OW_BLOCK + (KW - 1) * (1 + DW)) )==""\n"
267R"==(S_part[i + sub_local_id] = 0; )==""\n"
268R"==(} )==""\n"
269R"==(} )==""\n"
270R"==(#if SLM_NCHUNK < OW_NCHUNK || OW != OWX )==""\n"
271R"==(if (empty) { )==""\n"
272R"==(for (int i = 0; i < SW * OW_BLOCK + (KW - 1) * (1 + DW); )==""\n"
273R"==(i += 8) { )==""\n"
274R"==(if (i + sub_local_id < SW * OW_BLOCK + (KW - 1) * (1 + DW)) )==""\n"
275R"==(S_part[i + sub_local_id] = 0; )==""\n"
276R"==(} )==""\n"
277R"==(} )==""\n"
278R"==(#endif )==""\n"
279R"==(#endif )==""\n"
280R"==(#if KH > 1 )==""\n"
281R"==(S_part += SRC_SLM_SIZE; )==""\n"
282R"==(src += IC_BLOCK * IW * (1 + DH); )==""\n"
283R"==(} )==""\n"
284R"==(S_part -= SRC_SLM_SIZE * KH; )==""\n"
285R"==(src -= IC_BLOCK * KH * IW * (1 + DH); )==""\n"
286R"==(#endif )==""\n"
287R"==(#if KD > 1 )==""\n"
288R"==(S_part += SRC_SLM_SIZE * KH; )==""\n"
289R"==(src += IC_BLOCK * IW * IH * (1 + DD); )==""\n"
290R"==(} )==""\n"
291R"==(#endif )==""\n"
292R"==(end_conv_calculate: )==""\n"
293R"==(barrier(CLK_LOCAL_MEM_FENCE); )==""\n"
294R"==(SRC_MMAD_DATA8_T S; )==""\n"
295R"==(int8 W0 = 0, W1 = 0, W2 = 0, W3 = 0; )==""\n"
296R"==(int W00 = 0, W10 = 0, W20 = 0, W30 = 0; )==""\n"
297R"==(int8 C00 = 0; )==""\n"
298R"==(int8 C10 = 0; )==""\n"
299R"==(int8 C20 = 0; )==""\n"
300R"==(int8 C30 = 0; )==""\n"
301R"==(#if OW_BLOCK == 12 )==""\n"
302R"==(SRC_MMAD_DATA4_T SS; )==""\n"
303R"==(int4 C01 = 0; )==""\n"
304R"==(int4 C11 = 0; )==""\n"
305R"==(int4 C21 = 0; )==""\n"
306R"==(int4 C31 = 0; )==""\n"
307R"==(#endif )==""\n"
308R"==(#if OW_BLOCK == 16 )==""\n"
309R"==(int8 C01 = 0; )==""\n"
310R"==(int8 C11 = 0; )==""\n"
311R"==(int8 C21 = 0; )==""\n"
312R"==(int8 C31 = 0; )==""\n"
313R"==(#endif )==""\n"
314R"==(#if WITH_SRC_ZPOINTS )==""\n"
315R"==(#if WITH_SRC_ZPOINTS_PER_IC )==""\n"
316R"==(int4 z = 0; )==""\n"
317R"==(if (IC > 0) z.s0 = src_zpoints[0]; )==""\n"
318R"==(if (IC > 1) z.s1 = src_zpoints[1]; )==""\n"
319R"==(if (IC > 2) z.s2 = src_zpoints[2]; )==""\n"
320R"==(if (IC > 3) z.s3 = src_zpoints[3]; )==""\n"
321R"==(#else )==""\n"
322R"==(int4 z = read_src_zero_point(src_zpoints); )==""\n"
323R"==(#endif )==""\n"
324R"==(#endif )==""\n"
325R"==(#if !WITH_SRC_ZPOINTS )==""\n"
326R"==(for (int i = 0; i < KDHW_SIZE - KDHW_SIZE % 8; i += 8) { )==""\n"
327R"==(const int ihw = (i + sub_local_id) % (KW * KH); )==""\n"
328R"==(const int filter_iw = (ihw % KW) * (1 + DW); )==""\n"
329R"==(const int filter_ih = ihw / KW; )==""\n"
330R"==(const int filter_id = (i + sub_local_id) / (KH * KW); )==""\n"
331R"==(const int filter = (filter_ih * (1 + DH) + ih >= 0) )==""\n"
332R"==(&& (filter_ih * (1 + DH) + ih < IH) )==""\n"
333R"==(&& (filter_id * (1 + DD) + id >= 0 )==""\n"
334R"==(&& filter_id * (1 + DD) + id < ID); )==""\n"
335R"==(BLOCK_READ_WHT8(W0, 0); )==""\n"
336R"==(#if OC_PADD8 * 4 > OC_BLOCK )==""\n"
337R"==(BLOCK_READ_WHT8(W1, KDHW_SIZE * OC_BLOCK); )==""\n"
338R"==(#endif )==""\n"
339R"==(#if OC_PADD8 * 4 > OC_BLOCK * 2 )==""\n"
340R"==(BLOCK_READ_WHT8(W2, 2 * KDHW_SIZE * OC_BLOCK); )==""\n"
341R"==(#endif )==""\n"
342R"==(#if OC_PADD8 * 4 > OC_BLOCK * 3 )==""\n"
343R"==(BLOCK_READ_WHT8(W3, 3 * KDHW_SIZE * OC_BLOCK); )==""\n"
344R"==(#endif )==""\n"
345R"==(if (filter) { )==""\n"
346R"==(S.s0 = S_work[SW * 0 + SRC_SLM_SIZE * KH * filter_id )==""\n"
347R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
348R"==(S.s1 = S_work[SW * 1 + SRC_SLM_SIZE * KH * filter_id )==""\n"
349R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
350R"==(S.s2 = S_work[SW * 2 + SRC_SLM_SIZE * KH * filter_id )==""\n"
351R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
352R"==(S.s3 = S_work[SW * 3 + SRC_SLM_SIZE * KH * filter_id )==""\n"
353R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
354R"==(S.s4 = S_work[SW * 4 + SRC_SLM_SIZE * KH * filter_id )==""\n"
355R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
356R"==(S.s5 = S_work[SW * 5 + SRC_SLM_SIZE * KH * filter_id )==""\n"
357R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
358R"==(S.s6 = S_work[SW * 6 + SRC_SLM_SIZE * KH * filter_id )==""\n"
359R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
360R"==(S.s7 = S_work[SW * 7 + SRC_SLM_SIZE * KH * filter_id )==""\n"
361R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
362R"==(#if OW_BLOCK == 12 )==""\n"
363R"==(SS.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
364R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
365R"==(SS.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
366R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
367R"==(SS.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
368R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
369R"==(SS.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
370R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
371R"==(#endif )==""\n"
372R"==(} else { )==""\n"
373R"==(S = 0; )==""\n"
374R"==(#if OW_BLOCK == 12 )==""\n"
375R"==(SS = 0; )==""\n"
376R"==(#endif )==""\n"
377R"==(} )==""\n"
378R"==(C00 = mmad8x8(S, W0, C00); )==""\n"
379R"==(C10 = mmad8x8(S, W1, C10); )==""\n"
380R"==(C20 = mmad8x8(S, W2, C20); )==""\n"
381R"==(C30 = mmad8x8(S, W3, C30); )==""\n"
382R"==(#if OW_BLOCK == 12 )==""\n"
383R"==(C01 = mmad8x4(SS, W0, C01); )==""\n"
384R"==(C11 = mmad8x4(SS, W1, C11); )==""\n"
385R"==(C21 = mmad8x4(SS, W2, C21); )==""\n"
386R"==(C31 = mmad8x4(SS, W3, C31); )==""\n"
387R"==(#endif )==""\n"
388R"==(#if OW_BLOCK == 16 )==""\n"
389R"==(if (filter) { )==""\n"
390R"==(S.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
391R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
392R"==(S.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
393R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
394R"==(S.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
395R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
396R"==(S.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
397R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
398R"==(S.s4 = S_work[SW * 12 + SRC_SLM_SIZE * KH * filter_id )==""\n"
399R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
400R"==(S.s5 = S_work[SW * 13 + SRC_SLM_SIZE * KH * filter_id )==""\n"
401R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
402R"==(S.s6 = S_work[SW * 14 + SRC_SLM_SIZE * KH * filter_id )==""\n"
403R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
404R"==(S.s7 = S_work[SW * 15 + SRC_SLM_SIZE * KH * filter_id )==""\n"
405R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
406R"==(} else { )==""\n"
407R"==(S = 0; )==""\n"
408R"==(} )==""\n"
409R"==(C01 = mmad8x8(S, W0, C01); )==""\n"
410R"==(C11 = mmad8x8(S, W1, C11); )==""\n"
411R"==(C21 = mmad8x8(S, W2, C21); )==""\n"
412R"==(C31 = mmad8x8(S, W3, C31); )==""\n"
413R"==(#endif )==""\n"
414R"==(wei += OC_BLOCK * 8; )==""\n"
415R"==(} )==""\n"
416R"==(#endif )==""\n"
417R"==(#if WITH_SRC_ZPOINTS )==""\n"
418R"==(for (int i = 0; i < KDHW_SIZE; i++) )==""\n"
419R"==(#else )==""\n"
420R"==(for (int i = KDHW_SIZE - KDHW_SIZE % 8; i < KDHW_SIZE; i++) )==""\n"
421R"==(#endif )==""\n"
422R"==({ )==""\n"
423R"==(const int ihw = (i) % (KW * KH); )==""\n"
424R"==(const int filter_iw = (ihw % KW) * (1 + DW); )==""\n"
425R"==(const int filter_ih = ihw / KW; )==""\n"
426R"==(const int filter_id = (i) / (KH * KW); )==""\n"
427R"==(const int filter = (filter_ih * (1 + DH) + ih >= 0) )==""\n"
428R"==(&& (filter_ih * (1 + DH) + ih < IH) )==""\n"
429R"==(&& (filter_id * (1 + DD) + id >= 0 )==""\n"
430R"==(&& filter_id * (1 + DD) + id < ID); )==""\n"
431R"==(if (filter) { )==""\n"
432R"==(BLOCK_READ_WHT(W00, 0); )==""\n"
433R"==(#if OC_PADD8 * 4 > OC_BLOCK )==""\n"
434R"==(BLOCK_READ_WHT(W10, KDHW_SIZE * OC_BLOCK); )==""\n"
435R"==(#endif )==""\n"
436R"==(#if OC_PADD8 * 4 > OC_BLOCK * 2 )==""\n"
437R"==(BLOCK_READ_WHT(W20, 2 * KDHW_SIZE * OC_BLOCK); )==""\n"
438R"==(#endif )==""\n"
439R"==(#if OC_PADD8 * 4 > OC_BLOCK * 3 )==""\n"
440R"==(BLOCK_READ_WHT(W30, 3 * KDHW_SIZE * OC_BLOCK); )==""\n"
441R"==(#endif )==""\n"
442R"==(#if WITH_SRC_ZPOINTS )==""\n"
443R"==(int4 src_comp; )==""\n"
444R"==(src_comp.s0 = calc_src_compensation_x4(z, W00); )==""\n"
445R"==(src_comp.s1 = calc_src_compensation_x4(z, W10); )==""\n"
446R"==(src_comp.s2 = calc_src_compensation_x4(z, W20); )==""\n"
447R"==(src_comp.s3 = calc_src_compensation_x4(z, W30); )==""\n"
448R"==(unroll_for(uint j = 0; j < 8; ++j) { )==""\n"
449R"==(if (filter_iw + iw + j * SW >= 0 )==""\n"
450R"==(&& filter_iw + iw + j * SW < IW) { )==""\n"
451R"==(C00[j] -= src_comp.s0; )==""\n"
452R"==(C10[j] -= src_comp.s1; )==""\n"
453R"==(C20[j] -= src_comp.s2; )==""\n"
454R"==(C30[j] -= src_comp.s3; )==""\n"
455R"==(} )==""\n"
456R"==(} )==""\n"
457R"==(#if (OW_BLOCK == 12) || (OW_BLOCK == 16) )==""\n"
458R"==(unroll_for(uint j = 8; j < OW_BLOCK; ++j) { )==""\n"
459R"==(if (filter_iw + iw + j * SW >= 0 )==""\n"
460R"==(&& filter_iw + iw + j * SW < IW) { )==""\n"
461R"==(C01[j - 8] -= src_comp.s0; )==""\n"
462R"==(C11[j - 8] -= src_comp.s1; )==""\n"
463R"==(C21[j - 8] -= src_comp.s2; )==""\n"
464R"==(C31[j - 8] -= src_comp.s3; )==""\n"
465R"==(} )==""\n"
466R"==(} )==""\n"
467R"==(#endif )==""\n"
468R"==(#endif )==""\n"
469R"==(S.s0 = S_work[SW * 0 + SRC_SLM_SIZE * KH * filter_id )==""\n"
470R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
471R"==(S.s1 = S_work[SW * 1 + SRC_SLM_SIZE * KH * filter_id )==""\n"
472R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
473R"==(S.s2 = S_work[SW * 2 + SRC_SLM_SIZE * KH * filter_id )==""\n"
474R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
475R"==(S.s3 = S_work[SW * 3 + SRC_SLM_SIZE * KH * filter_id )==""\n"
476R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
477R"==(S.s4 = S_work[SW * 4 + SRC_SLM_SIZE * KH * filter_id )==""\n"
478R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
479R"==(S.s5 = S_work[SW * 5 + SRC_SLM_SIZE * KH * filter_id )==""\n"
480R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
481R"==(S.s6 = S_work[SW * 6 + SRC_SLM_SIZE * KH * filter_id )==""\n"
482R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
483R"==(S.s7 = S_work[SW * 7 + SRC_SLM_SIZE * KH * filter_id )==""\n"
484R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
485R"==(#if OW_BLOCK == 12 )==""\n"
486R"==(SS.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
487R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
488R"==(SS.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
489R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
490R"==(SS.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
491R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
492R"==(SS.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
493R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
494R"==(#endif )==""\n"
495R"==(C00.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W00), C00.s0); )==""\n"
496R"==(C00.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W00), C00.s1); )==""\n"
497R"==(C00.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W00), C00.s2); )==""\n"
498R"==(C00.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W00), C00.s3); )==""\n"
499R"==(C00.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W00), C00.s4); )==""\n"
500R"==(C00.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W00), C00.s5); )==""\n"
501R"==(C00.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W00), C00.s6); )==""\n"
502R"==(C00.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W00), C00.s7); )==""\n"
503R"==(C10.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W10), C10.s0); )==""\n"
504R"==(C10.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W10), C10.s1); )==""\n"
505R"==(C10.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W10), C10.s2); )==""\n"
506R"==(C10.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W10), C10.s3); )==""\n"
507R"==(C10.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W10), C10.s4); )==""\n"
508R"==(C10.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W10), C10.s5); )==""\n"
509R"==(C10.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W10), C10.s6); )==""\n"
510R"==(C10.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W10), C10.s7); )==""\n"
511R"==(C20.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W20), C20.s0); )==""\n"
512R"==(C20.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W20), C20.s1); )==""\n"
513R"==(C20.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W20), C20.s2); )==""\n"
514R"==(C20.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W20), C20.s3); )==""\n"
515R"==(C20.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W20), C20.s4); )==""\n"
516R"==(C20.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W20), C20.s5); )==""\n"
517R"==(C20.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W20), C20.s6); )==""\n"
518R"==(C20.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W20), C20.s7); )==""\n"
519R"==(C30.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W30), C30.s0); )==""\n"
520R"==(C30.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W30), C30.s1); )==""\n"
521R"==(C30.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W30), C30.s2); )==""\n"
522R"==(C30.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W30), C30.s3); )==""\n"
523R"==(C30.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W30), C30.s4); )==""\n"
524R"==(C30.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W30), C30.s5); )==""\n"
525R"==(C30.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W30), C30.s6); )==""\n"
526R"==(C30.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W30), C30.s7); )==""\n"
527R"==(#if OW_BLOCK == 12 )==""\n"
528R"==(C01.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W00), C01.s0); )==""\n"
529R"==(C01.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W00), C01.s1); )==""\n"
530R"==(C01.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W00), C01.s2); )==""\n"
531R"==(C01.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W00), C01.s3); )==""\n"
532R"==(C11.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W10), C11.s0); )==""\n"
533R"==(C11.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W10), C11.s1); )==""\n"
534R"==(C11.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W10), C11.s2); )==""\n"
535R"==(C11.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W10), C11.s3); )==""\n"
536R"==(C21.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W20), C21.s0); )==""\n"
537R"==(C21.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W20), C21.s1); )==""\n"
538R"==(C21.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W20), C21.s2); )==""\n"
539R"==(C21.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W20), C21.s3); )==""\n"
540R"==(C31.s0 = idot4(AS_SRC_DATA4_T(SS.s0), as_char4(W30), C31.s0); )==""\n"
541R"==(C31.s1 = idot4(AS_SRC_DATA4_T(SS.s1), as_char4(W30), C31.s1); )==""\n"
542R"==(C31.s2 = idot4(AS_SRC_DATA4_T(SS.s2), as_char4(W30), C31.s2); )==""\n"
543R"==(C31.s3 = idot4(AS_SRC_DATA4_T(SS.s3), as_char4(W30), C31.s3); )==""\n"
544R"==(#endif )==""\n"
545R"==(#if OW_BLOCK == 16 )==""\n"
546R"==(S.s0 = S_work[SW * 8 + SRC_SLM_SIZE * KH * filter_id )==""\n"
547R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
548R"==(S.s1 = S_work[SW * 9 + SRC_SLM_SIZE * KH * filter_id )==""\n"
549R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
550R"==(S.s2 = S_work[SW * 10 + SRC_SLM_SIZE * KH * filter_id )==""\n"
551R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
552R"==(S.s3 = S_work[SW * 11 + SRC_SLM_SIZE * KH * filter_id )==""\n"
553R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
554R"==(S.s4 = S_work[SW * 12 + SRC_SLM_SIZE * KH * filter_id )==""\n"
555R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
556R"==(S.s5 = S_work[SW * 13 + SRC_SLM_SIZE * KH * filter_id )==""\n"
557R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
558R"==(S.s6 = S_work[SW * 14 + SRC_SLM_SIZE * KH * filter_id )==""\n"
559R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
560R"==(S.s7 = S_work[SW * 15 + SRC_SLM_SIZE * KH * filter_id )==""\n"
561R"==(+ SRC_SLM_SIZE * filter_ih + filter_iw]; )==""\n"
562R"==(C01.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W00), C01.s0); )==""\n"
563R"==(C01.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W00), C01.s1); )==""\n"
564R"==(C01.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W00), C01.s2); )==""\n"
565R"==(C01.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W00), C01.s3); )==""\n"
566R"==(C01.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W00), C01.s4); )==""\n"
567R"==(C01.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W00), C01.s5); )==""\n"
568R"==(C01.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W00), C01.s6); )==""\n"
569R"==(C01.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W00), C01.s7); )==""\n"
570R"==(C11.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W10), C11.s0); )==""\n"
571R"==(C11.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W10), C11.s1); )==""\n"
572R"==(C11.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W10), C11.s2); )==""\n"
573R"==(C11.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W10), C11.s3); )==""\n"
574R"==(C11.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W10), C11.s4); )==""\n"
575R"==(C11.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W10), C11.s5); )==""\n"
576R"==(C11.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W10), C11.s6); )==""\n"
577R"==(C11.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W10), C11.s7); )==""\n"
578R"==(C21.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W20), C21.s0); )==""\n"
579R"==(C21.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W20), C21.s1); )==""\n"
580R"==(C21.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W20), C21.s2); )==""\n"
581R"==(C21.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W20), C21.s3); )==""\n"
582R"==(C21.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W20), C21.s4); )==""\n"
583R"==(C21.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W20), C21.s5); )==""\n"
584R"==(C21.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W20), C21.s6); )==""\n"
585R"==(C21.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W20), C21.s7); )==""\n"
586R"==(C31.s0 = idot4(AS_SRC_DATA4_T(S.s0), as_char4(W30), C31.s0); )==""\n"
587R"==(C31.s1 = idot4(AS_SRC_DATA4_T(S.s1), as_char4(W30), C31.s1); )==""\n"
588R"==(C31.s2 = idot4(AS_SRC_DATA4_T(S.s2), as_char4(W30), C31.s2); )==""\n"
589R"==(C31.s3 = idot4(AS_SRC_DATA4_T(S.s3), as_char4(W30), C31.s3); )==""\n"
590R"==(C31.s4 = idot4(AS_SRC_DATA4_T(S.s4), as_char4(W30), C31.s4); )==""\n"
591R"==(C31.s5 = idot4(AS_SRC_DATA4_T(S.s5), as_char4(W30), C31.s5); )==""\n"
592R"==(C31.s6 = idot4(AS_SRC_DATA4_T(S.s6), as_char4(W30), C31.s6); )==""\n"
593R"==(C31.s7 = idot4(AS_SRC_DATA4_T(S.s7), as_char4(W30), C31.s7); )==""\n"
594R"==(#endif )==""\n"
595R"==(} )==""\n"
596R"==(wei += OC_BLOCK; )==""\n"
597R"==(} )==""\n"
598R"==(DST_DATA16_T R1, R2, R3, R4; )==""\n"
599R"==(#if SCALES_PER_OC )==""\n"
600R"==(float4 scales; )==""\n"
601R"==(block_read_scales( )==""\n"
602R"==(&scales, (group_oc + oc) * OC_BLOCK, sub_local_id, runtime_scales); )==""\n"
603R"==(#endif )==""\n"
604R"==(#if WITH_BIAS )==""\n"
605R"==(float4 bia; )==""\n"
606R"==(BLOCK_READ_BIA(bia, (group_oc + oc) * OC_BLOCK); )==""\n"
607R"==(#define QUANTIZE_ADD_BIAS() tmp = SCALE * fma(tmp, (float4)1, bia); )==""\n"
608R"==(#define QUANTIZE_ADD_BIAS_4() \ )==""\n"
609R"==(tmp0 = ((float8)SCALE_VEC4) * fma(tmp0, (float8)1, bia.s01230123); \ )==""\n"
610R"==(tmp1 = ((float8)SCALE_VEC4) * fma(tmp1, (float8)1, bia.s01230123); )==""\n"
611R"==(#else )==""\n"
612R"==(#define QUANTIZE_ADD_BIAS() tmp *= SCALE; )==""\n"
613R"==(#define QUANTIZE_ADD_BIAS_4() \ )==""\n"
614R"==(tmp0 *= SCALE_VEC4; \ )==""\n"
615R"==(tmp1 *= SCALE_VEC4; )==""\n"
616R"==(#endif )==""\n"
617R"==(#if WITH_DST_ZPOINTS )==""\n"
618R"==(int4 dst_zp = read_dst_zero_points_32c( )==""\n"
619R"==(dst_compensation, (group_oc + oc) * OC_BLOCK); )==""\n"
620R"==(#define ADD_DST_COMPENSATION() tmp += convert_float4(dst_zp); )==""\n"
621R"==(#define ADD_DST_COMPENSATION_4() \ )==""\n"
622R"==(tmp0 += convert_float8(dst_zp.s01230123); \ )==""\n"
623R"==(tmp1 += convert_float8(dst_zp.s01230123); )==""\n"
624R"==(#else )==""\n"
625R"==(#define ADD_DST_COMPENSATION() )==""\n"
626R"==(#define ADD_DST_COMPENSATION_4() )==""\n"
627R"==(#endif )==""\n"
628R"==(#if WITH_SRC_ZPOINTS )==""\n"
629R"==(#define ZERO_PAD_DST() tmp = zero_pad_dst_32c(tmp, (group_oc + oc) * OC_BLOCK); )==""\n"
630R"==(#define ZERO_PAD_DST_4() \ )==""\n"
631R"==(const int4 zp_mask = as_int4(zero_pad_dst_32c( \ )==""\n"
632R"==(as_float4((int4)(~0u)), (group_oc + oc) * OC_BLOCK)); \ )==""\n"
633R"==(tmp0 = as_float8(as_int8(tmp0) & zp_mask.s01230123); \ )==""\n"
634R"==(tmp1 = as_float8(as_int8(tmp1) & zp_mask.s01230123); )==""\n"
635R"==(#else )==""\n"
636R"==(#define ZERO_PAD_DST() )==""\n"
637R"==(#define ZERO_PAD_DST_4() )==""\n"
638R"==(#endif )==""\n"
639R"==(#if WITH_POST_OP )==""\n"
640R"==(#define POST_OP_COMMON(accumulator, dest_data, dest_data_type, oc_block, i) \ )==""\n"
641R"==(const int po_mb = group_mb % MB; \ )==""\n"
642R"==(const int po_oc = (oc * OC_BLOCK) % (OC * G); \ )==""\n"
643R"==(APPLY_POST_OPS_TRY_BURST(accumulator, float, dest_data, dest_data_type, \ )==""\n"
644R"==(po_mb, 1, po_oc, oc_block *SUB_GROUP_SIZE, sub_local_id); )==""\n"
645R"==(#define DO_POST_OP(i) \ )==""\n"
646R"==({ \ )==""\n"
647R"==(SUM_DATA4_T d; \ )==""\n"
648R"==(if (WITH_SUM) d = AS_SUM_DATA4_T(BLOCK_READ_DST4(dst)); \ )==""\n"
649R"==(POST_OP_COMMON(tmp, d, SUM_DATA_T, 4, i); \ )==""\n"
650R"==(} )==""\n"
651R"==(#define DO_POST_OP_4(i) \ )==""\n"
652R"==({ \ )==""\n"
653R"==(SUM_DATA16_T d; \ )==""\n"
654R"==(if (WITH_SUM) d = AS_SUM_DATA16_T(BLOCK_READ_DST16(dst)); \ )==""\n"
655R"==(float16 tmp_x16 = (float16)(tmp0, tmp1); \ )==""\n"
656R"==(POST_OP_COMMON(tmp_x16, d, SUM_DATA_T, 4, i); \ )==""\n"
657R"==(tmp0 = tmp_x16.s01234567; \ )==""\n"
658R"==(tmp1 = tmp_x16.s89abcdef; \ )==""\n"
659R"==(} )==""\n"
660R"==(#else )==""\n"
661R"==(#define DO_POST_OP(i) ; )==""\n"
662R"==(#define DO_POST_OP_4(i) ; )==""\n"
663R"==(#endif )==""\n"
664R"==(#define PACK(C0, C1, C2, C3, idx) \ )==""\n"
665R"==(do { \ )==""\n"
666R"==(tmp[0] = C0[idx]; \ )==""\n"
667R"==(tmp[1] = C1[idx]; \ )==""\n"
668R"==(tmp[2] = C2[idx]; \ )==""\n"
669R"==(tmp[3] = C3[idx]; \ )==""\n"
670R"==(} while (0) )==""\n"
671R"==(#define PACK_4(C0, C1, C2, C3, idx) \ )==""\n"
672R"==(do { \ )==""\n"
673R"==(tmp0.s0 = C0[idx]; \ )==""\n"
674R"==(tmp0.s1 = C1[idx]; \ )==""\n"
675R"==(tmp0.s2 = C2[idx]; \ )==""\n"
676R"==(tmp0.s3 = C3[idx]; \ )==""\n"
677R"==(\ )==""\n"
678R"==(tmp0.s4 = C0[idx + 1]; \ )==""\n"
679R"==(tmp0.s5 = C1[idx + 1]; \ )==""\n"
680R"==(tmp0.s6 = C2[idx + 1]; \ )==""\n"
681R"==(tmp0.s7 = C3[idx + 1]; \ )==""\n"
682R"==(\ )==""\n"
683R"==(tmp1.s0 = C0[idx + 2]; \ )==""\n"
684R"==(tmp1.s1 = C1[idx + 2]; \ )==""\n"
685R"==(tmp1.s2 = C2[idx + 2]; \ )==""\n"
686R"==(tmp1.s3 = C3[idx + 2]; \ )==""\n"
687R"==(\ )==""\n"
688R"==(tmp1.s4 = C0[idx + 3]; \ )==""\n"
689R"==(tmp1.s5 = C1[idx + 3]; \ )==""\n"
690R"==(tmp1.s6 = C2[idx + 3]; \ )==""\n"
691R"==(tmp1.s7 = C3[idx + 3]; \ )==""\n"
692R"==(} while (0) )==""\n"
693R"==(#define CONVERT_PACK() \ )==""\n"
694R"==(do { \ )==""\n"
695R"==(tmp_cvt = CONVERT_DST_DATA4_T(tmp); \ )==""\n"
696R"==(} while (0) )==""\n"
697R"==(#define CONVERT_PACK_4() \ )==""\n"
698R"==(do { \ )==""\n"
699R"==(R.s01234567 = CONVERT_DST_DATA8_T(tmp0); \ )==""\n"
700R"==(R.s89abcdef = CONVERT_DST_DATA8_T(tmp1); \ )==""\n"
701R"==(} while (0) )==""\n"
702R"==(#define STORE_DST(C0, C1, C2, C3, i) \ )==""\n"
703R"==(do { \ )==""\n"
704R"==(PACK(C0, C1, C2, C3, i); \ )==""\n"
705R"==(if (!is_mb_padding) { \ )==""\n"
706R"==(QUANTIZE_ADD_BIAS(); \ )==""\n"
707R"==(DO_POST_OP(i); \ )==""\n"
708R"==(ADD_DST_COMPENSATION(); \ )==""\n"
709R"==(ZERO_PAD_DST(); \ )==""\n"
710R"==(CONVERT_PACK(); \ )==""\n"
711R"==(} else { \ )==""\n"
712R"==(tmp_cvt = 0; \ )==""\n"
713R"==(} \ )==""\n"
714R"==(BLOCK_WRITE_DST4(dst, tmp_cvt); \ )==""\n"
715R"==(dst += OC_BLOCK * MB_BLOCK; \ )==""\n"
716R"==(} while (0) )==""\n"
717R"==(#define STORE_DST_4(C0, C1, C2, C3, i) \ )==""\n"
718R"==(do { \ )==""\n"
719R"==(PACK_4(C0, C1, C2, C3, i); \ )==""\n"
720R"==(QUANTIZE_ADD_BIAS_4(); \ )==""\n"
721R"==(DO_POST_OP_4(i); \ )==""\n"
722R"==(ADD_DST_COMPENSATION_4(); \ )==""\n"
723R"==(ZERO_PAD_DST_4(); \ )==""\n"
724R"==(CONVERT_PACK_4(); \ )==""\n"
725R"==(BLOCK_WRITE_DST16(dst, R); \ )==""\n"
726R"==(dst += 4 * OC_BLOCK; \ )==""\n"
727R"==(} while (0) )==""\n"
728R"==(if (ow < OW) { )==""\n"
729R"==(float4 tmp; )==""\n"
730R"==(DST_DATA4_T tmp_cvt; )==""\n"
731R"==(float8 tmp0, tmp1; )==""\n"
732R"==(DST_DATA16_T R; )==""\n"
733R"==(#if OW_TAIL )==""\n"
734R"==(if (ow + OW_BLOCK < OW) { )==""\n"
735R"==(#endif )==""\n"
736R"==(#if MB_BLOCK == 32 )==""\n"
737R"==(STORE_DST(C00, C10, C20, C30, 0); )==""\n"
738R"==(STORE_DST(C00, C10, C20, C30, 1); )==""\n"
739R"==(STORE_DST(C00, C10, C20, C30, 2); )==""\n"
740R"==(STORE_DST(C00, C10, C20, C30, 3); )==""\n"
741R"==(STORE_DST(C00, C10, C20, C30, 4); )==""\n"
742R"==(STORE_DST(C00, C10, C20, C30, 5); )==""\n"
743R"==(STORE_DST(C00, C10, C20, C30, 6); )==""\n"
744R"==(STORE_DST(C00, C10, C20, C30, 7); )==""\n"
745R"==(#if OW_BLOCK >= 12 )==""\n"
746R"==(STORE_DST(C01, C11, C21, C31, 0); )==""\n"
747R"==(STORE_DST(C01, C11, C21, C31, 1); )==""\n"
748R"==(STORE_DST(C01, C11, C21, C31, 2); )==""\n"
749R"==(STORE_DST(C01, C11, C21, C31, 3); )==""\n"
750R"==(#endif )==""\n"
751R"==(#if OW_BLOCK == 16 )==""\n"
752R"==(STORE_DST(C01, C11, C21, C31, 4); )==""\n"
753R"==(STORE_DST(C01, C11, C21, C31, 5); )==""\n"
754R"==(STORE_DST(C01, C11, C21, C31, 6); )==""\n"
755R"==(STORE_DST(C01, C11, C21, C31, 7); )==""\n"
756R"==(#endif )==""\n"
757R"==(#else )==""\n"
758R"==(STORE_DST_4(C00, C10, C20, C30, 0); )==""\n"
759R"==(STORE_DST_4(C00, C10, C20, C30, 4); )==""\n"
760R"==(#if OW_BLOCK >= 12 )==""\n"
761R"==(STORE_DST_4(C01, C11, C21, C31, 0); )==""\n"
762R"==(#endif )==""\n"
763R"==(#if OW_BLOCK >= 16 )==""\n"
764R"==(STORE_DST_4(C01, C11, C21, C31, 4); )==""\n"
765R"==(#endif )==""\n"
766R"==(#endif )==""\n"
767R"==(#if OW_TAIL )==""\n"
768R"==(} else { )==""\n"
769R"==(#if OW_TAIL < 4 )==""\n"
770R"==(for (int i = 0; i < OW_TAIL; i++) { )==""\n"
771R"==(STORE_DST(C00, C10, C20, C30, i); )==""\n"
772R"==(} )==""\n"
773R"==(#else )==""\n"
774R"==(#if MB_BLOCK == 32 )==""\n"
775R"==(STORE_DST(C00, C10, C20, C30, 0); )==""\n"
776R"==(STORE_DST(C00, C10, C20, C30, 1); )==""\n"
777R"==(STORE_DST(C00, C10, C20, C30, 2); )==""\n"
778R"==(STORE_DST(C00, C10, C20, C30, 3); )==""\n"
779R"==(#else )==""\n"
780R"==(STORE_DST_4(C00, C10, C20, C30, 0); )==""\n"
781R"==(#endif )==""\n"
782R"==(#endif )==""\n"
783R"==(#if OW_TAIL > 4 )==""\n"
784R"==(#if OW_TAIL < 8 )==""\n"
785R"==(for (int i = 4; i < OW_TAIL; i++) { )==""\n"
786R"==(STORE_DST(C00, C10, C20, C30, i); )==""\n"
787R"==(} )==""\n"
788R"==(#else )==""\n"
789R"==(#if MB_BLOCK == 32 )==""\n"
790R"==(STORE_DST(C00, C10, C20, C30, 4); )==""\n"
791R"==(STORE_DST(C00, C10, C20, C30, 5); )==""\n"
792R"==(STORE_DST(C00, C10, C20, C30, 6); )==""\n"
793R"==(STORE_DST(C00, C10, C20, C30, 7); )==""\n"
794R"==(#else )==""\n"
795R"==(STORE_DST_4(C00, C10, C20, C30, 4); )==""\n"
796R"==(#endif )==""\n"
797R"==(#endif )==""\n"
798R"==(#if OW_TAIL > 8 )==""\n"
799R"==(#if OW_TAIL < 12 )==""\n"
800R"==(for (int i = 8; i < OW_TAIL; i++) { )==""\n"
801R"==(STORE_DST(C01, C11, C21, C31, i - 8); )==""\n"
802R"==(} )==""\n"
803R"==(#else )==""\n"
804R"==(#if MB_BLOCK == 32 )==""\n"
805R"==(STORE_DST(C01, C11, C21, C31, 0); )==""\n"
806R"==(STORE_DST(C01, C11, C21, C31, 1); )==""\n"
807R"==(STORE_DST(C01, C11, C21, C31, 2); )==""\n"
808R"==(STORE_DST(C01, C11, C21, C31, 3); )==""\n"
809R"==(#else )==""\n"
810R"==(STORE_DST_4(C01, C11, C21, C31, 0); )==""\n"
811R"==(#endif )==""\n"
812R"==(#endif )==""\n"
813R"==(#if OW_TAIL > 12 )==""\n"
814R"==(#if OW_TAIL < 16 )==""\n"
815R"==(for (int i = 12; i < OW_TAIL; i++) { )==""\n"
816R"==(STORE_DST(C01, C11, C21, C31, i - 8); )==""\n"
817R"==(} )==""\n"
818R"==(#else )==""\n"
819R"==(#if MB_BLOCK == 32 )==""\n"
820R"==(STORE_DST(C01, C11, C21, C31, 4); )==""\n"
821R"==(STORE_DST(C01, C11, C21, C31, 5); )==""\n"
822R"==(STORE_DST(C01, C11, C21, C31, 6); )==""\n"
823R"==(STORE_DST(C01, C11, C21, C31, 7); )==""\n"
824R"==(#else )==""\n"
825R"==(STORE_DST_4(C01, C11, C21, C31, 4); )==""\n"
826R"==(#endif )==""\n"
827R"==(#endif )==""\n"
828R"==(#endif )==""\n"
829R"==(#endif )==""\n"
830R"==(#endif )==""\n"
831R"==(} )==""\n"
832R"==(#endif )==""\n"
833R"==(} )==""\n"
834R"==(} )==""\n"
835R"==()==";
836}
837}
838}
839}