1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *xe_lp_conv_nhwc_fwd_dw_ow_block_x8s8x_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2020-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
21R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
23R"==(#define PWX (PW > 1 ? PW : 1) )==""\n"
24R"==(#if SCALES_PER_OC )==""\n"
25R"==(#define SCALE scales.s0101010101010101 )==""\n"
26R"==(#elif SCALES_COMMON )==""\n"
27R"==(#define SCALE runtime_scales[0] )==""\n"
28R"==(#else )==""\n"
29R"==(#define SCALE 1 )==""\n"
30R"==(#endif )==""\n"
31R"==(void block_read_dst( )==""\n"
32R"==(int n, DST_DATA_T *d, const __global DST_DATA_T *dst, const int g); )==""\n"
33R"==(void block_write_dst( )==""\n"
34R"==(int n, const DST_DATA_T *d, __global DST_DATA_T *dst, const int g); )==""\n"
35R"==(void block_read_src(int n, ushort *s, const __global ushort *src, const int g); )==""\n"
36R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
37R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) __kernel void )==""\n"
38R"==(conv_nhwc_fwd_dw_ow_block_x8s8x(const __global uchar *src, )==""\n"
39R"==(const __global char *wei, const __global float *bias, )==""\n"
40R"==(__global DST_DATA_T *dst POST_OP_ARGS, )==""\n"
41R"==(const __global float *runtime_scales, )==""\n"
42R"==(const __global int *src_compensation, const __global int *src_zpoints, )==""\n"
43R"==(const __global int *dst_compensation) { )==""\n"
44R"==(const int osp = get_global_id(1); )==""\n"
45R"==(const int od = osp / (OWB * OH); )==""\n"
46R"==(const int ohw = osp % (OWB * OH); )==""\n"
47R"==(const int ow = (ohw % OWB) * OW_BLOCK; )==""\n"
48R"==(const int oh = ohw / OWB; )==""\n"
49R"==(const int g = get_group_id(0) * OC_BLOCK; )==""\n"
50R"==(const int mb = get_global_id(2) * MB_BLOCK; )==""\n"
51R"==(const int id = od * SD - PD; )==""\n"
52R"==(const int ih = oh * SH - PH; )==""\n"
53R"==(const int iw = ow * SW - PW; )==""\n"
54R"==(const int sglid = get_sub_group_local_id(); )==""\n"
55R"==(int dst_off, src_off; )==""\n"
56R"==(dst_off = mb * OD * OH * OW * G + (od * OH * OW + oh * OW + ow) * G + g; )==""\n"
57R"==(src_off = mb * ID * IH * IW * G + (id * IH * IW + ih * IW + iw) * G + g; )==""\n"
58R"==(dst += dst_off; )==""\n"
59R"==(src += src_off; )==""\n"
60R"==(wei += g * KD * KH * KW; )==""\n"
61R"==(int16 S0 = 0; )==""\n"
62R"==(int16 S1 = 0; )==""\n"
63R"==(#if SCALES_PER_OC )==""\n"
64R"==(float2 scales = 1; )==""\n"
65R"==(scales.s0 = runtime_scales[g + 2 * sglid]; )==""\n"
66R"==(if (g + 2 * sglid + 1 < G) scales.s1 = runtime_scales[g + 2 * sglid + 1]; )==""\n"
67R"==(#endif )==""\n"
68R"==(for (int kd = 0; kd < KD; kd++) { )==""\n"
69R"==(if (kd * (1 + DD) + id < 0 || kd * (1 + DD) + id >= ID) { )==""\n"
70R"==(src += G * IH * IW * (1 + DD); )==""\n"
71R"==(wei += OC_BLOCK * KH * KW; )==""\n"
72R"==(continue; )==""\n"
73R"==(} )==""\n"
74R"==(__attribute__((opencl_unroll_hint)) for (int kh = 0; kh < KH; kh++) { )==""\n"
75R"==(if (kh * (1 + DH) + ih < 0 || kh * (1 + DH) + ih >= IH) { )==""\n"
76R"==(src += G * MB_BLOCK * IW * (1 + DH); )==""\n"
77R"==(wei += OC_BLOCK * KW; )==""\n"
78R"==(continue; )==""\n"
79R"==(} )==""\n"
80R"==(#if SW == 2 )==""\n"
81R"==(ushort16 AAA = 0; )==""\n"
82R"==(#else )==""\n"
83R"==(ushort AAA; )==""\n"
84R"==(#endif )==""\n"
85R"==(ushort16 AA = 0; )==""\n"
86R"==(#if OW % (SW * OW_BLOCK) + KW - 1 \ )==""\n"
87R"==(>= SW - 1 + PW )==""\n"
88R"==(/* get main block */ )==""\n"
89R"==(if (iw + SW * (OW_BLOCK) + KW - 1 > IW) { )==""\n"
90R"==(if (iw >= 0) { )==""\n"
91R"==(block_read_src(1, ((ushort *)&AA) + 0, )==""\n"
92R"==((const __global ushort *)(&src[0 * G]), g); )==""\n"
93R"==(#if PW >= 2 )==""\n"
94R"==(block_read_src(1, ((ushort *)&AA) + 1, )==""\n"
95R"==((const __global ushort *)(&src[1 * G]), g); )==""\n"
96R"==(#endif )==""\n"
97R"==(#if PW >= 3 )==""\n"
98R"==(block_read_src(1, ((ushort *)&AA) + 2, )==""\n"
99R"==((const __global ushort *)(&src[2 * G]), g); )==""\n"
100R"==(#endif )==""\n"
101R"==(} )==""\n"
102R"==(#if IW_TAIL > 16 )==""\n"
103R"==(#if PW == 2 )==""\n"
104R"==(block_read_src(8, ((ushort *)&AA) + 2, )==""\n"
105R"==((const __global ushort *)(&src[(PWX)*G]), g); )==""\n"
106R"==(block_read_src(4, ((ushort *)&AA) + 10, )==""\n"
107R"==((const __global ushort *)(&src[(PWX + 8) * G]), g); )==""\n"
108R"==(block_read_src(2, ((ushort *)&AA) + 14, )==""\n"
109R"==((const __global ushort *)(&src[(PWX + 12) * G]), g); )==""\n"
110R"==(#elif PW == 3 )==""\n"
111R"==(block_read_src(8, ((ushort *)&AA) + 3, )==""\n"
112R"==((const __global ushort *)(&src[(PWX)*G]), g); )==""\n"
113R"==(block_read_src(4, ((ushort *)&AA) + 11, )==""\n"
114R"==((const __global ushort *)(&src[(PWX + 8) * G]), g); )==""\n"
115R"==(block_read_src(1, ((ushort *)&AA) + 15, )==""\n"
116R"==((const __global ushort *)(&src[(PWX + 12) * G]), g); )==""\n"
117R"==(#else )==""\n"
118R"==(block_read_src(8, ((ushort *)&AA) + 1, )==""\n"
119R"==((const __global ushort *)(&src[(PWX)*G]), g); )==""\n"
120R"==(block_read_src(4, ((ushort *)&AA) + 9, )==""\n"
121R"==((const __global ushort *)(&src[(PWX + 8) * G]), g); )==""\n"
122R"==(block_read_src(2, ((ushort *)&AA) + 13, )==""\n"
123R"==((const __global ushort *)(&src[(PWX + 12) * G]), g); )==""\n"
124R"==(block_read_src(1, ((ushort *)&AA) + 15, )==""\n"
125R"==((const __global ushort *)(&src[(PWX + 14) * G]), g); )==""\n"
126R"==(#endif )==""\n"
127R"==(#if ((IW_TAIL - 16) & 0b1000) == 0b1000 )==""\n"
128R"==(block_read_src(8, ((ushort *)&AAA), )==""\n"
129R"==((const __global ushort *)(&src[(16) * G]), g); )==""\n"
130R"==(#endif )==""\n"
131R"==(#if ((IW_TAIL - 16) & 0b100) == 0b100 )==""\n"
132R"==(block_read_src(4, ((ushort *)&AAA) + ((IW_TAIL - 16) & 0b1000), )==""\n"
133R"==((const __global ushort )==""\n"
134R"==(*)(&src[(16 + ((IW_TAIL - 16) & 0b1000)) )==""\n"
135R"==(* G]), )==""\n"
136R"==(g); )==""\n"
137R"==(#endif )==""\n"
138R"==(#if ((IW_TAIL - 16) & 0b10) == 0b10 )==""\n"
139R"==(block_read_src(2, ((ushort *)&AAA) + ((IW_TAIL - 16) & 0b1100), )==""\n"
140R"==((const __global ushort )==""\n"
141R"==(*)(&src[(16 + ((IW_TAIL - 16) & 0b1100)) )==""\n"
142R"==(* G]), )==""\n"
143R"==(g); )==""\n"
144R"==(#endif )==""\n"
145R"==(#if ((IW_TAIL - 16) & 0b1) == 0b1 )==""\n"
146R"==(block_read_src(1, ((ushort *)&AAA) + ((IW_TAIL - 16) & 0b1110), )==""\n"
147R"==((const __global ushort )==""\n"
148R"==(*)(&src[(16 + ((IW_TAIL - 16) & 0b1110)) )==""\n"
149R"==(* G]), )==""\n"
150R"==(g); )==""\n"
151R"==(#endif )==""\n"
152R"==(#else )==""\n"
153R"==(#if ((IW_TAIL - PWX) & 0b1000) == 0b1000 )==""\n"
154R"==(block_read_src(8, ((ushort *)&AA) + PWX, )==""\n"
155R"==((const __global ushort *)(&src[(PWX)*G]), g); )==""\n"
156R"==(#endif )==""\n"
157R"==(#if ((IW_TAIL - PWX) & 0b100) == 0b100 )==""\n"
158R"==(block_read_src(4, )==""\n"
159R"==(((ushort *)&AA) + PWX + ((IW_TAIL - PWX) & 0b1000), )==""\n"
160R"==((const __global ushort )==""\n"
161R"==(*)(&src[(PWX )==""\n"
162R"==(+ ((IW_TAIL - PWX) )==""\n"
163R"==(& 0b1000)) )==""\n"
164R"==(* G]), )==""\n"
165R"==(g); )==""\n"
166R"==(#endif )==""\n"
167R"==(#if ((IW_TAIL - PWX) & 0b10) == 0b10 )==""\n"
168R"==(block_read_src(2, )==""\n"
169R"==(((ushort *)&AA) + PWX + ((IW_TAIL - PWX) & 0b1100), )==""\n"
170R"==((const __global ushort )==""\n"
171R"==(*)(&src[(PWX )==""\n"
172R"==(+ ((IW_TAIL - PWX) )==""\n"
173R"==(& 0b1100)) )==""\n"
174R"==(* G]), )==""\n"
175R"==(g); )==""\n"
176R"==(#endif )==""\n"
177R"==(#if ((IW_TAIL - PWX) & 0b1) == 0b1 )==""\n"
178R"==(block_read_src(1, )==""\n"
179R"==(((ushort *)&AA) + PWX + ((IW_TAIL - PWX) & 0b1110), )==""\n"
180R"==((const __global ushort )==""\n"
181R"==(*)(&src[(PWX )==""\n"
182R"==(+ ((IW_TAIL - PWX) )==""\n"
183R"==(& 0b1110)) )==""\n"
184R"==(* G]), )==""\n"
185R"==(g); )==""\n"
186R"==(#endif )==""\n"
187R"==(#endif )==""\n"
188R"==(} else { )==""\n"
189R"==(#endif )==""\n"
190R"==(#if SW == 1 )==""\n"
191R"==(#define READ_BLOCK (OW_BLOCK + KW - 1 - PWX) )==""\n"
192R"==(if (iw >= 0) { )==""\n"
193R"==(block_read_src(1, ((ushort *)&AA) + 0, )==""\n"
194R"==((const __global ushort *)(&src[0 * G]), g); )==""\n"
195R"==(#if PW >= 2 )==""\n"
196R"==(block_read_src(1, ((ushort *)&AA) + 1, )==""\n"
197R"==((const __global ushort *)(&src[1 * G]), g); )==""\n"
198R"==(#endif )==""\n"
199R"==(#if PW >= 3 )==""\n"
200R"==(block_read_src(1, ((ushort *)&AA) + 2, )==""\n"
201R"==((const __global ushort *)(&src[2 * G]), g); )==""\n"
202R"==(#endif )==""\n"
203R"==(} )==""\n"
204R"==(#if (READ_BLOCK & 0b1000) == 0b1000 )==""\n"
205R"==(block_read_src(8, ((ushort *)&AA) + PWX, )==""\n"
206R"==((const __global ushort *)(&src[(PWX)*G]), g); )==""\n"
207R"==(#endif )==""\n"
208R"==(#if (READ_BLOCK & 0b100) == 0b100 )==""\n"
209R"==(block_read_src(4, ((ushort *)&AA) + PWX + (READ_BLOCK & 0b1000), )==""\n"
210R"==((const __global ushort )==""\n"
211R"==(*)(&src[(PWX + (READ_BLOCK & 0b1000)) )==""\n"
212R"==(* G]), )==""\n"
213R"==(g); )==""\n"
214R"==(#endif )==""\n"
215R"==(#if (READ_BLOCK & 0b10) == 0b10 )==""\n"
216R"==(block_read_src(2, ((ushort *)&AA) + PWX + (READ_BLOCK & 0b1100), )==""\n"
217R"==((const __global ushort )==""\n"
218R"==(*)(&src[(PWX + (READ_BLOCK & 0b1100)) )==""\n"
219R"==(* G]), )==""\n"
220R"==(g); )==""\n"
221R"==(#endif )==""\n"
222R"==(#if (READ_BLOCK & 0b1) == 0b1 )==""\n"
223R"==(block_read_src(1, ((ushort *)&AA) + PWX + (READ_BLOCK & 0b1110), )==""\n"
224R"==((const __global ushort )==""\n"
225R"==(*)(&src[(PWX + (READ_BLOCK & 0b1110)) )==""\n"
226R"==(* G]), )==""\n"
227R"==(g); )==""\n"
228R"==(#endif )==""\n"
229R"==(#elif SW == 2 )==""\n"
230R"==(#if OW_BLOCK + KW - 1 >= 8 )==""\n"
231R"==(#define READ_BLOCK (2 * (OW_BLOCK) + KW - 1) )==""\n"
232R"==(if (iw >= 0) { )==""\n"
233R"==(block_read_src(16, (ushort *)&AA, )==""\n"
234R"==((const __global ushort *)(&src[0 * G]), g); )==""\n"
235R"==(} else { )==""\n"
236R"==(#if PW == 0 )==""\n"
237R"==(block_read_src(16, (ushort *)&AA, )==""\n"
238R"==((const __global ushort *)(&src[0 * G]), g); )==""\n"
239R"==(#elif PW == 2 )==""\n"
240R"==(block_read_src(8, ((ushort *)&AA) + 2, )==""\n"
241R"==((const __global ushort *)(&src[(PW)*G]), g); )==""\n"
242R"==(block_read_src(4, ((ushort *)&AA) + 10, )==""\n"
243R"==((const __global ushort *)(&src[(PW + 8) * G]), g); )==""\n"
244R"==(block_read_src(2, ((ushort *)&AA) + 14, )==""\n"
245R"==((const __global ushort *)(&src[(PW + 12) * G]), g); )==""\n"
246R"==(#elif PW == 3 )==""\n"
247R"==(block_read_src(8, ((ushort *)&AA) + 3, )==""\n"
248R"==((const __global ushort *)(&src[(PW)*G]), g); )==""\n"
249R"==(block_read_src(4, ((ushort *)&AA) + 11, )==""\n"
250R"==((const __global ushort *)(&src[(PW + 8) * G]), g); )==""\n"
251R"==(block_read_src(1, ((ushort *)&AA) + 15, )==""\n"
252R"==((const __global ushort *)(&src[(PW + 12) * G]), g); )==""\n"
253R"==(#else )==""\n"
254R"==(block_read_src(8, ((ushort *)&AA) + 1, )==""\n"
255R"==((const __global ushort *)(&src[(PW)*G]), g); )==""\n"
256R"==(block_read_src(4, ((ushort *)&AA) + 9, )==""\n"
257R"==((const __global ushort *)(&src[(PW + 8) * G]), g); )==""\n"
258R"==(block_read_src(2, ((ushort *)&AA) + 13, )==""\n"
259R"==((const __global ushort *)(&src[(PW + 12) * G]), g); )==""\n"
260R"==(block_read_src(1, ((ushort *)&AA) + 15, )==""\n"
261R"==((const __global ushort *)(&src[(PW + 14) * G]), g); )==""\n"
262R"==(#endif )==""\n"
263R"==(} )==""\n"
264R"==(#if ((READ_BLOCK - 16) & 0b1000) == 0b1000 )==""\n"
265R"==(block_read_src(8, ((ushort *)&AAA), )==""\n"
266R"==((const __global ushort *)(&src[(16) * G]), g); )==""\n"
267R"==(#endif )==""\n"
268R"==(#if ((READ_BLOCK - 16) & 0b100) == 0b100 )==""\n"
269R"==(block_read_src(4, ((ushort *)&AAA) + ((READ_BLOCK - 16) & 0b1000), )==""\n"
270R"==((const __global ushort )==""\n"
271R"==(*)(&src[(16 + ((READ_BLOCK - 16) & 0b1000)) )==""\n"
272R"==(* G]), )==""\n"
273R"==(g); )==""\n"
274R"==(#endif )==""\n"
275R"==(#if ((READ_BLOCK - 16) & 0b10) == 0b10 )==""\n"
276R"==(block_read_src(2, ((ushort *)&AAA) + ((READ_BLOCK - 16) & 0b1100), )==""\n"
277R"==((const __global ushort )==""\n"
278R"==(*)(&src[(16 + ((READ_BLOCK - 16) & 0b1100)) )==""\n"
279R"==(* G]), )==""\n"
280R"==(g); )==""\n"
281R"==(#endif )==""\n"
282R"==(#if ((READ_BLOCK - 16) & 0b1) == 0b1 )==""\n"
283R"==(block_read_src(1, ((ushort *)&AAA) + ((READ_BLOCK - 16) & 0b1110), )==""\n"
284R"==((const __global ushort )==""\n"
285R"==(*)(&src[(16 + ((READ_BLOCK - 16) & 0b1110)) )==""\n"
286R"==(* G]), )==""\n"
287R"==(g); )==""\n"
288R"==(#endif )==""\n"
289R"==(#else )==""\n"
290R"==(#define READ_BLOCK (2 * (OW_BLOCK) + KW - 1 - PWX) )==""\n"
291R"==(if (iw >= 0) { )==""\n"
292R"==(block_read_src(1, ((ushort *)&AA) + 0, )==""\n"
293R"==((const __global ushort *)(&src[0 * G]), g); )==""\n"
294R"==(#if PW >= 2 )==""\n"
295R"==(block_read_src(1, ((ushort *)&AA) + 1, )==""\n"
296R"==((const __global ushort *)(&src[1 * G]), g); )==""\n"
297R"==(#endif )==""\n"
298R"==(#if PW >= 3 )==""\n"
299R"==(block_read_src(1, ((ushort *)&AA) + 2, )==""\n"
300R"==((const __global ushort *)(&src[2 * G]), g); )==""\n"
301R"==(#endif )==""\n"
302R"==(} )==""\n"
303R"==(#if (READ_BLOCK & 0b1000) == 0b1000 )==""\n"
304R"==(block_read_src(8, ((ushort *)&AA) + PWX, )==""\n"
305R"==((const __global ushort *)(&src[(PWX)*G]), g); )==""\n"
306R"==(#endif )==""\n"
307R"==(#if (READ_BLOCK & 0b100) == 0b100 )==""\n"
308R"==(block_read_src(4, ((ushort *)&AA) + PWX + (READ_BLOCK & 0b1000), )==""\n"
309R"==((const __global ushort )==""\n"
310R"==(*)(&src[(PWX + (READ_BLOCK & 0b1000)) * G]), )==""\n"
311R"==(g); )==""\n"
312R"==(#endif )==""\n"
313R"==(#if (READ_BLOCK & 0b10) == 0b10 )==""\n"
314R"==(block_read_src(2, ((ushort *)&AA) + PWX + (READ_BLOCK & 0b1100), )==""\n"
315R"==((const __global ushort )==""\n"
316R"==(*)(&src[(PWX + (READ_BLOCK & 0b1100)) * G]), )==""\n"
317R"==(g); )==""\n"
318R"==(#endif )==""\n"
319R"==(#if (READ_BLOCK & 0b1) == 0b1 )==""\n"
320R"==(block_read_src(2, ((ushort *)&AA) + PWX + (READ_BLOCK & 0b1110), )==""\n"
321R"==((const __global ushort )==""\n"
322R"==(*)(&src[(PWX + (READ_BLOCK & 0b1110)) * G]), )==""\n"
323R"==(g); )==""\n"
324R"==(#endif )==""\n"
325R"==(#endif )==""\n"
326R"==(#endif )==""\n"
327R"==(#if OW % (SW * OW_BLOCK) + KW - 1 >= SW - 1 + PW )==""\n"
328R"==(} )==""\n"
329R"==(#endif )==""\n"
330R"==(#if OW > OWX )==""\n"
331R"==(if (iw + READ_BLOCK > IW) { )==""\n"
332R"==(if (iw < IW) { )==""\n"
333R"==(for (int i = IW - iw; i < READ_BLOCK; i++) { )==""\n"
334R"==(if (i < 16) { )==""\n"
335R"==(((ushort *)&AA)[i] = 0; )==""\n"
336R"==(} )==""\n"
337R"==(#if SW == 2 )==""\n"
338R"==(else { )==""\n"
339R"==(((ushort *)&AAA)[i - 16] = 0; )==""\n"
340R"==(} )==""\n"
341R"==(#endif )==""\n"
342R"==(} )==""\n"
343R"==(} else { )==""\n"
344R"==(AA = 0; )==""\n"
345R"==(#if SW == 2 )==""\n"
346R"==(AAA = 0; )==""\n"
347R"==(#endif )==""\n"
348R"==(} )==""\n"
349R"==(} )==""\n"
350R"==(#endif )==""\n"
351R"==(ushort4 WW = 0; )==""\n"
352R"==(#if KW > 4 )==""\n"
353R"==(#error "Wrong KW" )==""\n"
354R"==(#endif )==""\n"
355R"==(#if KW == 4 )==""\n"
356R"==(WW = intel_sub_group_block_read_us4((const __global ushort *)wei); )==""\n"
357R"==(#endif )==""\n"
358R"==(#if KW == 3 )==""\n"
359R"==(WW.s01 = intel_sub_group_block_read_us2( )==""\n"
360R"==((const __global ushort *)wei); )==""\n"
361R"==(WW.s2 = intel_sub_group_block_read_us( )==""\n"
362R"==((const __global ushort *)wei + OC_BLOCK); )==""\n"
363R"==(#endif )==""\n"
364R"==(#if KW == 2 )==""\n"
365R"==(WW.s01 = intel_sub_group_block_read_us2( )==""\n"
366R"==((const __global ushort *)wei); )==""\n"
367R"==(#endif )==""\n"
368R"==(#if KW == 1 )==""\n"
369R"==(WW.s0 = intel_sub_group_block_read_us((const __global ushort *)wei); )==""\n"
370R"==(#endif )==""\n"
371R"==(SRC_DATA16_T A0 = 0, A1 = 0; )==""\n"
372R"==(A0.s01234567 = AS_SRC_DATA16_T(AA.s01234567).s02468ace; )==""\n"
373R"==(A0.s89abcdef = AS_SRC_DATA16_T(AA.s89abcdef).s02468ace; )==""\n"
374R"==(A1.s01234567 = AS_SRC_DATA16_T(AA.s01234567).s13579bdf; )==""\n"
375R"==(A1.s89abcdef = AS_SRC_DATA16_T(AA.s89abcdef).s13579bdf; )==""\n"
376R"==(#if SW == 2 )==""\n"
377R"==(SRC_DATA16_T right0, right1; )==""\n"
378R"==(right0.s01234567 = AS_SRC_DATA16_T(AAA.s01234567).s02468ace; )==""\n"
379R"==(right0.s89abcdef = AS_SRC_DATA16_T(AAA.s89abcdef).s02468ace; )==""\n"
380R"==(right1.s01234567 = AS_SRC_DATA16_T(AAA.s01234567).s13579bdf; )==""\n"
381R"==(right1.s89abcdef = AS_SRC_DATA16_T(AAA.s89abcdef).s13579bdf; )==""\n"
382R"==(#else )==""\n"
383R"==(#if OW_BLOCK >= 14 )==""\n"
384R"==(SRC_DATA_T right0, right1; )==""\n"
385R"==(right0 = AS_SRC_DATA2_T(AAA).s0; )==""\n"
386R"==(right1 = AS_SRC_DATA2_T(AAA).s1; )==""\n"
387R"==(#endif )==""\n"
388R"==(#endif )==""\n"
389R"==(char8 W = as_char8(WW); )==""\n"
390R"==(#if SW == 1 )==""\n"
391R"==(S0.s0 = idot4(A0.s0123, W.s0246, S0.s0); )==""\n"
392R"==(#if OW_BLOCK >= 2 )==""\n"
393R"==(S0.s2 = idot4(A0.s1234, W.s0246, S0.s2); )==""\n"
394R"==(#endif )==""\n"
395R"==(#if OW_BLOCK >= 3 )==""\n"
396R"==(S0.s4 = idot4(A0.s2345, W.s0246, S0.s4); )==""\n"
397R"==(#endif )==""\n"
398R"==(#if OW_BLOCK >= 4 )==""\n"
399R"==(S0.s6 = idot4(A0.s3456, W.s0246, S0.s6); )==""\n"
400R"==(#endif )==""\n"
401R"==(#if OW_BLOCK >= 5 )==""\n"
402R"==(S0.s8 = idot4(A0.s4567, W.s0246, S0.s8); )==""\n"
403R"==(#endif )==""\n"
404R"==(#if OC_BLOCK >= 6 )==""\n"
405R"==(S0.sa = idot4(A0.s5678, W.s0246, S0.sa); )==""\n"
406R"==(#endif )==""\n"
407R"==(#if OW_BLOCK >= 7 )==""\n"
408R"==(S0.sc = idot4(A0.s6789, W.s0246, S0.sc); )==""\n"
409R"==(#endif )==""\n"
410R"==(#if OW_BLOCK >= 8 )==""\n"
411R"==(S0.se = idot4(A0.s789a, W.s0246, S0.se); )==""\n"
412R"==(#endif )==""\n"
413R"==(#if OW_BLOCK >= 9 )==""\n"
414R"==(S1.s0 = idot4(A0.s89ab, W.s0246, S1.s0); )==""\n"
415R"==(#endif )==""\n"
416R"==(#if OW_BLOCK >= 10 )==""\n"
417R"==(S1.s2 = idot4(A0.s9abc, W.s0246, S1.s2); )==""\n"
418R"==(#endif )==""\n"
419R"==(#if OW_BLOCK >= 11 )==""\n"
420R"==(S1.s4 = idot4(A0.sabcd, W.s0246, S1.s4); )==""\n"
421R"==(#endif )==""\n"
422R"==(#if OW_BLOCK >= 12 )==""\n"
423R"==(S1.s6 = idot4(A0.sbcde, W.s0246, S1.s6); )==""\n"
424R"==(#endif )==""\n"
425R"==(#if OW_BLOCK >= 13 )==""\n"
426R"==(S1.s8 = idot4(A0.scdef, W.s0246, S1.s8); )==""\n"
427R"==(#endif )==""\n"
428R"==(#if OW_BLOCK >= 14 )==""\n"
429R"==(S1.sa = idot4((SRC_DATA4_T)(A0.sde, A0.sf, right0), W.s0246, S1.sa); )==""\n"
430R"==(#endif )==""\n"
431R"==(#if OW_BLOCK >= 15 )==""\n"
432R"==(S1.sc = idot4( )==""\n"
433R"==((SRC_DATA4_T)(A0.sef, right0, right1), W.s0246, S1.sc); )==""\n"
434R"==(#endif )==""\n"
435R"==(S0.s1 = idot4(A1.s0123, W.s1357, S0.s1); )==""\n"
436R"==(#if OW_BLOCK >= 2 )==""\n"
437R"==(S0.s3 = idot4(A1.s1234, W.s1357, S0.s3); )==""\n"
438R"==(#endif )==""\n"
439R"==(#if OW_BLOCK >= 3 )==""\n"
440R"==(S0.s5 = idot4(A1.s2345, W.s1357, S0.s5); )==""\n"
441R"==(#endif )==""\n"
442R"==(#if OW_BLOCK >= 4 )==""\n"
443R"==(S0.s7 = idot4(A1.s3456, W.s1357, S0.s7); )==""\n"
444R"==(#endif )==""\n"
445R"==(#if OW_BLOCK >= 5 )==""\n"
446R"==(S0.s9 = idot4(A1.s4567, W.s1357, S0.s9); )==""\n"
447R"==(#endif )==""\n"
448R"==(#if OW_BLOCK >= 6 )==""\n"
449R"==(S0.sb = idot4(A1.s5678, W.s1357, S0.sb); )==""\n"
450R"==(#endif )==""\n"
451R"==(#if OW_BLOCK >= 7 )==""\n"
452R"==(S0.sd = idot4(A1.s6789, W.s1357, S0.sd); )==""\n"
453R"==(#endif )==""\n"
454R"==(#if OW_BLOCK >= 8 )==""\n"
455R"==(S0.sf = idot4(A1.s789a, W.s1357, S0.sf); )==""\n"
456R"==(#endif )==""\n"
457R"==(#if OW_BLOCK >= 9 )==""\n"
458R"==(S1.s1 = idot4(A1.s89ab, W.s1357, S1.s1); )==""\n"
459R"==(#endif )==""\n"
460R"==(#if OW_BLOCK >= 10 )==""\n"
461R"==(S1.s3 = idot4(A1.s9abc, W.s1357, S1.s3); )==""\n"
462R"==(#endif )==""\n"
463R"==(#if OW_BLOCK >= 11 )==""\n"
464R"==(S1.s5 = idot4(A1.sabcd, W.s1357, S1.s5); )==""\n"
465R"==(#endif )==""\n"
466R"==(#if OW_BLOCK >= 12 )==""\n"
467R"==(S1.s7 = idot4(A1.sbcde, W.s1357, S1.s7); )==""\n"
468R"==(#endif )==""\n"
469R"==(#if OW_BLOCK >= 13 )==""\n"
470R"==(S1.s9 = idot4(A1.scdef, W.s1357, S1.s9); )==""\n"
471R"==(#endif )==""\n"
472R"==(#if OW_BLOCK >= 14 )==""\n"
473R"==(S1.sb = idot4((SRC_DATA4_T)(A1.sde, A1.sf, 0), W.s1357, S1.sb); )==""\n"
474R"==(#endif )==""\n"
475R"==(#if OW_BLOCK >= 15 )==""\n"
476R"==(S1.sd = idot4((SRC_DATA4_T)(A1.sef, right1, 0), W.s1357, S1.sd); )==""\n"
477R"==(#endif )==""\n"
478R"==(#elif SW == 2 )==""\n"
479R"==(S0.s0 = idot4(A0.s0123, W.s0246, S0.s0); )==""\n"
480R"==(#if OW_BLOCK >= 2 )==""\n"
481R"==(S0.s2 = idot4(A0.s2345, W.s0246, S0.s2); )==""\n"
482R"==(#endif )==""\n"
483R"==(#if OW_BLOCK >= 3 )==""\n"
484R"==(S0.s4 = idot4(A0.s4567, W.s0246, S0.s4); )==""\n"
485R"==(#endif )==""\n"
486R"==(#if OW_BLOCK >= 4 )==""\n"
487R"==(S0.s6 = idot4(A0.s6789, W.s0246, S0.s6); )==""\n"
488R"==(#endif )==""\n"
489R"==(#if OW_BLOCK >= 5 )==""\n"
490R"==(S0.s8 = idot4(A0.s89ab, W.s0246, S0.s8); )==""\n"
491R"==(#endif )==""\n"
492R"==(#if OW_BLOCK >= 6 )==""\n"
493R"==(S0.sa = idot4(A0.sabcd, W.s0246, S0.sa); )==""\n"
494R"==(#endif )==""\n"
495R"==(#if OW_BLOCK >= 7 )==""\n"
496R"==(S0.sc = idot4(A0.scdef, W.s0246, S0.sc); )==""\n"
497R"==(#endif )==""\n"
498R"==(#if OW_BLOCK >= 8 )==""\n"
499R"==(S0.se = idot4((SRC_DATA4_T)(A0.sef, right0.s0, right0.s1), W.s0246, )==""\n"
500R"==(S0.se); )==""\n"
501R"==(#endif )==""\n"
502R"==(#if OW_BLOCK >= 9 )==""\n"
503R"==(S1.s0 = idot4(right0.s0123, W.s0246, S1.s0); )==""\n"
504R"==(#endif )==""\n"
505R"==(#if OW_BLOCK >= 10 )==""\n"
506R"==(S1.s2 = idot4(right0.s2345, W.s0246, S1.s2); )==""\n"
507R"==(#endif )==""\n"
508R"==(#if OW_BLOCK >= 11 )==""\n"
509R"==(S1.s4 = idot4(right0.s4567, W.s0246, S1.s4); )==""\n"
510R"==(#endif )==""\n"
511R"==(#if OW_BLOCK >= 12 )==""\n"
512R"==(S1.s6 = idot4(right0.s6789, W.s0246, S1.s6); )==""\n"
513R"==(#endif )==""\n"
514R"==(#if OW_BLOCK >= 13 )==""\n"
515R"==(S1.s8 = idot4(right0.s89ab, W.s0246, S1.s8); )==""\n"
516R"==(#endif )==""\n"
517R"==(#if OW_BLOCK >= 14 )==""\n"
518R"==(S1.sa = idot4(right0.sabcd, W.s0246, S1.sa); )==""\n"
519R"==(#endif )==""\n"
520R"==(#if OW_BLOCK >= 15 )==""\n"
521R"==(S1.sc = idot4(right0.scdef, W.s0246, S1.sc); )==""\n"
522R"==(#endif )==""\n"
523R"==(S0.s1 = idot4(A1.s0123, W.s1357, S0.s1); )==""\n"
524R"==(#if OW_BLOCK >= 2 )==""\n"
525R"==(S0.s3 = idot4(A1.s2345, W.s1357, S0.s3); )==""\n"
526R"==(#endif )==""\n"
527R"==(#if OW_BLOCK >= 3 )==""\n"
528R"==(S0.s5 = idot4(A1.s4567, W.s1357, S0.s5); )==""\n"
529R"==(#endif )==""\n"
530R"==(#if OW_BLOCK >= 4 )==""\n"
531R"==(S0.s7 = idot4(A1.s6789, W.s1357, S0.s7); )==""\n"
532R"==(#endif )==""\n"
533R"==(#if OW_BLOCK >= 5 )==""\n"
534R"==(S0.s9 = idot4(A1.s89ab, W.s1357, S0.s9); )==""\n"
535R"==(#endif )==""\n"
536R"==(#if OW_BLOCK >= 6 )==""\n"
537R"==(S0.sb = idot4(A1.sabcd, W.s1357, S0.sb); )==""\n"
538R"==(#endif )==""\n"
539R"==(#if OW_BLOCK >= 7 )==""\n"
540R"==(S0.sd = idot4(A1.scdef, W.s1357, S0.sd); )==""\n"
541R"==(#endif )==""\n"
542R"==(#if OW_BLOCK >= 8 )==""\n"
543R"==(S0.sf = idot4((SRC_DATA4_T)(A1.sef, right1.s0, right1.s1), W.s1357, )==""\n"
544R"==(S0.sf); )==""\n"
545R"==(#endif )==""\n"
546R"==(#if OW_BLOCK >= 9 )==""\n"
547R"==(S1.s1 = idot4(right1.s0123, W.s1357, S1.s1); )==""\n"
548R"==(#endif )==""\n"
549R"==(#if OW_BLOCK >= 10 )==""\n"
550R"==(S1.s3 = idot4(right1.s2345, W.s1357, S1.s3); )==""\n"
551R"==(#endif )==""\n"
552R"==(#if OW_BLOCK >= 11 )==""\n"
553R"==(S1.s5 = idot4(right1.s4567, W.s1357, S1.s5); )==""\n"
554R"==(#endif )==""\n"
555R"==(#if OW_BLOCK >= 12 )==""\n"
556R"==(S1.s7 = idot4(right1.s6789, W.s1357, S1.s7); )==""\n"
557R"==(#endif )==""\n"
558R"==(#if OW_BLOCK >= 13 )==""\n"
559R"==(S1.s9 = idot4(right1.s89ab, W.s1357, S1.s9); )==""\n"
560R"==(#endif )==""\n"
561R"==(#if OW_BLOCK >= 14 )==""\n"
562R"==(S1.sb = idot4(right1.sabcd, W.s1357, S1.sb); )==""\n"
563R"==(#endif )==""\n"
564R"==(#if OW_BLOCK >= 15 )==""\n"
565R"==(S1.sd = idot4(right1.scdef, W.s1357, S1.sd); )==""\n"
566R"==(#endif )==""\n"
567R"==(#else )==""\n"
568R"==(#error )==""\n"
569R"==(#endif )==""\n"
570R"==(src += G * IW * (1 + DH); )==""\n"
571R"==(wei += OC_BLOCK * KW; )==""\n"
572R"==(} )==""\n"
573R"==(src += G * IW * (IH * (1 + DD) - KH * (1 + DH)); )==""\n"
574R"==(} )==""\n"
575R"==(#if WITH_BIAS || (WITH_POST_OP && !SUM_SCALE1 || SCALES_PER_OC || SCALES_COMMON) )==""\n"
576R"==(float16 tmp00 = convert_float16(S0); )==""\n"
577R"==(float16 tmp01 = convert_float16(S1); )==""\n"
578R"==(#if WITH_BIAS )==""\n"
579R"==(float2 B; )==""\n"
580R"==(B.s0 = bias[g + 2 * sglid]; )==""\n"
581R"==(B.s1 = bias[g + 2 * sglid + 1]; )==""\n"
582R"==(tmp00 += convert_float16(B.s0101010101010101); )==""\n"
583R"==(tmp01 += convert_float16(B.s0101010101010101); )==""\n"
584R"==(#endif )==""\n"
585R"==(tmp00 *= SCALE; )==""\n"
586R"==(tmp01 *= SCALE; )==""\n"
587R"==(#define ACC_DATA_TYPE float )==""\n"
588R"==(#define ACC0 tmp00 )==""\n"
589R"==(#define ACC1 tmp01 )==""\n"
590R"==(#else )==""\n"
591R"==(#define ACC_DATA_TYPE int )==""\n"
592R"==(#define ACC0 S0 )==""\n"
593R"==(#define ACC1 S1 )==""\n"
594R"==(#endif )==""\n"
595R"==(DST_DATA16_T D0 = 0; )==""\n"
596R"==(DST_DATA16_T D1 = 0; )==""\n"
597R"==(#if WITH_SUM )==""\n"
598R"==(if (OW_TAIL != 0 && ow + OW_BLOCK >= OW) { )==""\n"
599R"==(block_read_dst(min(8, OW_TAIL), &D0, dst, g); )==""\n"
600R"==(block_read_dst(OW_TAIL - 8, &D1, dst + 8 * G, g); )==""\n"
601R"==(} else { )==""\n"
602R"==(block_read_dst(min(8, OW_BLOCK), &D0, dst, g); )==""\n"
603R"==(block_read_dst(OW_BLOCK - 8, &D1, dst + 8 * G, g); )==""\n"
604R"==(} )==""\n"
605R"==(#endif )==""\n"
606R"==(SUM_DATA16_T D0_sdt = AS_SUM_DATA16_T(D0); )==""\n"
607R"==(SUM_DATA16_T D1_sdt = AS_SUM_DATA16_T(D1); )==""\n"
608R"==(#define APPLY_POST_OPS_COMMON(accumulator, sum, offset) \ )==""\n"
609R"==({ \ )==""\n"
610R"==(for (int didx = 0; didx < 16; ++didx) { \ )==""\n"
611R"==(int po_mb = mb; \ )==""\n"
612R"==(int po_oc = g * OC + 2 * get_sub_group_local_id() + (didx % 2); \ )==""\n"
613R"==(ACC_DATA_TYPE accum = accumulator[didx]; \ )==""\n"
614R"==(SUM_DATA_T sum_di = sum[didx]; \ )==""\n"
615R"==(APPLY_POST_OPS_SERIAL_BINARY_2D(accum, ACC_DATA_TYPE, sum_di, \ )==""\n"
616R"==(SUM_DATA_T, po_mb, 1, po_oc, 1); \ )==""\n"
617R"==(accumulator[didx] = accum; \ )==""\n"
618R"==(} \ )==""\n"
619R"==(} )==""\n"
620R"==(APPLY_POST_OPS_COMMON(ACC0, D0_sdt, 0); )==""\n"
621R"==(APPLY_POST_OPS_COMMON(ACC1, D1_sdt, 8); )==""\n"
622R"==(DST_DATA16_T R0 = CONVERT_DST_DATA16_T(ACC0); )==""\n"
623R"==(DST_DATA16_T R1 = CONVERT_DST_DATA16_T(ACC1); )==""\n"
624R"==(if (OW_TAIL != 0 && ow + OW_BLOCK > OW) { )==""\n"
625R"==(block_write_dst(min(8, OW_TAIL), &R0, dst, g); )==""\n"
626R"==(block_write_dst(OW_TAIL - 8, &R1, dst + 8 * G, g); )==""\n"
627R"==(} else { )==""\n"
628R"==(block_write_dst(min(8, OW_BLOCK), &R0, dst, g); )==""\n"
629R"==(block_write_dst(OW_BLOCK - 8, &R1, dst + 8 * G, g); )==""\n"
630R"==(} )==""\n"
631R"==(} )==""\n"
632R"==(void block_read_dst( )==""\n"
633R"==(int n, DST_DATA_T *d, const __global DST_DATA_T *dst, const int g) { )==""\n"
634R"==(int sglid = get_sub_group_local_id(); )==""\n"
635R"==(#if DST_DT_S8 || DST_DT_U8 )==""\n"
636R"==(__attribute__((opencl_unroll_hint)) )==""\n"
637R"==(for (int i = 0; i < n; i++) { )==""\n"
638R"==(const int gtail = (g + OC_BLOCK) <= G ? G : G % OC_BLOCK; )==""\n"
639R"==(if (2 * sglid < gtail) { d[i * 2] = dst[i * G + 2 * sglid]; } )==""\n"
640R"==(if (1 + 2 * sglid < gtail) { )==""\n"
641R"==(d[i * 2 + 1] = dst[i * G + 1 + 2 * sglid]; )==""\n"
642R"==(} )==""\n"
643R"==(} )==""\n"
644R"==(return; )==""\n"
645R"==(#elif DST_DT_S32 || DST_DT_F32 )==""\n"
646R"==(__attribute__((opencl_unroll_hint)) )==""\n"
647R"==(for (int i = 0; i < n; i++) { )==""\n"
648R"==(uint block0 = 0, block1 = 0; )==""\n"
649R"==(if ((g + OC_BLOCK) <= G || sglid < (G % OC_BLOCK)) { )==""\n"
650R"==(block0 = ((const __global uint *)(dst + i * G))[sglid]; )==""\n"
651R"==(} )==""\n"
652R"==(if ((g + OC_BLOCK) <= G || sglid < (G % OC_BLOCK) - SUB_GROUP_SIZE) { )==""\n"
653R"==(block1 = ((const __global uint *)(dst + i * G )==""\n"
654R"==(+ SUB_GROUP_SIZE))[sglid]; )==""\n"
655R"==(} )==""\n"
656R"==(int from00 = min(2 * sglid, SUB_GROUP_SIZE - 1); )==""\n"
657R"==(int from01 = max(2 * sglid - 16, 0); )==""\n"
658R"==(int from10 = min(2 * sglid + 1, SUB_GROUP_SIZE - 1); )==""\n"
659R"==(int from11 = max(2 * sglid + 1 - 16, 0); )==""\n"
660R"==(uint block00 = intel_sub_group_shuffle(block0, from00); )==""\n"
661R"==(uint block01 = intel_sub_group_shuffle(block1, from01); )==""\n"
662R"==(uint block10 = intel_sub_group_shuffle(block0, from10); )==""\n"
663R"==(uint block11 = intel_sub_group_shuffle(block1, from11); )==""\n"
664R"==(block0 = (2 * sglid < SUB_GROUP_SIZE) ? block00 : block01; )==""\n"
665R"==(block1 = (2 * sglid + 1 < SUB_GROUP_SIZE) ? block10 : block11; )==""\n"
666R"==(*(DST_DATA2_T *)(&d[i * 2]) = AS_DST_DATA2_T((uint2)(block0, block1)); )==""\n"
667R"==(} )==""\n"
668R"==(return; )==""\n"
669R"==(#else )==""\n"
670R"==(#error "Not expected" )==""\n"
671R"==(#endif )==""\n"
672R"==(} )==""\n"
673R"==(void block_write_dst( )==""\n"
674R"==(int n, const DST_DATA_T *d, __global DST_DATA_T *dst, const int g) { )==""\n"
675R"==(int sglid = get_sub_group_local_id(); )==""\n"
676R"==(__attribute__((opencl_unroll_hint)) )==""\n"
677R"==(for (int i = 0; i < n; i++) { )==""\n"
678R"==(DST_DATA2_T block = AS_DST_DATA2_T(*(DST_DATA2_T *)(&d[i * 2])); )==""\n"
679R"==(const int gtail = (g + OC_BLOCK) <= G ? G : G % OC_BLOCK; )==""\n"
680R"==(if (2 * sglid < gtail) { dst[i * G + 2 * sglid] = block.S0; } )==""\n"
681R"==(if (1 + 2 * sglid < gtail) { dst[i * G + 1 + 2 * sglid] = block.S1; } )==""\n"
682R"==(} )==""\n"
683R"==(return; )==""\n"
684R"==(} )==""\n"
685R"==(void block_read_src(int n, ushort *s, const __global ushort *src, const int g) { )==""\n"
686R"==(int sglid = get_sub_group_local_id(); )==""\n"
687R"==(#if G % 2 == 0 )==""\n"
688R"==(for (int i = 0; i < n; i++) { )==""\n"
689R"==(if (sglid < ((g + IC_BLOCK) <= G ? 16 : (G % IC_BLOCK) / 2)) { )==""\n"
690R"==(s[i] = src[i * G / 2 + sglid]; )==""\n"
691R"==(} else )==""\n"
692R"==(s[i] = 0; )==""\n"
693R"==(} )==""\n"
694R"==(#else )==""\n"
695R"==(uchar *sc = (uchar *)s; )==""\n"
696R"==(const __global uchar *src_c = (const __global uchar *)src; )==""\n"
697R"==(if ((g + IC_BLOCK) <= G || (g + IC_BLOCK > G && sglid < G % IC_BLOCK - 1)) { )==""\n"
698R"==(for (int i = 0; i < n; i++) { )==""\n"
699R"==(sc[i * 2] = src_c[i * G + sglid * 2]; )==""\n"
700R"==(sc[i * 2 + 1] = src_c[i * G + sglid * 2 + 1]; )==""\n"
701R"==(} )==""\n"
702R"==(} else if (sglid == G % IC_BLOCK - 1) { )==""\n"
703R"==(for (int i = 0; i < n; i++) { )==""\n"
704R"==(sc[i * 2] = src_c[i * G + sglid * 2]; )==""\n"
705R"==(sc[i * 2 + 1] = 0; )==""\n"
706R"==(} )==""\n"
707R"==(} else { )==""\n"
708R"==(for (int i = 0; i < n; i++) { )==""\n"
709R"==(sc[i * 2] = 0; )==""\n"
710R"==(sc[i * 2 + 1] = 0; )==""\n"
711R"==(} )==""\n"
712R"==(} )==""\n"
713R"==(#endif )==""\n"
714R"==(} )==""\n"
715R"==()==";
716}
717}
718}
719}