1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *xe_lp_conv_dw_fwd_data_ow_block_x8s8x_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2020-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
21R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
23R"==(inline ushort16 read_block16(const __global ushort *p) )==""\n"
24R"==(__attribute__((overloadable)) { )==""\n"
25R"==(ushort16 __builtin_IB_simd_block_read_16_global_h(const __global ushort *p) )==""\n"
26R"==(__attribute__((const)); )==""\n"
27R"==(return __builtin_IB_simd_block_read_16_global_h(p); )==""\n"
28R"==(} )==""\n"
29R"==(#define PWX (PW > 1 ? PW : 1) )==""\n"
30R"==(#if SCALES_PER_OC )==""\n"
31R"==(#define SCALE scales.s0101010101010101 )==""\n"
32R"==(#elif SCALES_COMMON )==""\n"
33R"==(#define SCALE runtime_scales[0] )==""\n"
34R"==(#else )==""\n"
35R"==(#define SCALE 1 )==""\n"
36R"==(#endif )==""\n"
37R"==(void block_read_dst(int n, DST_DATA_T *d, const __global DST_DATA_T *dst); )==""\n"
38R"==(void block_write_dst(int n, const DST_DATA_T *d, __global DST_DATA_T *dst); )==""\n"
39R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
40R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) __kernel void )==""\n"
41R"==(conv_dw_fwd_ow_block_x8s8x(const __global uchar *src, const __global char *wei, )==""\n"
42R"==(const __global float *bias, __global DST_DATA_T *dst POST_OP_ARGS, )==""\n"
43R"==(const __global float *runtime_scales, )==""\n"
44R"==(const __global int *src_compensation, const __global int *src_zpoints, )==""\n"
45R"==(const __global int *dst_compensation) { )==""\n"
46R"==(const int osp = get_global_id(1); )==""\n"
47R"==(const int od = osp / (OWB * OH); )==""\n"
48R"==(const int ohw = osp % (OWB * OH); )==""\n"
49R"==(const int ow = (ohw % OWB) * OW_BLOCK; )==""\n"
50R"==(const int oh = ohw / OWB; )==""\n"
51R"==(const int g = get_group_id(0) * OC_BLOCK; )==""\n"
52R"==(const int mb = get_global_id(2) * MB_BLOCK; )==""\n"
53R"==(const int id = od * SD - PD; )==""\n"
54R"==(const int ih = oh * SH - PH; )==""\n"
55R"==(const int iw = ow * SW - PW; )==""\n"
56R"==(const __global uchar *src_original = src; )==""\n"
57R"==(dst += mb * G_PADDED * OD * OH * OW + g * OD * OH * OW * MB_BLOCK )==""\n"
58R"==(+ (od * OH * OW + oh * OW + ow) * MB_BLOCK * OC_BLOCK; )==""\n"
59R"==(src += mb * G_PADDED * ID * IH * IW + g * ID * IH * IW * MB_BLOCK )==""\n"
60R"==(+ (id * IH * IW + ih * IW + iw) * MB_BLOCK * IC_BLOCK; )==""\n"
61R"==(wei += g * KD * KH * KW; )==""\n"
62R"==(int16 S0 = 0; )==""\n"
63R"==(int16 S1 = 0; )==""\n"
64R"==(#if SCALES_PER_OC )==""\n"
65R"==(float2 scales = 1; )==""\n"
66R"==(if (g + 2 * get_sub_group_local_id() < G) )==""\n"
67R"==(scales.s0 = runtime_scales[g + 2 * get_sub_group_local_id()]; )==""\n"
68R"==(if (g + 2 * get_sub_group_local_id() + 1 < G) )==""\n"
69R"==(scales.s1 = runtime_scales[g + 2 * get_sub_group_local_id() + 1]; )==""\n"
70R"==(#endif )==""\n"
71R"==(for (int kd = 0; kd < KD; kd++) { )==""\n"
72R"==(if (kd * (1 + DD) + id < 0 || kd * (1 + DD) + id >= ID) { )==""\n"
73R"==(src += IC_BLOCK * MB_BLOCK * IH * IW * (1 + DD); )==""\n"
74R"==(wei += OC_BLOCK * KH * KW; )==""\n"
75R"==(continue; )==""\n"
76R"==(} )==""\n"
77R"==(__attribute__((opencl_unroll_hint)) for (int kh = 0; kh < KH; kh++) { )==""\n"
78R"==(if (kh * (1 + DH) + ih < 0 || kh * (1 + DH) + ih >= IH) { )==""\n"
79R"==(src += IC_BLOCK * MB_BLOCK * IW * (1 + DH); )==""\n"
80R"==(wei += OC_BLOCK * KW; )==""\n"
81R"==(continue; )==""\n"
82R"==(} )==""\n"
83R"==(#if SW == 2 )==""\n"
84R"==(ushort16 AAA = 0; )==""\n"
85R"==(#else )==""\n"
86R"==(ushort AAA; )==""\n"
87R"==(#endif )==""\n"
88R"==(ushort16 AA = 0; )==""\n"
89R"==(#if OW % (SW * OW_BLOCK) + KW - 1 \ )==""\n"
90R"==(>= SW - 1 + PW )==""\n"
91R"==(/* get main block */ )==""\n"
92R"==(if (iw + SW * (OW_BLOCK) + KW - 1 > IW) { )==""\n"
93R"==(if (iw >= 0) { )==""\n"
94R"==(AA.s0 = intel_sub_group_block_read_us( )==""\n"
95R"==((const __global ushort *)(&src[0 * IC_BLOCK])); )==""\n"
96R"==(#if PW >= 2 )==""\n"
97R"==(AA.s1 = intel_sub_group_block_read_us( )==""\n"
98R"==((const __global ushort *)(&src[1 * IC_BLOCK])); )==""\n"
99R"==(#endif )==""\n"
100R"==(#if PW >= 3 )==""\n"
101R"==(AA.s2 = intel_sub_group_block_read_us( )==""\n"
102R"==((const __global ushort *)(&src[2 * IC_BLOCK])); )==""\n"
103R"==(#endif )==""\n"
104R"==(} )==""\n"
105R"==(#if IW_TAIL > 16 )==""\n"
106R"==(#if PW == 2 )==""\n"
107R"==(AA.s23456789 = intel_sub_group_block_read_us8( )==""\n"
108R"==((const __global ushort *)(&src[PWX * IC_BLOCK])); )==""\n"
109R"==(AA.sabcd = intel_sub_group_block_read_us4( )==""\n"
110R"==((const __global ushort *)(&src[(PWX + 8) * IC_BLOCK])); )==""\n"
111R"==(AA.sef = intel_sub_group_block_read_us2( )==""\n"
112R"==((const __global ushort *)(&src[(PWX + 12) * IC_BLOCK])); )==""\n"
113R"==(#elif PW == 3 )==""\n"
114R"==(AA.s3456789a = intel_sub_group_block_read_us8( )==""\n"
115R"==((const __global ushort *)(&src[PWX * IC_BLOCK])); )==""\n"
116R"==(AA.sbcde = intel_sub_group_block_read_us4( )==""\n"
117R"==((const __global ushort *)(&src[(PWX + 8) * IC_BLOCK])); )==""\n"
118R"==(AA.sf = intel_sub_group_block_read_us( )==""\n"
119R"==((const __global ushort *)(&src[(PWX + 12) * IC_BLOCK])); )==""\n"
120R"==(#else )==""\n"
121R"==(AA.s12345678 = intel_sub_group_block_read_us8( )==""\n"
122R"==((const __global ushort *)(&src[PWX * IC_BLOCK])); )==""\n"
123R"==(AA.s9abc = intel_sub_group_block_read_us4( )==""\n"
124R"==((const __global ushort *)(&src[(PWX + 8) * IC_BLOCK])); )==""\n"
125R"==(AA.sde = intel_sub_group_block_read_us2( )==""\n"
126R"==((const __global ushort *)(&src[(PWX + 12) * IC_BLOCK])); )==""\n"
127R"==(AA.sf = intel_sub_group_block_read_us( )==""\n"
128R"==((const __global ushort *)(&src[(PWX + 14) * IC_BLOCK])); )==""\n"
129R"==(#endif )==""\n"
130R"==(#if ((IW_TAIL - 16) & 0b1000) == 0b1000 )==""\n"
131R"==(AAA.s01234567 = intel_sub_group_block_read_us8( )==""\n"
132R"==((const __global ushort *)(&src[(16) * IC_BLOCK])); )==""\n"
133R"==(#endif )==""\n"
134R"==(#if ((IW_TAIL - 16) & 0b100) == 0b100 )==""\n"
135R"==(*((ushort4 *)(((ushort *)&AAA) + ((IW_TAIL - 16) & 0b1000))) )==""\n"
136R"==(= intel_sub_group_block_read_us4((const __global ushort )==""\n"
137R"==(*)(&src[(16 + ((IW_TAIL - 16) & 0b1000)) )==""\n"
138R"==(* IC_BLOCK])); )==""\n"
139R"==(#endif )==""\n"
140R"==(#if ((IW_TAIL - 16) & 0b10) == 0b10 )==""\n"
141R"==(*((ushort2 *)(((ushort *)&AAA) + ((IW_TAIL - 16) & 0b1100))) )==""\n"
142R"==(= intel_sub_group_block_read_us2((const __global ushort )==""\n"
143R"==(*)(&src[(16 + ((IW_TAIL - 16) & 0b1100)) )==""\n"
144R"==(* IC_BLOCK])); )==""\n"
145R"==(#endif )==""\n"
146R"==(#if ((IW_TAIL - 16) & 0b1) == 0b1 )==""\n"
147R"==(*(((ushort *)&AAA) + ((IW_TAIL - 16) & 0b1110)) )==""\n"
148R"==(= intel_sub_group_block_read_us((const __global ushort )==""\n"
149R"==(*)(&src[(16 + ((IW_TAIL - 16) & 0b1110)) )==""\n"
150R"==(* IC_BLOCK])); )==""\n"
151R"==(#endif )==""\n"
152R"==(#else )==""\n"
153R"==(#if ((IW_TAIL - PWX) & 0b1000) == 0b1000 )==""\n"
154R"==(*((ushort8 *)(((ushort *)&AA) + PWX)) )==""\n"
155R"==(= intel_sub_group_block_read_us8(( )==""\n"
156R"==(const __global ushort *)(&src[PWX * IC_BLOCK])); )==""\n"
157R"==(#endif )==""\n"
158R"==(#if ((IW_TAIL - PWX) & 0b100) == 0b100 )==""\n"
159R"==(*((ushort4 *)(((ushort *)&AA) + PWX )==""\n"
160R"==(+ ((IW_TAIL - PWX) & 0b1000))) )==""\n"
161R"==(= intel_sub_group_block_read_us4((const __global ushort )==""\n"
162R"==(*)(&src[(PWX )==""\n"
163R"==(+ ((IW_TAIL - PWX) )==""\n"
164R"==(& 0b1000)) )==""\n"
165R"==(* IC_BLOCK])); )==""\n"
166R"==(#endif )==""\n"
167R"==(#if ((IW_TAIL - PWX) & 0b10) == 0b10 )==""\n"
168R"==(*((ushort2 *)(((ushort *)&AA) + PWX )==""\n"
169R"==(+ ((IW_TAIL - PWX) & 0b1100))) )==""\n"
170R"==(= intel_sub_group_block_read_us2((const __global ushort )==""\n"
171R"==(*)(&src[(PWX )==""\n"
172R"==(+ ((IW_TAIL - PWX) )==""\n"
173R"==(& 0b1100)) )==""\n"
174R"==(* IC_BLOCK])); )==""\n"
175R"==(#endif )==""\n"
176R"==(#if ((IW_TAIL - PWX) & 0b1) == 0b1 )==""\n"
177R"==(*(((ushort *)&AA) + PWX + ((IW_TAIL - PWX) & 0b1110)) )==""\n"
178R"==(= intel_sub_group_block_read_us((const __global ushort )==""\n"
179R"==(*)(&src[(PWX )==""\n"
180R"==(+ ((IW_TAIL - PWX) )==""\n"
181R"==(& 0b1110)) )==""\n"
182R"==(* IC_BLOCK])); )==""\n"
183R"==(#endif )==""\n"
184R"==(#endif )==""\n"
185R"==(} else { )==""\n"
186R"==(#endif )==""\n"
187R"==(#if SW == 1 )==""\n"
188R"==(#define READ_BLOCK (OW_BLOCK + KW - 1 - PWX) )==""\n"
189R"==(if (iw >= 0) { )==""\n"
190R"==(AA.s0 = intel_sub_group_block_read_us( )==""\n"
191R"==((const __global ushort *)(&src[0 * IC_BLOCK])); )==""\n"
192R"==(#if PW >= 2 )==""\n"
193R"==(AA.s1 = intel_sub_group_block_read_us( )==""\n"
194R"==((const __global ushort *)(&src[1 * IC_BLOCK])); )==""\n"
195R"==(#endif )==""\n"
196R"==(#if PW >= 3 )==""\n"
197R"==(AA.s2 = intel_sub_group_block_read_us( )==""\n"
198R"==((const __global ushort *)(&src[2 * IC_BLOCK])); )==""\n"
199R"==(#endif )==""\n"
200R"==(} )==""\n"
201R"==(#if (READ_BLOCK & 0b1000) == 0b1000 )==""\n"
202R"==(*((ushort8 *)(((ushort *)&AA) + PWX)) )==""\n"
203R"==(= intel_sub_group_block_read_us8(( )==""\n"
204R"==(const __global ushort *)(&src[(PWX)*IC_BLOCK])); )==""\n"
205R"==(#endif )==""\n"
206R"==(#if (READ_BLOCK & 0b100) == 0b100 )==""\n"
207R"==(*((ushort4 *)(((ushort *)&AA) + PWX + (READ_BLOCK & 0b1000))) )==""\n"
208R"==(= intel_sub_group_block_read_us4((const __global ushort )==""\n"
209R"==(*)(&src[(PWX + (READ_BLOCK & 0b1000)) )==""\n"
210R"==(* IC_BLOCK])); )==""\n"
211R"==(#endif )==""\n"
212R"==(#if (READ_BLOCK & 0b10) == 0b10 )==""\n"
213R"==(*((ushort2 *)(((ushort *)&AA) + PWX + (READ_BLOCK & 0b1100))) )==""\n"
214R"==(= intel_sub_group_block_read_us2((const __global ushort )==""\n"
215R"==(*)(&src[(PWX + (READ_BLOCK & 0b1100)) )==""\n"
216R"==(* IC_BLOCK])); )==""\n"
217R"==(#endif )==""\n"
218R"==(#if (READ_BLOCK & 0b1) == 0b1 )==""\n"
219R"==(*(((ushort *)&AA) + PWX + (READ_BLOCK & 0b1110)) )==""\n"
220R"==(= intel_sub_group_block_read_us((const __global ushort )==""\n"
221R"==(*)(&src[(PWX + (READ_BLOCK & 0b1110)) )==""\n"
222R"==(* IC_BLOCK])); )==""\n"
223R"==(#endif )==""\n"
224R"==(#elif SW == 2 )==""\n"
225R"==(#if OW_BLOCK + KW - 1 >= 8 )==""\n"
226R"==(#define READ_BLOCK (2 * (OW_BLOCK) + KW - 1) )==""\n"
227R"==(if (iw >= 0) { )==""\n"
228R"==(AA = read_block16( )==""\n"
229R"==((const __global ushort *)(&src[0 * IC_BLOCK])); )==""\n"
230R"==(} else { )==""\n"
231R"==(#if PW == 0 )==""\n"
232R"==(AA = read_block16( )==""\n"
233R"==((const __global ushort *)(&src[0 * IC_BLOCK])); )==""\n"
234R"==(#elif PW == 2 )==""\n"
235R"==(AA.s23456789 = intel_sub_group_block_read_us8( )==""\n"
236R"==((const __global ushort *)(&src[PW * IC_BLOCK])); )==""\n"
237R"==(AA.sabcd = intel_sub_group_block_read_us4( )==""\n"
238R"==((const __global ushort *)(&src[(PW + 8) * IC_BLOCK])); )==""\n"
239R"==(AA.sef = intel_sub_group_block_read_us2( )==""\n"
240R"==((const __global ushort *)(&src[(PW + 12) * IC_BLOCK])); )==""\n"
241R"==(#elif PW == 3 )==""\n"
242R"==(AA.s3456789a = intel_sub_group_block_read_us8( )==""\n"
243R"==((const __global ushort *)(&src[PW * IC_BLOCK])); )==""\n"
244R"==(AA.sbcde = intel_sub_group_block_read_us4( )==""\n"
245R"==((const __global ushort *)(&src[(PW + 8) * IC_BLOCK])); )==""\n"
246R"==(AA.sf = intel_sub_group_block_read_us( )==""\n"
247R"==((const __global ushort *)(&src[(PW + 12) * IC_BLOCK])); )==""\n"
248R"==(#else )==""\n"
249R"==(AA.s12345678 = intel_sub_group_block_read_us8( )==""\n"
250R"==((const __global ushort *)(&src[PW * IC_BLOCK])); )==""\n"
251R"==(AA.s9abc = intel_sub_group_block_read_us4( )==""\n"
252R"==((const __global ushort *)(&src[(PW + 8) * IC_BLOCK])); )==""\n"
253R"==(AA.sde = intel_sub_group_block_read_us2( )==""\n"
254R"==((const __global ushort *)(&src[(PW + 12) * IC_BLOCK])); )==""\n"
255R"==(AA.sf = intel_sub_group_block_read_us( )==""\n"
256R"==((const __global ushort *)(&src[(PW + 14) * IC_BLOCK])); )==""\n"
257R"==(#endif )==""\n"
258R"==(} )==""\n"
259R"==(#if ((READ_BLOCK - 16) & 0b1000) == 0b1000 )==""\n"
260R"==(AAA.s01234567 = intel_sub_group_block_read_us8( )==""\n"
261R"==((const __global ushort *)(&src[(16) * IC_BLOCK])); )==""\n"
262R"==(#endif )==""\n"
263R"==(#if ((READ_BLOCK - 16) & 0b100) == 0b100 )==""\n"
264R"==(*((ushort4 *)(((ushort *)&AAA) + ((READ_BLOCK - 16) & 0b1000))) )==""\n"
265R"==(= intel_sub_group_block_read_us4((const __global ushort )==""\n"
266R"==(*)(&src[(16 + ((READ_BLOCK - 16) & 0b1000)) )==""\n"
267R"==(* IC_BLOCK])); )==""\n"
268R"==(#endif )==""\n"
269R"==(#if ((READ_BLOCK - 16) & 0b10) == 0b10 )==""\n"
270R"==(*((ushort2 *)(((ushort *)&AAA) + ((READ_BLOCK - 16) & 0b1100))) )==""\n"
271R"==(= intel_sub_group_block_read_us2((const __global ushort )==""\n"
272R"==(*)(&src[(16 + ((READ_BLOCK - 16) & 0b1100)) )==""\n"
273R"==(* IC_BLOCK])); )==""\n"
274R"==(#endif )==""\n"
275R"==(#if ((READ_BLOCK - 16) & 0b1) == 0b1 )==""\n"
276R"==(*(((ushort *)&AAA) + ((READ_BLOCK - 16) & 0b1110)) )==""\n"
277R"==(= intel_sub_group_block_read_us((const __global ushort )==""\n"
278R"==(*)(&src[(16 + ((READ_BLOCK - 16) & 0b1110)) )==""\n"
279R"==(* IC_BLOCK])); )==""\n"
280R"==(#endif )==""\n"
281R"==(#else )==""\n"
282R"==(#define READ_BLOCK (2 * (OW_BLOCK) + KW - 1 - PWX) )==""\n"
283R"==(if (iw >= 0) { )==""\n"
284R"==(AA.s0 = intel_sub_group_block_read_us( )==""\n"
285R"==((const __global ushort *)(&src[0 * IC_BLOCK])); )==""\n"
286R"==(#if PW >= 2 )==""\n"
287R"==(AA.s1 = intel_sub_group_block_read_us( )==""\n"
288R"==((const __global ushort *)(&src[1 * IC_BLOCK])); )==""\n"
289R"==(#endif )==""\n"
290R"==(#if PW >= 3 )==""\n"
291R"==(AA.s2 = intel_sub_group_block_read_us( )==""\n"
292R"==((const __global ushort *)(&src[2 * IC_BLOCK])); )==""\n"
293R"==(#endif )==""\n"
294R"==(} )==""\n"
295R"==(#if (READ_BLOCK & 0b1000) == 0b1000 )==""\n"
296R"==(*((ushort8 *)(((ushort *)&AA) + PWX)) )==""\n"
297R"==(= intel_sub_group_block_read_us8( )==""\n"
298R"==((const __global ushort *)(&src[(PWX)*IC_BLOCK])); )==""\n"
299R"==(#endif )==""\n"
300R"==(#if (READ_BLOCK & 0b100) == 0b100 )==""\n"
301R"==(*((ushort4 *)(((ushort *)&AA) + PWX + (READ_BLOCK & 0b1000))) )==""\n"
302R"==(= intel_sub_group_block_read_us4((const __global ushort )==""\n"
303R"==(*)(&src[(PWX + (READ_BLOCK & 0b1000)) )==""\n"
304R"==(* IC_BLOCK])); )==""\n"
305R"==(#endif )==""\n"
306R"==(#if (READ_BLOCK & 0b10) == 0b10 )==""\n"
307R"==(*((ushort2 *)(((ushort *)&AA) + PWX + (READ_BLOCK & 0b1100))) )==""\n"
308R"==(= intel_sub_group_block_read_us2((const __global ushort )==""\n"
309R"==(*)(&src[(PWX + (READ_BLOCK & 0b1100)) )==""\n"
310R"==(* IC_BLOCK])); )==""\n"
311R"==(#endif )==""\n"
312R"==(#if (READ_BLOCK & 0b1) == 0b1 )==""\n"
313R"==(*(((ushort *)&AA) + PWX + (READ_BLOCK & 0b1110)) )==""\n"
314R"==(= intel_sub_group_block_read_us((const __global ushort )==""\n"
315R"==(*)(&src[(PWX + (READ_BLOCK & 0b1110)) )==""\n"
316R"==(* IC_BLOCK])); )==""\n"
317R"==(#endif )==""\n"
318R"==(#endif )==""\n"
319R"==(#endif )==""\n"
320R"==(#if OW % (SW * OW_BLOCK) + KW - 1 >= SW - 1 + PW )==""\n"
321R"==(} )==""\n"
322R"==(#endif )==""\n"
323R"==(#if OW > OWX )==""\n"
324R"==(if (iw + READ_BLOCK > IW) { )==""\n"
325R"==(if (iw < IW) { )==""\n"
326R"==(for (int i = IW - iw; i < READ_BLOCK; i++) { )==""\n"
327R"==(if (i < 16) { )==""\n"
328R"==(AA[i] = 0; )==""\n"
329R"==(} )==""\n"
330R"==(#if SW == 2 )==""\n"
331R"==(else { )==""\n"
332R"==(AAA[i - 16] = 0; )==""\n"
333R"==(} )==""\n"
334R"==(#endif )==""\n"
335R"==(} )==""\n"
336R"==(} else { )==""\n"
337R"==(AA = 0; )==""\n"
338R"==(#if SW == 2 )==""\n"
339R"==(AAA = 0; )==""\n"
340R"==(#endif )==""\n"
341R"==(} )==""\n"
342R"==(} )==""\n"
343R"==(#endif )==""\n"
344R"==(ushort4 WW = 0; )==""\n"
345R"==(#if KW == 4 )==""\n"
346R"==(WW = intel_sub_group_block_read_us4((const __global ushort *)wei); )==""\n"
347R"==(#endif )==""\n"
348R"==(#if KW == 3 )==""\n"
349R"==(WW.s01 = intel_sub_group_block_read_us2( )==""\n"
350R"==((const __global ushort *)wei); )==""\n"
351R"==(WW.s2 = intel_sub_group_block_read_us( )==""\n"
352R"==((const __global ushort *)wei + OC_BLOCK); )==""\n"
353R"==(#endif )==""\n"
354R"==(#if KW == 2 )==""\n"
355R"==(WW.s01 = intel_sub_group_block_read_us2( )==""\n"
356R"==((const __global ushort *)wei); )==""\n"
357R"==(#endif )==""\n"
358R"==(#if KW == 1 )==""\n"
359R"==(WW.s0 = intel_sub_group_block_read_us((const __global ushort *)wei); )==""\n"
360R"==(#endif )==""\n"
361R"==(SRC_DATA16_T A0 = 0, A1 = 0; )==""\n"
362R"==(A0.s01234567 = AS_SRC_DATA16_T(AA.s01234567).s02468ace; )==""\n"
363R"==(A0.s89abcdef = AS_SRC_DATA16_T(AA.s89abcdef).s02468ace; )==""\n"
364R"==(A1.s01234567 = AS_SRC_DATA16_T(AA.s01234567).s13579bdf; )==""\n"
365R"==(A1.s89abcdef = AS_SRC_DATA16_T(AA.s89abcdef).s13579bdf; )==""\n"
366R"==(#if SW == 2 )==""\n"
367R"==(SRC_DATA16_T right0, right1; )==""\n"
368R"==(right0.s01234567 = AS_SRC_DATA16_T(AAA.s01234567).s02468ace; )==""\n"
369R"==(right0.s89abcdef = AS_SRC_DATA16_T(AAA.s89abcdef).s02468ace; )==""\n"
370R"==(right1.s01234567 = AS_SRC_DATA16_T(AAA.s01234567).s13579bdf; )==""\n"
371R"==(right1.s89abcdef = AS_SRC_DATA16_T(AAA.s89abcdef).s13579bdf; )==""\n"
372R"==(#else )==""\n"
373R"==(#if OW_BLOCK >= 14 )==""\n"
374R"==(SRC_DATA_T right0, right1; )==""\n"
375R"==(right0 = AS_SRC_DATA2_T(AAA).s0; )==""\n"
376R"==(right1 = AS_SRC_DATA2_T(AAA).s1; )==""\n"
377R"==(#endif )==""\n"
378R"==(#endif )==""\n"
379R"==(char8 W = as_char8(WW); )==""\n"
380R"==(#if SW == 1 )==""\n"
381R"==(S0.s0 = idot4(A0.s0123, W.s0246, S0.s0); )==""\n"
382R"==(#if OW_BLOCK >= 2 )==""\n"
383R"==(S0.s2 = idot4(A0.s1234, W.s0246, S0.s2); )==""\n"
384R"==(#endif )==""\n"
385R"==(#if OW_BLOCK >= 3 )==""\n"
386R"==(S0.s4 = idot4(A0.s2345, W.s0246, S0.s4); )==""\n"
387R"==(#endif )==""\n"
388R"==(#if OW_BLOCK >= 4 )==""\n"
389R"==(S0.s6 = idot4(A0.s3456, W.s0246, S0.s6); )==""\n"
390R"==(#endif )==""\n"
391R"==(#if OW_BLOCK >= 5 )==""\n"
392R"==(S0.s8 = idot4(A0.s4567, W.s0246, S0.s8); )==""\n"
393R"==(#endif )==""\n"
394R"==(#if OC_BLOCK >= 6 )==""\n"
395R"==(S0.sa = idot4(A0.s5678, W.s0246, S0.sa); )==""\n"
396R"==(#endif )==""\n"
397R"==(#if OW_BLOCK >= 7 )==""\n"
398R"==(S0.sc = idot4(A0.s6789, W.s0246, S0.sc); )==""\n"
399R"==(#endif )==""\n"
400R"==(#if OW_BLOCK >= 8 )==""\n"
401R"==(S0.se = idot4(A0.s789a, W.s0246, S0.se); )==""\n"
402R"==(#endif )==""\n"
403R"==(#if OW_BLOCK >= 9 )==""\n"
404R"==(S1.s0 = idot4(A0.s89ab, W.s0246, S1.s0); )==""\n"
405R"==(#endif )==""\n"
406R"==(#if OW_BLOCK >= 10 )==""\n"
407R"==(S1.s2 = idot4(A0.s9abc, W.s0246, S1.s2); )==""\n"
408R"==(#endif )==""\n"
409R"==(#if OW_BLOCK >= 11 )==""\n"
410R"==(S1.s4 = idot4(A0.sabcd, W.s0246, S1.s4); )==""\n"
411R"==(#endif )==""\n"
412R"==(#if OW_BLOCK >= 12 )==""\n"
413R"==(S1.s6 = idot4(A0.sbcde, W.s0246, S1.s6); )==""\n"
414R"==(#endif )==""\n"
415R"==(#if OW_BLOCK >= 13 )==""\n"
416R"==(S1.s8 = idot4(A0.scdef, W.s0246, S1.s8); )==""\n"
417R"==(#endif )==""\n"
418R"==(#if OW_BLOCK >= 14 )==""\n"
419R"==(S1.sa = idot4((SRC_DATA4_T)(A0.sde, A0.sf, right0), W.s0246, S1.sa); )==""\n"
420R"==(#endif )==""\n"
421R"==(#if OW_BLOCK >= 15 )==""\n"
422R"==(S1.sc = idot4( )==""\n"
423R"==((SRC_DATA4_T)(A0.sef, right0, right1), W.s0246, S1.sc); )==""\n"
424R"==(#endif )==""\n"
425R"==(S0.s1 = idot4(A1.s0123, W.s1357, S0.s1); )==""\n"
426R"==(#if OW_BLOCK >= 2 )==""\n"
427R"==(S0.s3 = idot4(A1.s1234, W.s1357, S0.s3); )==""\n"
428R"==(#endif )==""\n"
429R"==(#if OW_BLOCK >= 3 )==""\n"
430R"==(S0.s5 = idot4(A1.s2345, W.s1357, S0.s5); )==""\n"
431R"==(#endif )==""\n"
432R"==(#if OW_BLOCK >= 4 )==""\n"
433R"==(S0.s7 = idot4(A1.s3456, W.s1357, S0.s7); )==""\n"
434R"==(#endif )==""\n"
435R"==(#if OW_BLOCK >= 5 )==""\n"
436R"==(S0.s9 = idot4(A1.s4567, W.s1357, S0.s9); )==""\n"
437R"==(#endif )==""\n"
438R"==(#if OW_BLOCK >= 6 )==""\n"
439R"==(S0.sb = idot4(A1.s5678, W.s1357, S0.sb); )==""\n"
440R"==(#endif )==""\n"
441R"==(#if OW_BLOCK >= 7 )==""\n"
442R"==(S0.sd = idot4(A1.s6789, W.s1357, S0.sd); )==""\n"
443R"==(#endif )==""\n"
444R"==(#if OW_BLOCK >= 8 )==""\n"
445R"==(S0.sf = idot4(A1.s789a, W.s1357, S0.sf); )==""\n"
446R"==(#endif )==""\n"
447R"==(#if OW_BLOCK >= 9 )==""\n"
448R"==(S1.s1 = idot4(A1.s89ab, W.s1357, S1.s1); )==""\n"
449R"==(#endif )==""\n"
450R"==(#if OW_BLOCK >= 10 )==""\n"
451R"==(S1.s3 = idot4(A1.s9abc, W.s1357, S1.s3); )==""\n"
452R"==(#endif )==""\n"
453R"==(#if OW_BLOCK >= 11 )==""\n"
454R"==(S1.s5 = idot4(A1.sabcd, W.s1357, S1.s5); )==""\n"
455R"==(#endif )==""\n"
456R"==(#if OW_BLOCK >= 12 )==""\n"
457R"==(S1.s7 = idot4(A1.sbcde, W.s1357, S1.s7); )==""\n"
458R"==(#endif )==""\n"
459R"==(#if OW_BLOCK >= 13 )==""\n"
460R"==(S1.s9 = idot4(A1.scdef, W.s1357, S1.s9); )==""\n"
461R"==(#endif )==""\n"
462R"==(#if OW_BLOCK >= 14 )==""\n"
463R"==(S1.sb = idot4((SRC_DATA4_T)(A1.sde, A1.sf, 0), W.s1357, S1.sb); )==""\n"
464R"==(#endif )==""\n"
465R"==(#if OW_BLOCK >= 15 )==""\n"
466R"==(S1.sd = idot4((SRC_DATA4_T)(A1.sef, right1, 0), W.s1357, S1.sd); )==""\n"
467R"==(#endif )==""\n"
468R"==(#elif SW == 2 )==""\n"
469R"==(S0.s0 = idot4(A0.s0123, W.s0246, S0.s0); )==""\n"
470R"==(#if OW_BLOCK >= 2 )==""\n"
471R"==(S0.s2 = idot4(A0.s2345, W.s0246, S0.s2); )==""\n"
472R"==(#endif )==""\n"
473R"==(#if OW_BLOCK >= 3 )==""\n"
474R"==(S0.s4 = idot4(A0.s4567, W.s0246, S0.s4); )==""\n"
475R"==(#endif )==""\n"
476R"==(#if OW_BLOCK >= 4 )==""\n"
477R"==(S0.s6 = idot4(A0.s6789, W.s0246, S0.s6); )==""\n"
478R"==(#endif )==""\n"
479R"==(#if OW_BLOCK >= 5 )==""\n"
480R"==(S0.s8 = idot4(A0.s89ab, W.s0246, S0.s8); )==""\n"
481R"==(#endif )==""\n"
482R"==(#if OW_BLOCK >= 6 )==""\n"
483R"==(S0.sa = idot4(A0.sabcd, W.s0246, S0.sa); )==""\n"
484R"==(#endif )==""\n"
485R"==(#if OW_BLOCK >= 7 )==""\n"
486R"==(S0.sc = idot4(A0.scdef, W.s0246, S0.sc); )==""\n"
487R"==(#endif )==""\n"
488R"==(#if OW_BLOCK >= 8 )==""\n"
489R"==(S0.se = idot4((SRC_DATA4_T)(A0.sef, right0.s0, right0.s1), W.s0246, )==""\n"
490R"==(S0.se); )==""\n"
491R"==(#endif )==""\n"
492R"==(#if OW_BLOCK >= 9 )==""\n"
493R"==(S1.s0 = idot4(right0.s0123, W.s0246, S1.s0); )==""\n"
494R"==(#endif )==""\n"
495R"==(#if OW_BLOCK >= 10 )==""\n"
496R"==(S1.s2 = idot4(right0.s2345, W.s0246, S1.s2); )==""\n"
497R"==(#endif )==""\n"
498R"==(#if OW_BLOCK >= 11 )==""\n"
499R"==(S1.s4 = idot4(right0.s4567, W.s0246, S1.s4); )==""\n"
500R"==(#endif )==""\n"
501R"==(#if OW_BLOCK >= 12 )==""\n"
502R"==(S1.s6 = idot4(right0.s6789, W.s0246, S1.s6); )==""\n"
503R"==(#endif )==""\n"
504R"==(#if OW_BLOCK >= 13 )==""\n"
505R"==(S1.s8 = idot4(right0.s89ab, W.s0246, S1.s8); )==""\n"
506R"==(#endif )==""\n"
507R"==(#if OW_BLOCK >= 14 )==""\n"
508R"==(S1.sa = idot4(right0.sabcd, W.s0246, S1.sa); )==""\n"
509R"==(#endif )==""\n"
510R"==(#if OW_BLOCK >= 15 )==""\n"
511R"==(S1.sc = idot4(right0.scdef, W.s0246, S1.sc); )==""\n"
512R"==(#endif )==""\n"
513R"==(S0.s1 = idot4(A1.s0123, W.s1357, S0.s1); )==""\n"
514R"==(#if OW_BLOCK >= 2 )==""\n"
515R"==(S0.s3 = idot4(A1.s2345, W.s1357, S0.s3); )==""\n"
516R"==(#endif )==""\n"
517R"==(#if OW_BLOCK >= 3 )==""\n"
518R"==(S0.s5 = idot4(A1.s4567, W.s1357, S0.s5); )==""\n"
519R"==(#endif )==""\n"
520R"==(#if OW_BLOCK >= 4 )==""\n"
521R"==(S0.s7 = idot4(A1.s6789, W.s1357, S0.s7); )==""\n"
522R"==(#endif )==""\n"
523R"==(#if OW_BLOCK >= 5 )==""\n"
524R"==(S0.s9 = idot4(A1.s89ab, W.s1357, S0.s9); )==""\n"
525R"==(#endif )==""\n"
526R"==(#if OW_BLOCK >= 6 )==""\n"
527R"==(S0.sb = idot4(A1.sabcd, W.s1357, S0.sb); )==""\n"
528R"==(#endif )==""\n"
529R"==(#if OW_BLOCK >= 7 )==""\n"
530R"==(S0.sd = idot4(A1.scdef, W.s1357, S0.sd); )==""\n"
531R"==(#endif )==""\n"
532R"==(#if OW_BLOCK >= 8 )==""\n"
533R"==(S0.sf = idot4((SRC_DATA4_T)(A1.sef, right1.s0, right1.s1), W.s1357, )==""\n"
534R"==(S0.sf); )==""\n"
535R"==(#endif )==""\n"
536R"==(#if OW_BLOCK >= 9 )==""\n"
537R"==(S1.s1 = idot4(right1.s0123, W.s1357, S1.s1); )==""\n"
538R"==(#endif )==""\n"
539R"==(#if OW_BLOCK >= 10 )==""\n"
540R"==(S1.s3 = idot4(right1.s2345, W.s1357, S1.s3); )==""\n"
541R"==(#endif )==""\n"
542R"==(#if OW_BLOCK >= 11 )==""\n"
543R"==(S1.s5 = idot4(right1.s4567, W.s1357, S1.s5); )==""\n"
544R"==(#endif )==""\n"
545R"==(#if OW_BLOCK >= 12 )==""\n"
546R"==(S1.s7 = idot4(right1.s6789, W.s1357, S1.s7); )==""\n"
547R"==(#endif )==""\n"
548R"==(#if OW_BLOCK >= 13 )==""\n"
549R"==(S1.s9 = idot4(right1.s89ab, W.s1357, S1.s9); )==""\n"
550R"==(#endif )==""\n"
551R"==(#if OW_BLOCK >= 14 )==""\n"
552R"==(S1.sb = idot4(right1.sabcd, W.s1357, S1.sb); )==""\n"
553R"==(#endif )==""\n"
554R"==(#if OW_BLOCK >= 15 )==""\n"
555R"==(S1.sd = idot4(right1.scdef, W.s1357, S1.sd); )==""\n"
556R"==(#endif )==""\n"
557R"==(#else )==""\n"
558R"==(#error )==""\n"
559R"==(#endif )==""\n"
560R"==(src += IC_BLOCK * MB_BLOCK * IW * (1 + DH); )==""\n"
561R"==(wei += OC_BLOCK * KW; )==""\n"
562R"==(} )==""\n"
563R"==(src += IC_BLOCK * MB_BLOCK * IW * (IH * (1 + DD) - KH * (1 + DH)); )==""\n"
564R"==(} )==""\n"
565R"==(#if WITH_BIAS || (WITH_POST_OP && !SUM_SCALE1 || SCALES_PER_OC || SCALES_COMMON) )==""\n"
566R"==(float16 tmp00 = convert_float16(S0); )==""\n"
567R"==(float16 tmp01 = convert_float16(S1); )==""\n"
568R"==(#if WITH_BIAS )==""\n"
569R"==(float2 B; )==""\n"
570R"==(int g_off = g + 2 * get_sub_group_local_id(); )==""\n"
571R"==(B.s0 = g_off >= G ? 0 : bias[g_off]; )==""\n"
572R"==(B.s1 = g_off + 1 >= G ? 0 : bias[g_off + 1]; )==""\n"
573R"==(tmp00 += convert_float16(B.s0101010101010101); )==""\n"
574R"==(tmp01 += convert_float16(B.s0101010101010101); )==""\n"
575R"==(#endif )==""\n"
576R"==(tmp00 *= SCALE; )==""\n"
577R"==(tmp01 *= SCALE; )==""\n"
578R"==(#define ACC_DATA_TYPE float )==""\n"
579R"==(#define ACC0 tmp00 )==""\n"
580R"==(#define ACC1 tmp01 )==""\n"
581R"==(#else )==""\n"
582R"==(#define ACC_DATA_TYPE int )==""\n"
583R"==(#define ACC0 S0 )==""\n"
584R"==(#define ACC1 S1 )==""\n"
585R"==(#endif )==""\n"
586R"==(DST_DATA16_T D0 = 0; )==""\n"
587R"==(DST_DATA16_T D1 = 0; )==""\n"
588R"==(#if WITH_SUM )==""\n"
589R"==(if (OW_TAIL != 0 && ow + OW_BLOCK >= OW) { )==""\n"
590R"==(block_read_dst(min(8, OW_TAIL), &D0, dst); )==""\n"
591R"==(block_read_dst(OW_TAIL - 8, &D1, dst + 8 * OC_BLOCK); )==""\n"
592R"==(} else { )==""\n"
593R"==(block_read_dst(min(8, OW_BLOCK), &D0, dst); )==""\n"
594R"==(block_read_dst(OW_BLOCK - 8, &D1, dst + 8 * OC_BLOCK); )==""\n"
595R"==(} )==""\n"
596R"==(#endif )==""\n"
597R"==(SUM_DATA16_T D0_sdt = AS_SUM_DATA16_T(D0); )==""\n"
598R"==(SUM_DATA16_T D1_sdt = AS_SUM_DATA16_T(D1); )==""\n"
599R"==(#define APPLY_POST_OPS_COMMON(accumulator, sum, offset) \ )==""\n"
600R"==({ \ )==""\n"
601R"==(/*This kernel is using serial post op processing due to non-trivial \ )==""\n"
602R"==(data ordering in accumulator. */ \ )==""\n"
603R"==(for (int didx = 0; didx < 16; ++didx) { \ )==""\n"
604R"==(int po_mb = mb; \ )==""\n"
605R"==(int po_oc = g * OC + 2 * get_sub_group_local_id() + (didx % 2); \ )==""\n"
606R"==(ACC_DATA_TYPE accum = accumulator[didx]; \ )==""\n"
607R"==(SUM_DATA_T sum_di = sum[didx]; \ )==""\n"
608R"==(APPLY_POST_OPS_SERIAL_BINARY_2D(accum, ACC_DATA_TYPE, sum_di, \ )==""\n"
609R"==(SUM_DATA_T, po_mb, 1, po_oc, 1); \ )==""\n"
610R"==(accumulator[didx] = accum; \ )==""\n"
611R"==(} \ )==""\n"
612R"==(} )==""\n"
613R"==(APPLY_POST_OPS_COMMON(ACC0, D0_sdt, 0); )==""\n"
614R"==(APPLY_POST_OPS_COMMON(ACC1, D1_sdt, 8); )==""\n"
615R"==(DST_DATA16_T R0 = CONVERT_DST_DATA16_T(ACC0); )==""\n"
616R"==(DST_DATA16_T R1 = CONVERT_DST_DATA16_T(ACC1); )==""\n"
617R"==(if (OW_TAIL != 0 && ow + OW_BLOCK > OW) { )==""\n"
618R"==(block_write_dst(min(8, OW_TAIL), &R0, dst); )==""\n"
619R"==(block_write_dst(OW_TAIL - 8, &R1, dst + 8 * OC_BLOCK); )==""\n"
620R"==(} else { )==""\n"
621R"==(block_write_dst(min(8, OW_BLOCK), &R0, dst); )==""\n"
622R"==(block_write_dst(OW_BLOCK - 8, &R1, dst + 8 * OC_BLOCK); )==""\n"
623R"==(} )==""\n"
624R"==(} )==""\n"
625R"==(void block_read_dst(int n, DST_DATA_T *d, const __global DST_DATA_T *dst) { )==""\n"
626R"==(#if DST_DT_S8 || DST_DT_U8 )==""\n"
627R"==(__attribute__((opencl_unroll_hint)) )==""\n"
628R"==(for (int i = 0; i < n / 8 * 8; i += 8) { )==""\n"
629R"==(ushort8 block = intel_sub_group_block_read_us8( )==""\n"
630R"==((const __global ushort *)(dst + i * SUB_GROUP_SIZE * 2)); )==""\n"
631R"==(*(DST_DATA16_T *)(&d[i * 2]) = AS_DST_DATA16_T(block); )==""\n"
632R"==(} )==""\n"
633R"==(__attribute__((opencl_unroll_hint)) )==""\n"
634R"==(for (int i = n / 8 * 8; i < n; i++) { )==""\n"
635R"==(ushort block = intel_sub_group_block_read_us( )==""\n"
636R"==((const __global ushort *)(dst + i * SUB_GROUP_SIZE * 2)); )==""\n"
637R"==(*(DST_DATA2_T *)(&d[i * 2]) = AS_DST_DATA2_T(block); )==""\n"
638R"==(} )==""\n"
639R"==(return; )==""\n"
640R"==(#elif DST_DT_S32 || DST_DT_F32 )==""\n"
641R"==(int sglid = get_sub_group_local_id(); )==""\n"
642R"==(__attribute__((opencl_unroll_hint)) )==""\n"
643R"==(for (int i = 0; i < n; i++) { )==""\n"
644R"==(uint block0 = intel_sub_group_block_read( )==""\n"
645R"==((const __global uint *)(dst + i * SUB_GROUP_SIZE * 2)); )==""\n"
646R"==(uint block1 = intel_sub_group_block_read((const __global uint *)(dst )==""\n"
647R"==(+ i * SUB_GROUP_SIZE * 2 + SUB_GROUP_SIZE)); )==""\n"
648R"==(int from00 = min(2 * sglid, SUB_GROUP_SIZE - 1); )==""\n"
649R"==(int from01 = max(2 * sglid - 16, 0); )==""\n"
650R"==(int from10 = min(2 * sglid + 1, SUB_GROUP_SIZE - 1); )==""\n"
651R"==(int from11 = max(2 * sglid + 1 - 16, 0); )==""\n"
652R"==(uint block00 = intel_sub_group_shuffle(block0, from00); )==""\n"
653R"==(uint block01 = intel_sub_group_shuffle(block1, from01); )==""\n"
654R"==(uint block10 = intel_sub_group_shuffle(block0, from10); )==""\n"
655R"==(uint block11 = intel_sub_group_shuffle(block1, from11); )==""\n"
656R"==(block0 = (2 * sglid < SUB_GROUP_SIZE) ? block00 : block01; )==""\n"
657R"==(block1 = (2 * sglid + 1 < SUB_GROUP_SIZE) ? block10 : block11; )==""\n"
658R"==(*(DST_DATA2_T *)(&d[i * 2]) = AS_DST_DATA2_T((uint2)(block0, block1)); )==""\n"
659R"==(} )==""\n"
660R"==(return; )==""\n"
661R"==(#else )==""\n"
662R"==(#error "Not expected" )==""\n"
663R"==(#endif )==""\n"
664R"==(} )==""\n"
665R"==(void block_write_dst(int n, const DST_DATA_T *d, __global DST_DATA_T *dst) { )==""\n"
666R"==(#if DST_DT_S8 || DST_DT_U8 )==""\n"
667R"==(__attribute__((opencl_unroll_hint)) )==""\n"
668R"==(for (int i = 0; i < n / 8 * 8; i += 8) { )==""\n"
669R"==(intel_sub_group_block_write_us8( )==""\n"
670R"==((__global ushort *)(dst + i * SUB_GROUP_SIZE * 2), )==""\n"
671R"==(as_ushort8(*(DST_DATA16_T *)(&d[i * 2]))); )==""\n"
672R"==(} )==""\n"
673R"==(__attribute__((opencl_unroll_hint)) )==""\n"
674R"==(for (int i = n / 8 * 8; i < n; i++) { )==""\n"
675R"==(intel_sub_group_block_write_us( )==""\n"
676R"==((__global ushort *)(dst + i * SUB_GROUP_SIZE * 2), )==""\n"
677R"==(as_ushort(*(DST_DATA2_T *)(&d[i * 2]))); )==""\n"
678R"==(} )==""\n"
679R"==(return; )==""\n"
680R"==(#elif DST_DT_S32 || DST_DT_F32 )==""\n"
681R"==(int sglid = get_sub_group_local_id(); )==""\n"
682R"==(__attribute__((opencl_unroll_hint)) )==""\n"
683R"==(for (int i = 0; i < n; i++) { )==""\n"
684R"==(DST_DATA2_T block = AS_DST_DATA2_T(*(DST_DATA2_T *)(&d[i * 2])); )==""\n"
685R"==(dst[i * SUB_GROUP_SIZE * 2 + 2 * sglid] = block.S0; )==""\n"
686R"==(dst[i * SUB_GROUP_SIZE * 2 + 1 + 2 * sglid] = block.S1; )==""\n"
687R"==(} )==""\n"
688R"==(return; )==""\n"
689R"==(#else )==""\n"
690R"==(#error "Not expected" )==""\n"
691R"==(#endif )==""\n"
692R"==(} )==""\n"
693R"==()==";
694}
695}
696}
697}