1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *xe_lp_conv_bwd_data_x8s8x8_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2021 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
21R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
23R"==(#if OC % OC_BLOCK != 0 )==""\n"
24R"==(#define OC_NBLOCKS_TAIL ((OC - (OC & ~(OC_BLOCK - 1)) + 3) / 4) )==""\n"
25R"==(#else )==""\n"
26R"==(#define OC_NBLOCKS_TAIL 8 )==""\n"
27R"==(#endif )==""\n"
28R"==(#if IW_BLOCK == 4 )==""\n"
29R"==(#define BLOCK 4 )==""\n"
30R"==(#define ACC_DATA_BLOCK int4 )==""\n"
31R"==(#define A_DATA_BLOCK_T MMAD_DATA4_T )==""\n"
32R"==(#define WRITE_LOCAL block_write4 )==""\n"
33R"==(#define READ_BLOCK intel_sub_group_block_read4 )==""\n"
34R"==(DECLARE_MMAD_EMU(mmad_tail, idot4, OC_NBLOCKS_TAIL, 4, A_DATA_BLOCK_T, int8, )==""\n"
35R"==(ACC_DATA_BLOCK) )==""\n"
36R"==(#define MMAD_FULL mmad8x4 )==""\n"
37R"==(#define MMAD_TAIL mmad_tail )==""\n"
38R"==(#elif IW_BLOCK == 8 )==""\n"
39R"==(#define BLOCK 8 )==""\n"
40R"==(#define ACC_DATA_BLOCK int8 )==""\n"
41R"==(#define A_DATA_BLOCK_T MMAD_DATA8_T )==""\n"
42R"==(#define WRITE_LOCAL block_write8 )==""\n"
43R"==(#define READ_BLOCK intel_sub_group_block_read8 )==""\n"
44R"==(DECLARE_MMAD_EMU(mmad_tail, idot4, OC_NBLOCKS_TAIL, 8, A_DATA_BLOCK_T, int8, )==""\n"
45R"==(ACC_DATA_BLOCK) )==""\n"
46R"==(#define MMAD_FULL mmad8x8 )==""\n"
47R"==(#define MMAD_TAIL mmad_tail )==""\n"
48R"==(#else )==""\n"
49R"==(#error "Wrong IW_BLOCK" )==""\n"
50R"==(#endif )==""\n"
51R"==(#define BLOCK_READ_WHT_1x32(data, idx) \ )==""\n"
52R"==(data = as_int(intel_sub_group_block_read((__global uint *)&wei[idx])); )==""\n"
53R"==(#define BLOCK_READ_WHT_8x32(data, idx) \ )==""\n"
54R"==(data = as_int8(intel_sub_group_block_read8((__global uint *)&wei[idx])); )==""\n"
55R"==(#if IC % IC_BLOCK == 0 )==""\n"
56R"==(#define BLOCK_READ_BIA(data, idx) \ )==""\n"
57R"==(data = as_float4(intel_sub_group_block_read4((__global uint *)&bias[idx])); )==""\n"
58R"==(#else )==""\n"
59R"==(#define BLOCK_READ_BIA(data, idx) \ )==""\n"
60R"==(data = (float4)0; \ )==""\n"
61R"==(int i; \ )==""\n"
62R"==(for (i = idx; i < idx + IC_BLOCK && i < IC - (IC % SUB_GROUP_SIZE); \ )==""\n"
63R"==(i += SUB_GROUP_SIZE) { \ )==""\n"
64R"==(data[(i - idx) / SUB_GROUP_SIZE] = as_float( \ )==""\n"
65R"==(intel_sub_group_block_read((__global uint *)&bias[i])); \ )==""\n"
66R"==(} \ )==""\n"
67R"==(if ((get_sub_group_local_id() < IC % SUB_GROUP_SIZE) \ )==""\n"
68R"==(&& (i == IC - IC % SUB_GROUP_SIZE)) { \ )==""\n"
69R"==(data[(i - idx) / SUB_GROUP_SIZE] \ )==""\n"
70R"==(= as_float(bias[i + get_sub_group_local_id()]); \ )==""\n"
71R"==(} )==""\n"
72R"==(#endif )==""\n"
73R"==(#define HAS_PAD_W (PW > 0 || OW * SW - PW + (KW - 1) * (1 + DW) >= IW) )==""\n"
74R"==(#if IS_NHWC )==""\n"
75R"==(inline void write_ic_block4(__global SRC_DATA_T *src, int off, uchar4 value) { )==""\n"
76R"==(const int local_id = get_sub_group_local_id(); )==""\n"
77R"==(#if IC % IC_BLOCK != 0 )==""\n"
78R"==(int tail = IC - off; )==""\n"
79R"==(if (tail < IC_BLOCK) { )==""\n"
80R"==(if (local_id < tail) src[8 * 0 + local_id] = value.s0; )==""\n"
81R"==(if (local_id < tail - 8 * 1) src[1 * 8 + local_id] = value.s1; )==""\n"
82R"==(if (local_id < tail - 8 * 2) src[2 * 8 + local_id] = value.s2; )==""\n"
83R"==(if (local_id < tail - 8 * 3) src[3 * 8 + local_id] = value.s3; )==""\n"
84R"==(return; )==""\n"
85R"==(} )==""\n"
86R"==(#endif )==""\n"
87R"==(#if IC % 4 != 0 )==""\n"
88R"==(src[0 * 8 + local_id] = value.s0; )==""\n"
89R"==(src[1 * 8 + local_id] = value.s1; )==""\n"
90R"==(src[2 * 8 + local_id] = value.s2; )==""\n"
91R"==(src[3 * 8 + local_id] = value.s3; )==""\n"
92R"==(return; )==""\n"
93R"==(#else )==""\n"
94R"==(intel_sub_group_block_write_uc4((__global uchar *)src, value); )==""\n"
95R"==(return; )==""\n"
96R"==(#endif )==""\n"
97R"==(} )==""\n"
98R"==(inline void write_local(__local uint *dst_iw_slm_copy, int oc_nchunk, )==""\n"
99R"==(__global DATA_T *dst_copy) { )==""\n"
100R"==(const int local_id = get_sub_group_local_id(); )==""\n"
101R"==(#if OC % OC_BLOCK != 0 )==""\n"
102R"==(int oc_block_tail = OC % OC_BLOCK; )==""\n"
103R"==(int oc_bound_tail = oc_block_tail % 4; )==""\n"
104R"==(int max_i = (local_id * 4 < (oc_block_tail - oc_bound_tail) )==""\n"
105R"==(|| oc_nchunk < (OC_NCHUNK - 1)) )==""\n"
106R"==(? 4 )==""\n"
107R"==(: (local_id * 4 == (oc_block_tail - oc_bound_tail) ? oc_bound_tail )==""\n"
108R"==(: 0); )==""\n"
109R"==(uchar4 tmp = 0; )==""\n"
110R"==(for (int i = 0; i < max_i; ++i) { )==""\n"
111R"==(tmp[i] = dst_copy[local_id * 4 + i]; )==""\n"
112R"==(} )==""\n"
113R"==(dst_iw_slm_copy[local_id] = as_uint(tmp); )==""\n"
114R"==(return; )==""\n"
115R"==(#endif )==""\n"
116R"==(block_write(dst_iw_slm_copy, )==""\n"
117R"==(intel_sub_group_block_read((const __global uint *)(dst_copy))); )==""\n"
118R"==(return; )==""\n"
119R"==(} )==""\n"
120R"==(#endif )==""\n"
121R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
122R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) )==""\n"
123R"==(__kernel void )==""\n"
124R"==(conv_bwd_data_x8s8x8(const __global SRC_DATA_T *src, const __global char *wei, )==""\n"
125R"==(const __global float *bias, __global DATA_T *dst) { )==""\n"
126R"==(const int group_ic = get_group_id(0) * IC_GROUP; )==""\n"
127R"==(const int group_sp = get_group_id(1) * SP_GROUP; )==""\n"
128R"==(const int group_mb = get_group_id(2) * MB_GROUP; )==""\n"
129R"==(const int sub_group_id = get_sub_group_id(); )==""\n"
130R"==(const int sub_group_lid = get_sub_group_local_id(); )==""\n"
131R"==(const int ic = (sub_group_id % IC_GROUP); )==""\n"
132R"==(const int sp = (sub_group_id / IC_GROUP); )==""\n"
133R"==(const int g = (group_ic + ic) / IC_NCHUNK; )==""\n"
134R"==(const int group_oc = OC_NCHUNK * g; )==""\n"
135R"==(const int gid = group_sp / (IW_NCHUNK * IH); )==""\n"
136R"==(const int gihw = group_sp % (IW_NCHUNK * IH); )==""\n"
137R"==(const int gih = gihw / IW_NCHUNK; )==""\n"
138R"==(const int giw = IW_BLOCK * (gihw % IW_NCHUNK); )==""\n"
139R"==(const int id = gid; )==""\n"
140R"==(const int iw = giw + IW_BLOCK * sp; )==""\n"
141R"==(const int ih = gih; )==""\n"
142R"==(#if IS_NHWC )==""\n"
143R"==(dst += group_mb * MB_BLOCK * OD * OH * OW * G * OC; )==""\n"
144R"==(dst += OC_BLOCK * group_oc; )==""\n"
145R"==(src += group_mb * MB_BLOCK * ID * IH * IW * G * IC; )==""\n"
146R"==(src += (IW * IH * id + IW * ih + iw) * G * IC; )==""\n"
147R"==(src += (group_ic + ic) * IC_BLOCK; )==""\n"
148R"==(#else )==""\n"
149R"==(src += IC_BLOCK * ID * IH * IW * IC_NCHUNK * G * MB_BLOCK * group_mb; )==""\n"
150R"==(src += IC_BLOCK * ID * IH * IW * MB_BLOCK * (group_ic + ic); )==""\n"
151R"==(src += IC_BLOCK * MB_BLOCK * (IW * IH * id + IW * ih + iw); )==""\n"
152R"==(dst += OC_BLOCK * OD * OH * OW * OC_NCHUNK * G * MB_BLOCK * group_mb; )==""\n"
153R"==(dst += OC_BLOCK * OD * OH * OW * MB_BLOCK * group_oc; )==""\n"
154R"==(#endif )==""\n"
155R"==(wei += OC_BLOCK * KD * KH * KW * IC_BLOCK * (group_ic + ic) * OC_NCHUNK; )==""\n"
156R"==(ACC_DATA_BLOCK C00 = 0, C01 = 0, C02 = 0, C03 = 0; )==""\n"
157R"==(__local uint dst_slm[DST_SLM_SIZE]; )==""\n"
158R"==(__local uint *dst_iw_slm = dst_slm + (OC_BLOCK / 4) * sp * IW_BLOCK; )==""\n"
159R"==(__global DATA_T *dst_tmp = dst; )==""\n"
160R"==(for (int oc_chunk = 0; oc_chunk < OC_NCHUNK; oc_chunk++) { )==""\n"
161R"==(A_DATA_BLOCK_T D0; )==""\n"
162R"==(for (int kd = 0; kd < KD; kd++) { )==""\n"
163R"==(int od = id - kd * (1 + DD) + PD; )==""\n"
164R"==(if (od < 0 || od % SD != 0) { )==""\n"
165R"==(wei += IC_BLOCK * OC_BLOCK * KH * KW; )==""\n"
166R"==(continue; )==""\n"
167R"==(} )==""\n"
168R"==(od /= SD; )==""\n"
169R"==(if (od >= OD) { )==""\n"
170R"==(wei += IC_BLOCK * OC_BLOCK * KH * KW; )==""\n"
171R"==(continue; )==""\n"
172R"==(} )==""\n"
173R"==(for (int kh = 0; kh < KH; kh++) { )==""\n"
174R"==(int oh = (ih - kh * (1 + DH) + PH); )==""\n"
175R"==(if (oh < 0 || oh % SH != 0) { )==""\n"
176R"==(wei += IC_BLOCK * OC_BLOCK * KW; )==""\n"
177R"==(continue; )==""\n"
178R"==(} )==""\n"
179R"==(oh /= SH; )==""\n"
180R"==(if (oh >= OH) { )==""\n"
181R"==(wei += IC_BLOCK * OC_BLOCK * KW; )==""\n"
182R"==(continue; )==""\n"
183R"==(} )==""\n"
184R"==(#if IS_NHWC )==""\n"
185R"==(__global DATA_T *dst_cur )==""\n"
186R"==(= dst + (G * OC * (OW * OH * od + OW * oh)); )==""\n"
187R"==(#else )==""\n"
188R"==(__global DATA_T *dst_cur = dst )==""\n"
189R"==(+ (OC_BLOCK * MB_BLOCK * (OW * OH * od + OW * oh)); )==""\n"
190R"==(#endif )==""\n"
191R"==(barrier(CLK_LOCAL_MEM_FENCE); )==""\n"
192R"==(#if !HAS_PAD_W && SW == 1 && KW == 1 && !IS_NHWC && IW % IW_BLOCK == 0 )==""\n"
193R"==(int ow_min = (iw - (KW - 1) * (1 + DW)); )==""\n"
194R"==(int ow_max = (iw + IW_BLOCK - 1); )==""\n"
195R"==(__attribute__((opencl_unroll_hint)) for (int i = ow_min; )==""\n"
196R"==(i <= ow_max; )==""\n"
197R"==(i += IW_BLOCK) { )==""\n"
198R"==(if (i < 0 || i >= OW) { )==""\n"
199R"==(block_write(dst_iw_slm + (i - ow_min) * 8, 0); )==""\n"
200R"==(continue; )==""\n"
201R"==(} )==""\n"
202R"==(WRITE_LOCAL(dst_iw_slm + (i - ow_min) * 8, )==""\n"
203R"==(READ_BLOCK((const __global uint )==""\n"
204R"==(*)(&dst_cur[i * OC_BLOCK]))); )==""\n"
205R"==(} )==""\n"
206R"==(#else )==""\n"
207R"==(int ow_min = (iw - (KW - 1) * (1 + DW) + PW); )==""\n"
208R"==(int ow_max = (iw + IW_BLOCK - 1 + PW); )==""\n"
209R"==(__attribute__((opencl_unroll_hint)) for (int i = ow_min; )==""\n"
210R"==(i <= ow_max; i++) { )==""\n"
211R"==(if (i < 0 || i % SW != 0) { )==""\n"
212R"==(block_write(dst_iw_slm + (i - ow_min) * 8, 0); )==""\n"
213R"==(continue; )==""\n"
214R"==(} )==""\n"
215R"==(int index = i / SW; )==""\n"
216R"==(if (index >= OW) { )==""\n"
217R"==(block_write(dst_iw_slm + (i - ow_min) * 8, 0); )==""\n"
218R"==(continue; )==""\n"
219R"==(} )==""\n"
220R"==(#if IS_NHWC )==""\n"
221R"==(write_local(dst_iw_slm + (i - ow_min) * 8, oc_chunk, )==""\n"
222R"==(dst_cur + (index * G * OC)); )==""\n"
223R"==(#else )==""\n"
224R"==(block_write(dst_iw_slm + (i - ow_min) * 8, )==""\n"
225R"==(block_read((const __global uint )==""\n"
226R"==(*)(&dst_cur[index * OC_BLOCK]))); )==""\n"
227R"==(#endif )==""\n"
228R"==(} )==""\n"
229R"==(#endif )==""\n"
230R"==(barrier(CLK_LOCAL_MEM_FENCE); )==""\n"
231R"==(for (int kw = 0; kw < KW; kw++) { )==""\n"
232R"==(unroll_for(int i = 0; i < IW_BLOCK; i++) { )==""\n"
233R"==(int ow_index = (iw + i - kw * (1 + DW) + PW) - ow_min; )==""\n"
234R"==(D0[i] = block_read(dst_iw_slm + (ow_index * 8)); )==""\n"
235R"==(} )==""\n"
236R"==(int8 W0 = 0, W1 = 0, W2 = 0, W3 = 0; )==""\n"
237R"==(#if OC % OC_BLOCK != 0 )==""\n"
238R"==(if (oc_chunk == OC_NCHUNK - 1) { )==""\n"
239R"==(unroll_for(int i = 0; i < OC_NBLOCKS_TAIL; ++i) )==""\n"
240R"==(BLOCK_READ_WHT_1x32(W0[i], (i + 0) * OC_BLOCK); )==""\n"
241R"==(if (IC > 8) )==""\n"
242R"==(unroll_for(int i = 0; i < OC_NBLOCKS_TAIL; ++i) )==""\n"
243R"==(BLOCK_READ_WHT_1x32( )==""\n"
244R"==(W1[i], (i + 8) * OC_BLOCK); )==""\n"
245R"==(if (IC > 16) )==""\n"
246R"==(unroll_for(int i = 0; i < OC_NBLOCKS_TAIL; ++i) )==""\n"
247R"==(BLOCK_READ_WHT_1x32( )==""\n"
248R"==(W2[i], (i + 16) * OC_BLOCK); )==""\n"
249R"==(if (IC > 24) )==""\n"
250R"==(unroll_for(int i = 0; i < OC_NBLOCKS_TAIL; ++i) )==""\n"
251R"==(BLOCK_READ_WHT_1x32( )==""\n"
252R"==(W3[i], (i + 24) * OC_BLOCK); )==""\n"
253R"==(C00 = MMAD_TAIL(D0, W0, C00); )==""\n"
254R"==(if (IC > 8) C01 = MMAD_TAIL(D0, W1, C01); )==""\n"
255R"==(if (IC > 16) C02 = MMAD_TAIL(D0, W2, C02); )==""\n"
256R"==(if (IC > 24) C03 = MMAD_TAIL(D0, W3, C03); )==""\n"
257R"==(} else )==""\n"
258R"==(#endif )==""\n"
259R"==({ )==""\n"
260R"==(BLOCK_READ_WHT_8x32(W0, 0); )==""\n"
261R"==(if (IC > 8) BLOCK_READ_WHT_8x32(W1, 8 * OC_BLOCK); )==""\n"
262R"==(if (IC > 16) BLOCK_READ_WHT_8x32(W2, 16 * OC_BLOCK); )==""\n"
263R"==(if (IC > 24) BLOCK_READ_WHT_8x32(W3, 24 * OC_BLOCK); )==""\n"
264R"==(C00 = MMAD_FULL(D0, W0, C00); )==""\n"
265R"==(if (IC > 8) C01 = MMAD_FULL(D0, W1, C01); )==""\n"
266R"==(if (IC > 16) C02 = MMAD_FULL(D0, W2, C02); )==""\n"
267R"==(if (IC > 24) C03 = MMAD_FULL(D0, W3, C03); )==""\n"
268R"==(} )==""\n"
269R"==(wei += IC_BLOCK * OC_BLOCK; )==""\n"
270R"==(} )==""\n"
271R"==(} )==""\n"
272R"==(} )==""\n"
273R"==(#if IS_NHWC )==""\n"
274R"==(dst += OC_BLOCK; )==""\n"
275R"==(#else )==""\n"
276R"==(dst += OC_BLOCK * MB_BLOCK * OH * OW * OD; )==""\n"
277R"==(#endif )==""\n"
278R"==(} )==""\n"
279R"==(#define PACK(idx) \ )==""\n"
280R"==(BIAS_SUM_RELU(D00[0], tmp[0], C00[idx], bia[0]); \ )==""\n"
281R"==(BIAS_SUM_RELU(D00[1], tmp[1], C01[idx], bia[1]); \ )==""\n"
282R"==(BIAS_SUM_RELU(D00[2], tmp[2], C02[idx], bia[2]); \ )==""\n"
283R"==(BIAS_SUM_RELU(D00[3], tmp[3], C03[idx], bia[3]); \ )==""\n"
284R"==(src_pack[idx] = as_uint(D00); )==""\n"
285R"==(#if WITH_BIAS )==""\n"
286R"==(#define BIAS_SUM_RELU(RES, TMP, ACC, BIA) \ )==""\n"
287R"==(TMP = (float)ACC + BIA; \ )==""\n"
288R"==(RES = TO_SRC(TMP); )==""\n"
289R"==(#else )==""\n"
290R"==(#define BIAS_SUM_RELU(RES, TMP, ACC, BIA) RES = TO_SRC((float)ACC); )==""\n"
291R"==(#endif )==""\n"
292R"==(uchar4 D00; )==""\n"
293R"==(uint8 src_pack; )==""\n"
294R"==(float4 bia, tmp; )==""\n"
295R"==(BLOCK_READ_BIA(bia, (group_ic + ic) * IC_BLOCK); )==""\n"
296R"==(#if IS_NHWC )==""\n"
297R"==(#if IW_TAIL )==""\n"
298R"==(if (iw + IW_BLOCK > IW) { )==""\n"
299R"==(for (int i = 0; i < IW_TAIL; ++i) { )==""\n"
300R"==(PACK(i); )==""\n"
301R"==(} )==""\n"
302R"==(__attribute__((opencl_unroll_hint(IW_TAIL))) for (int i = 0; )==""\n"
303R"==(i < IW_TAIL; i++) { )==""\n"
304R"==(write_ic_block4(src + i * G * IC, (group_ic + ic) * IC_BLOCK, )==""\n"
305R"==(as_uchar4(src_pack[i])); )==""\n"
306R"==(} )==""\n"
307R"==(} else { )==""\n"
308R"==(#endif )==""\n"
309R"==(#if IW_BLOCK == 4 || IW_BLOCK == 8 )==""\n"
310R"==(for (int i = 0; i < IW_BLOCK; ++i) { )==""\n"
311R"==(PACK(i); )==""\n"
312R"==(} )==""\n"
313R"==(write_ic_block4(src + G * IC * 0, (group_ic + ic) * IC_BLOCK, )==""\n"
314R"==(as_uchar4(src_pack[0])); )==""\n"
315R"==(write_ic_block4(src + G * IC * 1, (group_ic + ic) * IC_BLOCK, )==""\n"
316R"==(as_uchar4(src_pack[1])); )==""\n"
317R"==(write_ic_block4(src + G * IC * 2, (group_ic + ic) * IC_BLOCK, )==""\n"
318R"==(as_uchar4(src_pack[2])); )==""\n"
319R"==(write_ic_block4(src + G * IC * 3, (group_ic + ic) * IC_BLOCK, )==""\n"
320R"==(as_uchar4(src_pack[3])); )==""\n"
321R"==(#endif )==""\n"
322R"==(#if IW_BLOCK == 8 )==""\n"
323R"==(write_ic_block4(src + G * IC * 4, (group_ic + ic) * IC_BLOCK, )==""\n"
324R"==(as_uchar4(src_pack[4])); )==""\n"
325R"==(write_ic_block4(src + G * IC * 5, (group_ic + ic) * IC_BLOCK, )==""\n"
326R"==(as_uchar4(src_pack[5])); )==""\n"
327R"==(write_ic_block4(src + G * IC * 6, (group_ic + ic) * IC_BLOCK, )==""\n"
328R"==(as_uchar4(src_pack[6])); )==""\n"
329R"==(write_ic_block4(src + G * IC * 7, (group_ic + ic) * IC_BLOCK, )==""\n"
330R"==(as_uchar4(src_pack[7])); )==""\n"
331R"==(#endif )==""\n"
332R"==(#if IW_TAIL )==""\n"
333R"==(} )==""\n"
334R"==(#endif )==""\n"
335R"==(#else )==""\n"
336R"==(#if IW_TAIL )==""\n"
337R"==(if (iw + IW_BLOCK > IW) { )==""\n"
338R"==(for (int i = 0; i < IW_TAIL; ++i) { )==""\n"
339R"==(PACK(i); )==""\n"
340R"==(} )==""\n"
341R"==(__attribute__((opencl_unroll_hint(IW_TAIL))) for (int i = 0; )==""\n"
342R"==(i < IW_TAIL; i++) { )==""\n"
343R"==(intel_sub_group_block_write_uc4( )==""\n"
344R"==((__global uchar *)&src[i * IC_BLOCK], )==""\n"
345R"==(as_uchar4(src_pack[i])); )==""\n"
346R"==(} )==""\n"
347R"==(} else { )==""\n"
348R"==(#endif )==""\n"
349R"==(#if IW_BLOCK == 4 || IW_BLOCK == 8 )==""\n"
350R"==(for (int i = 0; i < IW_BLOCK; ++i) { )==""\n"
351R"==(PACK(i); )==""\n"
352R"==(} )==""\n"
353R"==(intel_sub_group_block_write_uc16((__global uchar *)&src[0 * IC_BLOCK], )==""\n"
354R"==(as_uchar16(src_pack.s0123)); )==""\n"
355R"==(#endif )==""\n"
356R"==(#if IW_BLOCK == 8 )==""\n"
357R"==(intel_sub_group_block_write_uc16((__global uchar *)&src[4 * IC_BLOCK], )==""\n"
358R"==(as_uchar16(src_pack.s4567)); )==""\n"
359R"==(#endif )==""\n"
360R"==(#if IW_TAIL )==""\n"
361R"==(} )==""\n"
362R"==(#endif )==""\n"
363R"==(#endif )==""\n"
364R"==(} )==""\n"
365R"==()==";
366}
367}
368}
369}