1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *gen9_conv_dw_fwd_data_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2021 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
21R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
22R"==(#if IS_DW != 1 )==""\n"
23R"==(#error "Kernel supports depth-wise convolutions only" )==""\n"
24R"==(#endif )==""\n"
25R"==(#ifdef DST_DT_S8 )==""\n"
26R"==(#if VER_32MB16C )==""\n"
27R"==(#define DST_MB_BLOCK MB_BLOCK )==""\n"
28R"==(#else )==""\n"
29R"==(#define DST_MB_BLOCK (MB_BLOCK * 2) )==""\n"
30R"==(#endif )==""\n"
31R"==(#define DST_OC_BLOCK (OC_BLOCK * 2) )==""\n"
32R"==(#endif )==""\n"
33R"==(#define APPLY_POST_OPS_COMMON(nelems, accumulator, dest_data, mb_shift) \ )==""\n"
34R"==({ \ )==""\n"
35R"==(const int po_mb = mb_shift + mb; \ )==""\n"
36R"==(const int po_oc = g; \ )==""\n"
37R"==(int po_mb_count; \ )==""\n"
38R"==(if (VER_16MB16C == 1) { \ )==""\n"
39R"==(po_mb_count = nelems; \ )==""\n"
40R"==(} else { \ )==""\n"
41R"==(po_mb_count = 1; \ )==""\n"
42R"==(} \ )==""\n"
43R"==(APPLY_POST_OPS_TRY_BURST(accumulator, DATA_T, dest_data, DATA_T, \ )==""\n"
44R"==(po_mb, po_mb_count, po_oc, SUB_GROUP_SIZE, \ )==""\n"
45R"==(get_sub_group_local_id()); \ )==""\n"
46R"==(} )==""\n"
47R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) )==""\n"
48R"==(#if SUB_GROUP_SIZE != 1 )==""\n"
49R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
50R"==(#endif )==""\n"
51R"==(__kernel void )==""\n"
52R"==(gen9_conv_dw_fwd(const __global DATA_T *src, const __global DATA_T *wei, )==""\n"
53R"==(const __global DATA_T *bias, __global DST_DATA_T *dst POST_OP_ARGS) { )==""\n"
54R"==(MAYBE_SKIP_NON_UNIFORM_WG(); )==""\n"
55R"==(#if VER_8OW16C )==""\n"
56R"==(const int osp = get_global_id(1); )==""\n"
57R"==(const int od = osp / (OWB * OH); )==""\n"
58R"==(const int ohw = osp % (OWB * OH); )==""\n"
59R"==(const int ow = (ohw % OWB) * OW_BLOCK; )==""\n"
60R"==(const int oh = ohw / OWB; )==""\n"
61R"==(const int g )==""\n"
62R"==(= (get_group_id(0) * (LWS_0 / SUB_GROUP_SIZE) + get_sub_group_id()) )==""\n"
63R"==(* OC_BLOCK; )==""\n"
64R"==(const int mb = get_global_id(2) * MB_BLOCK; )==""\n"
65R"==(const int id = od * SD - PD; )==""\n"
66R"==(const int ih = oh * SH - PH; )==""\n"
67R"==(const int iw = ow * SW - PW; )==""\n"
68R"==(#ifdef DST_DT_S8 )==""\n"
69R"==(const int G_32block = G % 32 ? (32 + G - (G % 32)) : G; )==""\n"
70R"==(dst += mb * G_32block * OD * OH * OW )==""\n"
71R"==(+ (g / 32 * 32) * OD * OH * OW * MB_BLOCK )==""\n"
72R"==(+ (od * OH * OW + oh * OW + ow) * MB_BLOCK * (DST_OC_BLOCK) )==""\n"
73R"==(+ (g % 32); )==""\n"
74R"==(#else )==""\n"
75R"==(dst += mb * G * OD * OH * OW + g * OD * OH * OW * MB_BLOCK )==""\n"
76R"==(+ (od * OH * OW + oh * OW + ow) * MB_BLOCK * OC_BLOCK; )==""\n"
77R"==(#endif )==""\n"
78R"==(src += mb )==""\n"
79R"==(* ((G_WO_PADDING / IC_BLOCK) )==""\n"
80R"==(+ (G_WO_PADDING % IC_BLOCK > 0 ? 1 : 0)) )==""\n"
81R"==(* IC_BLOCK * ID * IH * IW )==""\n"
82R"==(+ g * ID * IH * IW * MB_BLOCK )==""\n"
83R"==(+ (id * IH * IW + ih * IW + iw) * MB_BLOCK * IC_BLOCK; )==""\n"
84R"==(wei += g * KD * KH * KW; )==""\n"
85R"==(DATA_T S00[OW_BLOCK] = {DATA_ZERO}; )==""\n"
86R"==(if (WITH_BIAS) { )==""\n"
87R"==(const int bg_off = g + get_sub_group_local_id(); )==""\n"
88R"==(DATA_T b = (G_WO_PADDING % OC_BLOCK == 0 || bg_off < G_WO_PADDING) )==""\n"
89R"==(? bias[bg_off] )==""\n"
90R"==(: DATA_ZERO; )==""\n"
91R"==(unroll_for(int k = 0; k < OW_BLOCK; k++) { S00[k] = b; } )==""\n"
92R"==(} )==""\n"
93R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
94R"==(for (int kd = 0; kd < KD; kd++) )==""\n"
95R"==(for (int kh = 0; kh < KH; kh++) { )==""\n"
96R"==(if (id + kd * (1 + DD) < 0 || id + kd * (1 + DD) >= ID) continue; )==""\n"
97R"==(if (ih + kh * (1 + DH) < 0 || ih + kh * (1 + DH) >= IH) continue; )==""\n"
98R"==(const __global DATA_T *src1 = src )==""\n"
99R"==(+ (kd * (1 + DD) * IH + kh * (1 + DH)) * IW * MB_BLOCK )==""\n"
100R"==(* IC_BLOCK; )==""\n"
101R"==(DATA_T tempA[SW * OW_BLOCK + KW * (1 + DW)] = {0}; )==""\n"
102R"==(__attribute__((opencl_unroll_hint( )==""\n"
103R"==(SW * OW_BLOCK + KW * (1 + DW)))) )==""\n"
104R"==(for (int i = 0; i < SW * OW_BLOCK + KW * (1 + DW); i++) { )==""\n"
105R"==(if ((i + iw) >= 0 && (i + iw) < IW) { )==""\n"
106R"==(tempA[i] = AS_DATA_T(BLOCK_READ((const __global BLOCK_DATA_T )==""\n"
107R"==(*)(&src1[i * IC_BLOCK]))); )==""\n"
108R"==(} )==""\n"
109R"==(} )==""\n"
110R"==(for (int kw = 0; kw < KW; kw++) { )==""\n"
111R"==(const __global DATA_T *wei1 )==""\n"
112R"==(= wei + (kd * KH * KW + kh * KW + kw) * OC_BLOCK; )==""\n"
113R"==(#else )==""\n"
114R"==(const int kw = 0; )==""\n"
115R"==(const __global DATA_T *wei1 = wei; )==""\n"
116R"==(const __global DATA_T *src1 = src; )==""\n"
117R"==(#endif )==""\n"
118R"==(DATA_T B0 = AS_DATA_T( )==""\n"
119R"==(BLOCK_READ((const __global BLOCK_DATA_T *)(wei1))); )==""\n"
120R"==(DATA_T A0; )==""\n"
121R"==(__attribute__((opencl_unroll_hint(OW_BLOCK))) )==""\n"
122R"==(for (int k = 0; k < OW_BLOCK; k++) { )==""\n"
123R"==(if (G != G_WO_PADDING && g >= G_WO_PADDING) { )==""\n"
124R"==(S00[k] = DATA_ZERO; )==""\n"
125R"==(continue; )==""\n"
126R"==(} )==""\n"
127R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
128R"==(A0 = tempA[k * SW + kw * (1 + DW)]; )==""\n"
129R"==(#else )==""\n"
130R"==(if (iw + kw * (1 + DW) + k * SW < 0 )==""\n"
131R"==(|| iw + kw * (1 + DW) + k * SW >= IW) )==""\n"
132R"==(A0 = DATA_ZERO; )==""\n"
133R"==(else )==""\n"
134R"==(A0 = AS_DATA_T(BLOCK_READ( )==""\n"
135R"==((const __global BLOCK_DATA_T *)(&src1[k * SW * IC_BLOCK]))); )==""\n"
136R"==(#endif )==""\n"
137R"==(S00[k] = fma(A0, (DATA_T)B0, S00[k]); )==""\n"
138R"==(} )==""\n"
139R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
140R"==(} )==""\n"
141R"==(} )==""\n"
142R"==(#endif )==""\n"
143R"==(DATA_T D00[OW_BLOCK] = {0}; )==""\n"
144R"==(#if WITH_SUM )==""\n"
145R"==(#ifdef DST_DT_S8 )==""\n"
146R"==(__attribute__((opencl_unroll_hint(OW_BLOCK))) )==""\n"
147R"==(for (int k = 0; k < OW_BLOCK; k++) { )==""\n"
148R"==(D00[k] = CONVERT_DATA_T(BLOCK_READ_DST( )==""\n"
149R"==((const __global DST_DATA_T *)&dst[k * DST_OC_BLOCK])); )==""\n"
150R"==(} )==""\n"
151R"==(#else )==""\n"
152R"==(__attribute__((opencl_unroll_hint(OW_BLOCK))) )==""\n"
153R"==(for (int k = 0; k < OW_BLOCK; k++) { )==""\n"
154R"==(D00[k] = AS_DATA_T( )==""\n"
155R"==(BLOCK_READ((const __global BLOCK_DATA_T *)&dst[k * OC_BLOCK])); )==""\n"
156R"==(} )==""\n"
157R"==(#endif )==""\n"
158R"==(#endif )==""\n"
159R"==(APPLY_POST_OPS_COMMON(OW_BLOCK, S00, D00, 0); )==""\n"
160R"==(if (OW % OW_BLOCK == 0 || ow + OW_BLOCK <= OW) { )==""\n"
161R"==(__attribute__((opencl_unroll_hint)) )==""\n"
162R"==(for (int k = 0; k < OW_BLOCK; k++) { )==""\n"
163R"==(#ifdef DST_DT_S8 )==""\n"
164R"==(BLOCK_WRITE_DST((__global DST_DATA_T *)&dst[k * DST_OC_BLOCK], )==""\n"
165R"==(CONVERT_DST_DATA_T(S00[k])); )==""\n"
166R"==(#else )==""\n"
167R"==(BLOCK_WRITE((__global BLOCK_DATA_T *)&dst[k * OC_BLOCK], )==""\n"
168R"==(AS_UINT_T(S00[k])); )==""\n"
169R"==(#endif )==""\n"
170R"==(} )==""\n"
171R"==(} else { )==""\n"
172R"==(__attribute__((opencl_unroll_hint)) )==""\n"
173R"==(for (int k = 0; k < OW % OW_BLOCK; k++) { )==""\n"
174R"==(#ifdef DST_DT_S8 )==""\n"
175R"==(BLOCK_WRITE_DST((__global DST_DATA_T *)&dst[k * DST_OC_BLOCK], )==""\n"
176R"==(CONVERT_DST_DATA_T(S00[k])); )==""\n"
177R"==(#else )==""\n"
178R"==(BLOCK_WRITE((__global BLOCK_DATA_T *)&dst[k * OC_BLOCK], )==""\n"
179R"==(AS_UINT_T(S00[k])); )==""\n"
180R"==(#endif )==""\n"
181R"==(} )==""\n"
182R"==(} )==""\n"
183R"==(#endif )==""\n"
184R"==(#if VER_16MB16C || VER_32MB16C )==""\n"
185R"==(const int osp = get_global_id(1); )==""\n"
186R"==(const int od = osp / (OWB * OH); )==""\n"
187R"==(const int ohw = osp % (OWB * OH); )==""\n"
188R"==(const int ow = (ohw % OWB) * OW_BLOCK; )==""\n"
189R"==(const int oh = ohw / OWB; )==""\n"
190R"==(const int g )==""\n"
191R"==(= (get_group_id(0) * (LWS_0 / SUB_GROUP_SIZE) + get_sub_group_id()) )==""\n"
192R"==(* OC_BLOCK; )==""\n"
193R"==(const int mb = get_global_id(2) * MB_BLOCK; )==""\n"
194R"==(const int id = od * SD - PD; )==""\n"
195R"==(const int ih = oh * SH - PH; )==""\n"
196R"==(const int iw = ow * SW - PW; )==""\n"
197R"==(#ifdef DST_DT_S8 )==""\n"
198R"==(const int G_32block = G % 32 ? (32 + G - (G % 32)) : G; )==""\n"
199R"==(dst += (mb / DST_MB_BLOCK) * G_32block * OD * OH * OW * DST_MB_BLOCK )==""\n"
200R"==(+ (mb % DST_MB_BLOCK) * DST_OC_BLOCK )==""\n"
201R"==(+ (g / DST_OC_BLOCK) * OD * OH * OW * DST_MB_BLOCK * DST_OC_BLOCK )==""\n"
202R"==(+ (od * OH * OW + oh * OW + ow) * DST_MB_BLOCK * DST_OC_BLOCK )==""\n"
203R"==(+ (g % DST_OC_BLOCK); )==""\n"
204R"==(#else )==""\n"
205R"==(dst += mb * G * OD * OH * OW + g * OD * OH * OW * MB_BLOCK )==""\n"
206R"==(+ (od * OH * OW + oh * OW + ow) * MB_BLOCK * OC_BLOCK; )==""\n"
207R"==(#endif )==""\n"
208R"==(src += mb )==""\n"
209R"==(* ((G_WO_PADDING / IC_BLOCK) )==""\n"
210R"==(+ (G_WO_PADDING % IC_BLOCK > 0 ? 1 : 0)) )==""\n"
211R"==(* IC_BLOCK * ID * IH * IW )==""\n"
212R"==(+ g * ID * IH * IW * MB_BLOCK )==""\n"
213R"==(+ (id * IH * IW + ih * IW + iw) * MB_BLOCK * IC_BLOCK; )==""\n"
214R"==(wei += g * KD * KH * KW; )==""\n"
215R"==(DATA8_T S00 = DATA_ZERO; )==""\n"
216R"==(DATA8_T S01 = DATA_ZERO; )==""\n"
217R"==(#if VER_32MB16C )==""\n"
218R"==(DATA8_T S02 = DATA_ZERO; )==""\n"
219R"==(DATA8_T S03 = DATA_ZERO; )==""\n"
220R"==(#endif )==""\n"
221R"==(if (WITH_BIAS) { )==""\n"
222R"==(const int bg_off = g + get_sub_group_local_id(); )==""\n"
223R"==(DATA_T b = (G_WO_PADDING % OC_BLOCK == 0 || bg_off < G_WO_PADDING) )==""\n"
224R"==(? bias[bg_off] )==""\n"
225R"==(: DATA_ZERO; )==""\n"
226R"==(unroll_for(int k = 0; k < 8; k++) { )==""\n"
227R"==(S00[k] = b; )==""\n"
228R"==(S01[k] = b; )==""\n"
229R"==(#if VER_32MB16C )==""\n"
230R"==(S02[k] = b; )==""\n"
231R"==(S03[k] = b; )==""\n"
232R"==(#endif )==""\n"
233R"==(} )==""\n"
234R"==(} )==""\n"
235R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
236R"==(for (int kd = 0; kd < KD; kd++) )==""\n"
237R"==(for (int kh = 0; kh < KH; kh++) )==""\n"
238R"==(for (int kw = 0; kw < KW; kw++) { )==""\n"
239R"==(if (id + kd * (1 + DD) < 0 || id + kd * (1 + DD) >= ID) )==""\n"
240R"==(continue; )==""\n"
241R"==(if (ih + kh * (1 + DH) < 0 || ih + kh * (1 + DH) >= IH) )==""\n"
242R"==(continue; )==""\n"
243R"==(if (iw + kw * (1 + DW) < 0 || iw + kw * (1 + DW) >= IW) )==""\n"
244R"==(continue; )==""\n"
245R"==(const __global DATA_T *wei1 )==""\n"
246R"==(= wei + (kd * KH * KW + kh * KW + kw) * OC_BLOCK; )==""\n"
247R"==(const __global DATA_T *src1 = src )==""\n"
248R"==(+ (kd * (1 + DD) * IH * IW + kh * (1 + DH) * IW )==""\n"
249R"==(+ kw * (1 + DW)) )==""\n"
250R"==(* MB_BLOCK * IC_BLOCK; )==""\n"
251R"==(#else )==""\n"
252R"==(const __global DATA_T *wei1 = wei; )==""\n"
253R"==(const __global DATA_T *src1 = src; )==""\n"
254R"==(#endif )==""\n"
255R"==(if (G != G_WO_PADDING && g >= G_WO_PADDING) { )==""\n"
256R"==(S00 = DATA_ZERO; )==""\n"
257R"==(S01 = DATA_ZERO; )==""\n"
258R"==(#if VER_32MB16C )==""\n"
259R"==(S02 = DATA_ZERO; )==""\n"
260R"==(S03 = DATA_ZERO; )==""\n"
261R"==(#endif )==""\n"
262R"==(continue; )==""\n"
263R"==(} )==""\n"
264R"==(DATA8_T A0 = AS_DATA8_T( )==""\n"
265R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)(src1))); )==""\n"
266R"==(DATA8_T A1 = AS_DATA8_T(BLOCK_READ8( )==""\n"
267R"==((const __global BLOCK_DATA_T *)&src1[8 * IC_BLOCK])); )==""\n"
268R"==(#if VER_32MB16C )==""\n"
269R"==(DATA8_T A2 = AS_DATA8_T(BLOCK_READ8( )==""\n"
270R"==((const __global BLOCK_DATA_T *)&src1[16 * IC_BLOCK])); )==""\n"
271R"==(DATA8_T A3 = AS_DATA8_T(BLOCK_READ8( )==""\n"
272R"==((const __global BLOCK_DATA_T *)&src1[24 * IC_BLOCK])); )==""\n"
273R"==(#endif )==""\n"
274R"==(DATA_T B0 = AS_DATA_T( )==""\n"
275R"==(BLOCK_READ((const __global BLOCK_DATA_T *)(wei1))); )==""\n"
276R"==(S00 = fma(A0, (DATA8_T)B0, S00); )==""\n"
277R"==(S01 = fma(A1, (DATA8_T)B0, S01); )==""\n"
278R"==(#if VER_32MB16C )==""\n"
279R"==(S02 = fma(A2, (DATA8_T)B0, S02); )==""\n"
280R"==(S03 = fma(A3, (DATA8_T)B0, S03); )==""\n"
281R"==(#endif )==""\n"
282R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
283R"==(} )==""\n"
284R"==(#endif )==""\n"
285R"==(DATA8_T D00; )==""\n"
286R"==(DATA8_T D01; )==""\n"
287R"==(#if VER_32MB16C )==""\n"
288R"==(DATA8_T D02; )==""\n"
289R"==(DATA8_T D03; )==""\n"
290R"==(#endif )==""\n"
291R"==(#if WITH_SUM )==""\n"
292R"==(#ifdef DST_DT_S8 )==""\n"
293R"==(for (int i = 0; i < 8; ++i) { )==""\n"
294R"==(D00[i] = CONVERT_DATA_T( )==""\n"
295R"==(BLOCK_READ_DST((__global DST_DATA_T *)&dst[i * 32])); )==""\n"
296R"==(D01[i] = CONVERT_DATA_T( )==""\n"
297R"==(BLOCK_READ_DST((__global DST_DATA_T *)&dst[(i * 32) + 256])); )==""\n"
298R"==(#if VER_32MB16C )==""\n"
299R"==(D02[i] = CONVERT_DATA_T( )==""\n"
300R"==(BLOCK_READ_DST((__global DST_DATA_T *)&dst[i * 32] + 512)); )==""\n"
301R"==(D03[i] = CONVERT_DATA_T( )==""\n"
302R"==(BLOCK_READ_DST((__global DST_DATA_T *)&dst[(i * 32) + 768])); )==""\n"
303R"==(#endif )==""\n"
304R"==(} )==""\n"
305R"==(#else )==""\n"
306R"==(D00 = AS_DATA8_T(BLOCK_READ8((const __global BLOCK_DATA_T *)dst)); )==""\n"
307R"==(D01 = AS_DATA8_T( )==""\n"
308R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)&dst[8 * OC_BLOCK])); )==""\n"
309R"==(#if VER_32MB16C )==""\n"
310R"==(D02 = AS_DATA8_T( )==""\n"
311R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)&dst[16 * OC_BLOCK])); )==""\n"
312R"==(D03 = AS_DATA8_T( )==""\n"
313R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)&dst[24 * OC_BLOCK])); )==""\n"
314R"==(#endif )==""\n"
315R"==(#endif )==""\n"
316R"==(#endif )==""\n"
317R"==(APPLY_POST_OPS_COMMON(8, S00, D00, 0); )==""\n"
318R"==(APPLY_POST_OPS_COMMON(8, S01, D01, 8); )==""\n"
319R"==(#if VER_32MB16C )==""\n"
320R"==(APPLY_POST_OPS_COMMON(8, S02, D02, 16); )==""\n"
321R"==(APPLY_POST_OPS_COMMON(8, S03, D03, 24); )==""\n"
322R"==(#endif )==""\n"
323R"==(#ifdef DST_DT_S8 )==""\n"
324R"==(for (int i = 0; i < 8; ++i) { )==""\n"
325R"==(BLOCK_WRITE_DST((__global DST_DATA_T *)&dst[i * DST_OC_BLOCK], )==""\n"
326R"==(CONVERT_DST_DATA_T(S00[i])); )==""\n"
327R"==(BLOCK_WRITE_DST((__global DST_DATA_T *)&dst[(i + 8) * DST_OC_BLOCK], )==""\n"
328R"==(CONVERT_DST_DATA_T(S01[i])); )==""\n"
329R"==(#if VER_32MB16C )==""\n"
330R"==(BLOCK_WRITE_DST((__global DST_DATA_T *)&dst[(i + 16) * DST_OC_BLOCK], )==""\n"
331R"==(CONVERT_DST_DATA_T(S02[i])); )==""\n"
332R"==(BLOCK_WRITE_DST((__global DST_DATA_T *)&dst[(i + 24) * DST_OC_BLOCK], )==""\n"
333R"==(CONVERT_DST_DATA_T(S03[i])); )==""\n"
334R"==(#endif )==""\n"
335R"==(} )==""\n"
336R"==(#else )==""\n"
337R"==(BLOCK_WRITE8((__global BLOCK_DATA_T *)&dst[0], AS_UINT8_T(S00)); )==""\n"
338R"==(BLOCK_WRITE8((__global BLOCK_DATA_T *)&dst[8 * OC_BLOCK], AS_UINT8_T(S01)); )==""\n"
339R"==(#if VER_32MB16C )==""\n"
340R"==(BLOCK_WRITE8((__global BLOCK_DATA_T *)&dst[16 * OC_BLOCK], AS_UINT8_T(S02)); )==""\n"
341R"==(BLOCK_WRITE8((__global BLOCK_DATA_T *)&dst[24 * OC_BLOCK], AS_UINT8_T(S03)); )==""\n"
342R"==(#endif )==""\n"
343R"==(#endif )==""\n"
344R"==(#endif )==""\n"
345R"==(return; )==""\n"
346R"==(} )==""\n"
347R"==()==";
348}
349}
350}
351}