1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *gen9_conv_bwd_data_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2021 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
21R"==(#if ID > 1 )==""\n"
22R"==(#define CASE_3D 1 )==""\n"
23R"==(#else )==""\n"
24R"==(#define CASE_3D 0 )==""\n"
25R"==(#endif )==""\n"
26R"==(#if BWD_DATA == 1 )==""\n"
27R"==(__attribute__((reqd_work_group_size(LWS_0, LWS_1, LWS_2))) )==""\n"
28R"==(__attribute__((intel_reqd_sub_group_size(SUB_GROUP_SIZE))) )==""\n"
29R"==(__kernel void )==""\n"
30R"==(gen9_conv_bwd_data(__global DATA_T *diff_src, __global DATA_T *wei, )==""\n"
31R"==(__global DATA_T *diff_dst, __global DATA_T *bias) { )==""\n"
32R"==(MAYBE_SKIP_NON_UNIFORM_WG(); )==""\n"
33R"==(#if VER_16MB16C == 1 )==""\n"
34R"==(const int sp = get_group_id(1); )==""\n"
35R"==(const int sglid = get_sub_group_local_id(); )==""\n"
36R"==(const int icb_mb = get_group_id(2); )==""\n"
37R"==(const int mb = icb_mb / (G * IC / ICB) * MB_BLOCK; )==""\n"
38R"==(const int icb = icb_mb % (G * IC / ICB); )==""\n"
39R"==(const int ic = (icb * ICB) / IC_BLOCK )==""\n"
40R"==(+ get_group_id(0) * (LWS_0 / SUB_GROUP_SIZE) + get_sub_group_id(); )==""\n"
41R"==(const int g = ic / (IC / IC_BLOCK); )==""\n"
42R"==(const int gic = ic % (IC / IC_BLOCK); )==""\n"
43R"==(#if CASE_3D )==""\n"
44R"==(const int id = sp / (IW * IH); )==""\n"
45R"==(const int ihw = sp % (IW * IH); )==""\n"
46R"==(#else )==""\n"
47R"==(const int id = 0; )==""\n"
48R"==(const int ihw = sp; )==""\n"
49R"==(#endif )==""\n"
50R"==(const int ih = ihw / IW; )==""\n"
51R"==(const int iw = ihw % IW; )==""\n"
52R"==(diff_dst += mb * OC * G * OD * OH * OW + g * OC * OD * OH * OW * MB_BLOCK; )==""\n"
53R"==(DATA8_T blockC00 = (DATA8_T)DATA_ZERO; )==""\n"
54R"==(DATA8_T blockC01 = (DATA8_T)DATA_ZERO; )==""\n"
55R"==(if (WITH_BIAS) { )==""\n"
56R"==(#if IS_DW )==""\n"
57R"==(const int bg_off = g * IC + sglid; )==""\n"
58R"==(const int bc_off = gic * IC_BLOCK; )==""\n"
59R"==(DATA_T b = (G_WO_PADDING % IC_BLOCK == 0 || bg_off < G_WO_PADDING) )==""\n"
60R"==(? bias[bg_off + bc_off] )==""\n"
61R"==(: DATA_ZERO; )==""\n"
62R"==(#else )==""\n"
63R"==(const int bg_off = g * IC; )==""\n"
64R"==(const int bc_off = gic * IC_BLOCK + sglid; )==""\n"
65R"==(DATA_T b = (IC_WO_PADDING % IC_BLOCK == 0 || bc_off < IC_WO_PADDING) )==""\n"
66R"==(? bias[bg_off + bc_off] )==""\n"
67R"==(: DATA_ZERO; )==""\n"
68R"==(#endif )==""\n"
69R"==(unroll_for(int i = 0; i < 8; ++i) { )==""\n"
70R"==(blockC00[i] = b; )==""\n"
71R"==(blockC01[i] = b; )==""\n"
72R"==(} )==""\n"
73R"==(} )==""\n"
74R"==(wei += gic * KD * KH * KW * OC_BLOCK * IC_BLOCK )==""\n"
75R"==(+ g * IC * OC * KD * KH * KW; )==""\n"
76R"==(int ocb = 0; )==""\n"
77R"==(do { )==""\n"
78R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
79R"==(for (int kd = 0; kd < KD; ++kd) )==""\n"
80R"==(for (int kh = 0; kh < KH; ++kh) )==""\n"
81R"==(for (int kw = 0; kw < KW; ++kw) { )==""\n"
82R"==(if (iw + PW < kw * (1 + DW) || ih + PH < kh * (1 + DH)) )==""\n"
83R"==(continue; )==""\n"
84R"==(#if CASE_3D )==""\n"
85R"==(if (id + PD < kd * (1 + DD)) continue; )==""\n"
86R"==(int od = id - kd * (1 + DD) + PD; )==""\n"
87R"==(if (od % SD != 0) continue; )==""\n"
88R"==(od /= SD; )==""\n"
89R"==(if (od >= OD) continue; )==""\n"
90R"==(#endif )==""\n"
91R"==(int ow = iw - kw * (1 + DW) + PW; )==""\n"
92R"==(int oh = ih - kh * (1 + DH) + PH; )==""\n"
93R"==(#if SW != 1 || SH != 1 )==""\n"
94R"==(if (ow % SW != 0 || oh % SH != 0) continue; )==""\n"
95R"==(ow /= SW; )==""\n"
96R"==(oh /= SH; )==""\n"
97R"==(#endif )==""\n"
98R"==(if (oh >= OH || ow >= OW) continue; )==""\n"
99R"==(const __global DATA_T *diff_dst1 = diff_dst )==""\n"
100R"==(+ ow * OC_BLOCK * MB_BLOCK )==""\n"
101R"==(+ oh * OW * OC_BLOCK * MB_BLOCK; )==""\n"
102R"==(#if CASE_3D )==""\n"
103R"==(diff_dst1 += od * OH * OW * OC_BLOCK * MB_BLOCK; )==""\n"
104R"==(#endif )==""\n"
105R"==(const __global DATA_T *wei1 = wei )==""\n"
106R"==(#if CASE_3D )==""\n"
107R"==(+ kd * KH * KW * OC_BLOCK * IC_BLOCK )==""\n"
108R"==(#endif )==""\n"
109R"==(+ kh * KW * OC_BLOCK * IC_BLOCK )==""\n"
110R"==(+ kw * OC_BLOCK * IC_BLOCK; )==""\n"
111R"==(#else )==""\n"
112R"==(int ow = (iw + PW); )==""\n"
113R"==(int oh = (ih + PH); )==""\n"
114R"==(#if CASE_3D )==""\n"
115R"==(int od = (id + PD); )==""\n"
116R"==(#endif )==""\n"
117R"==(bool do_ker = true; )==""\n"
118R"==(#if SW != 1 || SH != 1 || SD != 1 )==""\n"
119R"==(do_ker = ow % SW == 0 && oh % SH == 0; )==""\n"
120R"==(ow /= SW; )==""\n"
121R"==(oh /= SH; )==""\n"
122R"==(#if CASE_3D )==""\n"
123R"==(do_ker = do_ker && od % SD == 0; )==""\n"
124R"==(od /= SD; )==""\n"
125R"==(#endif )==""\n"
126R"==(#endif )==""\n"
127R"==(#if PH != 0 || PW != 0 || PD != 0 )==""\n"
128R"==(do_ker = do_ker && (oh < OH && ow < OW); )==""\n"
129R"==(#if CASE_3D )==""\n"
130R"==(do_ker = do_ker && (od < OD); )==""\n"
131R"==(#endif )==""\n"
132R"==(#endif )==""\n"
133R"==(#if SW != 1 || SH != 1 || SD != 1 || PH != 0 || PW != 0 || PD != 0 )==""\n"
134R"==(if (do_ker) { )==""\n"
135R"==(#endif )==""\n"
136R"==(const __global DATA_T *diff_dst1 = diff_dst )==""\n"
137R"==(+ ow * OC_BLOCK * MB_BLOCK + oh * OW * OC_BLOCK * MB_BLOCK; )==""\n"
138R"==(#if CASE_3D )==""\n"
139R"==(diff_dst1 += od * OH * OW * OC_BLOCK * MB_BLOCK; )==""\n"
140R"==(#endif )==""\n"
141R"==(const __global DATA_T *wei1 = wei; )==""\n"
142R"==(#endif )==""\n"
143R"==(#define LOAD_DIFF_DST(_block, _diff_dst, mb_chunk) \ )==""\n"
144R"==({ \ )==""\n"
145R"==((_block) = AS_DATA8_T( \ )==""\n"
146R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)((_diff_dst) \ )==""\n"
147R"==(+ (mb_chunk)*OC_BLOCK))); \ )==""\n"
148R"==(} )==""\n"
149R"==(#define SAVE_SRC_DIFF(_block, _diff_src, mb_chunk) \ )==""\n"
150R"==({ \ )==""\n"
151R"==(BLOCK_WRITE8((const __global BLOCK_DATA_T *)(&( \ )==""\n"
152R"==(_diff_src)[(mb_chunk)*IC_BLOCK]), \ )==""\n"
153R"==(AS_BLOCK_DATA8_T((_block))); \ )==""\n"
154R"==(} )==""\n"
155R"==(#if DT_F32 )==""\n"
156R"==(#define TRANSPOSE_8(_block, _col) \ )==""\n"
157R"==((DATA8_T)(intel_sub_group_shuffle(_block, _col)) )==""\n"
158R"==(#else )==""\n"
159R"==(#define TRANSPOSE_8(_block, _col) \ )==""\n"
160R"==((DATA8_T)(intel_sub_group_shuffle(_block[0], _col), \ )==""\n"
161R"==(intel_sub_group_shuffle(_block[1], _col), \ )==""\n"
162R"==(intel_sub_group_shuffle(_block[2], _col), \ )==""\n"
163R"==(intel_sub_group_shuffle(_block[3], _col), \ )==""\n"
164R"==(intel_sub_group_shuffle(_block[4], _col), \ )==""\n"
165R"==(intel_sub_group_shuffle(_block[5], _col), \ )==""\n"
166R"==(intel_sub_group_shuffle(_block[6], _col), \ )==""\n"
167R"==(intel_sub_group_shuffle(_block[7], _col)) )==""\n"
168R"==(#endif )==""\n"
169R"==(#define FMA8(a, b, c) fma((DATA8_T)(a), (DATA8_T)b, (DATA8_T)c) )==""\n"
170R"==(#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB, _blockB1) \ )==""\n"
171R"==({ \ )==""\n"
172R"==(_result = FMA8(_blockB.s0, TRANSPOSE_8(_blockA, 0), _result); \ )==""\n"
173R"==(_result = FMA8(_blockB.s1, TRANSPOSE_8(_blockA, 1), _result); \ )==""\n"
174R"==(_result = FMA8(_blockB.s2, TRANSPOSE_8(_blockA, 2), _result); \ )==""\n"
175R"==(_result = FMA8(_blockB.s3, TRANSPOSE_8(_blockA, 3), _result); \ )==""\n"
176R"==(_result = FMA8(_blockB.s4, TRANSPOSE_8(_blockA, 4), _result); \ )==""\n"
177R"==(_result = FMA8(_blockB.s5, TRANSPOSE_8(_blockA, 5), _result); \ )==""\n"
178R"==(_result = FMA8(_blockB.s6, TRANSPOSE_8(_blockA, 6), _result); \ )==""\n"
179R"==(_result = FMA8(_blockB.s7, TRANSPOSE_8(_blockA, 7), _result); \ )==""\n"
180R"==(_result = FMA8(_blockB1.s0, TRANSPOSE_8(_blockA, 8), _result); \ )==""\n"
181R"==(_result = FMA8(_blockB1.s1, TRANSPOSE_8(_blockA, 9), _result); \ )==""\n"
182R"==(_result = FMA8(_blockB1.s2, TRANSPOSE_8(_blockA, 10), _result); \ )==""\n"
183R"==(_result = FMA8(_blockB1.s3, TRANSPOSE_8(_blockA, 11), _result); \ )==""\n"
184R"==(_result = FMA8(_blockB1.s4, TRANSPOSE_8(_blockA, 12), _result); \ )==""\n"
185R"==(_result = FMA8(_blockB1.s5, TRANSPOSE_8(_blockA, 13), _result); \ )==""\n"
186R"==(_result = FMA8(_blockB1.s6, TRANSPOSE_8(_blockA, 14), _result); \ )==""\n"
187R"==(_result = FMA8(_blockB1.s7, TRANSPOSE_8(_blockA, 15), _result); \ )==""\n"
188R"==(} )==""\n"
189R"==(DATA8_T blockA0, blockA1; )==""\n"
190R"==(LOAD_DIFF_DST(blockA0, diff_dst1, 0); )==""\n"
191R"==(LOAD_DIFF_DST(blockA1, diff_dst1, 8); )==""\n"
192R"==(DATA8_T blockB00 = AS_DATA8_T( )==""\n"
193R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)wei1)); )==""\n"
194R"==(DATA8_T blockB01 = AS_DATA8_T( )==""\n"
195R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)(wei1 )==""\n"
196R"==(+ 8 * IC_BLOCK))); )==""\n"
197R"==(MULTIPLY_BLOCKS_8x8(blockC00, blockA0, blockB00, blockB01); )==""\n"
198R"==(MULTIPLY_BLOCKS_8x8(blockC01, blockA1, blockB00, blockB01); )==""\n"
199R"==(#undef TRANSPOSE_BLOCK_8 )==""\n"
200R"==(#undef MULTIPLY_BLOCKS_8x8 )==""\n"
201R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
202R"==(} )==""\n"
203R"==(#else )==""\n"
204R"==(#if SW != 1 || SH != 1 || SD != 1 || PH != 0 || PW != 0 || PD != 0 )==""\n"
205R"==(} )==""\n"
206R"==(#endif )==""\n"
207R"==(#endif )==""\n"
208R"==(diff_dst += OC_BLOCK * OD * OH * OW * MB_BLOCK; )==""\n"
209R"==(wei += IC * KD * KH * KW * OC_BLOCK; )==""\n"
210R"==(ocb += OC_BLOCK; )==""\n"
211R"==(} while (ocb < OC); )==""\n"
212R"==(__global DATA_T *src_write0 = diff_src + mb * IC * G * ID * IH * IW )==""\n"
213R"==(+ gic * ID * IH * IW * IC_BLOCK * MB_BLOCK )==""\n"
214R"==(+ g * IC * ID * IH * IW * MB_BLOCK )==""\n"
215R"==(+ id * IH * IW * IC_BLOCK * MB_BLOCK + ih * IW * IC_BLOCK * MB_BLOCK )==""\n"
216R"==(+ iw * IC_BLOCK * MB_BLOCK; )==""\n"
217R"==(SAVE_SRC_DIFF(blockC00, src_write0, 0); )==""\n"
218R"==(SAVE_SRC_DIFF(blockC01, src_write0, 8); )==""\n"
219R"==(#endif )==""\n"
220R"==(#if VER_8OW16C == 1 )==""\n"
221R"==(const int sp = get_group_id(1); )==""\n"
222R"==(const int sglid = get_sub_group_local_id(); )==""\n"
223R"==(const int icb_mb = get_group_id(2); )==""\n"
224R"==(const int mb = icb_mb / (G * IC / ICB); )==""\n"
225R"==(const int icb = icb_mb % (G * IC / ICB); )==""\n"
226R"==(const int ic = (icb * ICB) / IC_BLOCK )==""\n"
227R"==(+ get_group_id(0) * (LWS_0 / SUB_GROUP_SIZE) + get_sub_group_id(); )==""\n"
228R"==(const int g = ic / (IC / IC_BLOCK); )==""\n"
229R"==(const int gic = ic % (IC / IC_BLOCK); )==""\n"
230R"==(#if CASE_3D )==""\n"
231R"==(const int id = sp / (IWB * IH); )==""\n"
232R"==(const int ihw = sp % (IWB * IH); )==""\n"
233R"==(#else )==""\n"
234R"==(const int id = 0; )==""\n"
235R"==(const int ihw = sp; )==""\n"
236R"==(#endif )==""\n"
237R"==(const int ih = ihw / IWB; )==""\n"
238R"==(const int iw = (ihw % IWB) * IW_BLOCK; )==""\n"
239R"==(diff_dst += mb * OC * G * OD * OH * OW + g * OC * OD * OH * OW * MB_BLOCK; )==""\n"
240R"==(DATA_T blockC00[IW_BLOCK] = {DATA_ZERO}; )==""\n"
241R"==(if (WITH_BIAS) { )==""\n"
242R"==(#if IS_DW )==""\n"
243R"==(const int bg_off = g * IC + sglid; )==""\n"
244R"==(const int bc_off = gic * IC_BLOCK; )==""\n"
245R"==(DATA_T b = (G_WO_PADDING % IC_BLOCK == 0 || bg_off < G_WO_PADDING) )==""\n"
246R"==(? bias[bg_off + bc_off] )==""\n"
247R"==(: DATA_ZERO; )==""\n"
248R"==(#else )==""\n"
249R"==(const int bg_off = g * IC; )==""\n"
250R"==(const int bc_off = gic * IC_BLOCK + sglid; )==""\n"
251R"==(DATA_T b = (IC_WO_PADDING % IC_BLOCK == 0 || bc_off < IC_WO_PADDING) )==""\n"
252R"==(? bias[bg_off + bc_off] )==""\n"
253R"==(: DATA_ZERO; )==""\n"
254R"==(#endif )==""\n"
255R"==(unroll_for(int i = 0; i < IW_BLOCK; ++i) { blockC00[i] = b; } )==""\n"
256R"==(} )==""\n"
257R"==(wei += gic * KD * KH * KW * OC_BLOCK * IC_BLOCK )==""\n"
258R"==(+ g * IC * OC * KD * KH * KW; )==""\n"
259R"==(for (int ocb = 0; ocb < OC; ocb += OC_BLOCK) { )==""\n"
260R"==(const __global DATA_T *diff_dst1; )==""\n"
261R"==(const __global DATA_T *wei1; )==""\n"
262R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
263R"==(for (int kd = 0; kd < KD; ++kd) { )==""\n"
264R"==(#if CASE_3D )==""\n"
265R"==(if (id + PD < kd * (1 + DD)) continue; )==""\n"
266R"==(int od = id - kd * (1 + DD) + PD; )==""\n"
267R"==(if (od % SD != 0) continue; )==""\n"
268R"==(od /= SD; )==""\n"
269R"==(if (od >= OD) continue; )==""\n"
270R"==(#endif )==""\n"
271R"==(for (int kh = 0; kh < KH; ++kh) { )==""\n"
272R"==(if (ih + PH < kh * (1 + DH)) continue; )==""\n"
273R"==(int oh = ih - kh * (1 + DH) + PH; )==""\n"
274R"==(if (oh % SH != 0) continue; )==""\n"
275R"==(oh /= SH; )==""\n"
276R"==(if (oh >= OH) continue; )==""\n"
277R"==(__attribute__((opencl_unroll_hint(KW))) )==""\n"
278R"==(for (int kw = 0; kw < KW; ++kw) { )==""\n"
279R"==(diff_dst1 = diff_dst + oh * OW * OC_BLOCK * MB_BLOCK; )==""\n"
280R"==(#if CASE_3D )==""\n"
281R"==(diff_dst1 += od * OH * OW * OC_BLOCK * MB_BLOCK; )==""\n"
282R"==(#endif )==""\n"
283R"==(wei1 = wei )==""\n"
284R"==(#if CASE_3D )==""\n"
285R"==(+ kd * KH * KW * OC_BLOCK * IC_BLOCK )==""\n"
286R"==(#endif )==""\n"
287R"==(+ kh * KW * OC_BLOCK * IC_BLOCK )==""\n"
288R"==(+ kw * OC_BLOCK * IC_BLOCK; )==""\n"
289R"==(#else )==""\n"
290R"==(int oh = (ih + PH); )==""\n"
291R"==(#if CASE_3D )==""\n"
292R"==(int od = (id + PD); )==""\n"
293R"==(#endif )==""\n"
294R"==(bool do_ker = true; )==""\n"
295R"==(#if SW != 1 || SH != 1 || SD != 1 )==""\n"
296R"==(do_ker = oh % SH == 0; )==""\n"
297R"==(oh /= SH; )==""\n"
298R"==(#if CASE_3D )==""\n"
299R"==(do_ker = do_ker && od % SD == 0; )==""\n"
300R"==(od /= SD; )==""\n"
301R"==(#endif )==""\n"
302R"==(#endif )==""\n"
303R"==(#if PH != 0 || PW != 0 || PD != 0 )==""\n"
304R"==(do_ker = do_ker && (oh < OH); )==""\n"
305R"==(#if CASE_3D )==""\n"
306R"==(do_ker = do_ker && (od < OD); )==""\n"
307R"==(#endif )==""\n"
308R"==(#endif )==""\n"
309R"==(#if SW != 1 || SH != 1 || SD != 1 || PH != 0 || PW != 0 || PD != 0 )==""\n"
310R"==(if (do_ker) { )==""\n"
311R"==(#endif )==""\n"
312R"==(diff_dst1 = diff_dst + oh * OW * OC_BLOCK * MB_BLOCK; )==""\n"
313R"==(#if CASE_3D )==""\n"
314R"==(diff_dst1 += od * OH * OW * OC_BLOCK * MB_BLOCK; )==""\n"
315R"==(#endif )==""\n"
316R"==(wei1 = wei; )==""\n"
317R"==(#endif )==""\n"
318R"==(#define TRANSPOSE_1(_block, _col) \ )==""\n"
319R"==((DATA_T)(intel_sub_group_shuffle(_block, _col)) )==""\n"
320R"==(#define FMA1(a, b, c) fma((DATA_T)(a), (DATA_T)b, (DATA_T)c) )==""\n"
321R"==(#define HAS_PAD_W (PW > 0 || OW * SW - PW + (KW - 1) * (1 + DW) >= IW) )==""\n"
322R"==(#define _BLOCK_READ8(ptr) \ )==""\n"
323R"==(AS_DATA8_T(BLOCK_READ8((const __global BLOCK_DATA_T *)(ptr))) )==""\n"
324R"==(#define _BLOCK_READ4(ptr) \ )==""\n"
325R"==(AS_DATA4_T(BLOCK_READ4((const __global BLOCK_DATA_T *)(ptr))) )==""\n"
326R"==(#define _BLOCK_READ2(ptr) \ )==""\n"
327R"==(AS_DATA2_T(BLOCK_READ2((const __global BLOCK_DATA_T *)(ptr))) )==""\n"
328R"==(#define _BLOCK_READ(ptr) \ )==""\n"
329R"==(AS_DATA_T(BLOCK_READ((const __global BLOCK_DATA_T *)(ptr))) )==""\n"
330R"==(#define unrolled_read(n, block, ptr) \ )==""\n"
331R"==(do { \ )==""\n"
332R"==(if ((n)&16) { \ )==""\n"
333R"==(*((DATA8_T *)(block)) = _BLOCK_READ8((ptr)); \ )==""\n"
334R"==(*((DATA8_T *)((block) + 8)) = _BLOCK_READ8((ptr) + 8 * 16); \ )==""\n"
335R"==(} \ )==""\n"
336R"==(if ((n)&8) \ )==""\n"
337R"==(*((DATA8_T *)((block) + ((n) & ~15))) \ )==""\n"
338R"==(= _BLOCK_READ8((ptr) + ((n) & ~15) * 16); \ )==""\n"
339R"==(if ((n)&4) \ )==""\n"
340R"==(*((DATA4_T *)((block) + ((n) & ~7))) \ )==""\n"
341R"==(= _BLOCK_READ4((ptr) + ((n) & ~7) * 16); \ )==""\n"
342R"==(if ((n)&2) \ )==""\n"
343R"==(*((DATA2_T *)((block) + ((n) & ~3))) \ )==""\n"
344R"==(= _BLOCK_READ2((ptr) + ((n) & ~3) * 16); \ )==""\n"
345R"==(if ((n)&1) \ )==""\n"
346R"==(*((block) + ((n) & ~1)) = _BLOCK_READ((ptr) + ((n) & ~1) * 16); \ )==""\n"
347R"==(} while (0) )==""\n"
348R"==(#define MULTIPLY_BLOCKS_8x8(_result, _blockA, _blockB, _blockB1) \ )==""\n"
349R"==({ \ )==""\n"
350R"==(_result = FMA1(_blockB.s0, TRANSPOSE_1(_blockA, 0), _result); \ )==""\n"
351R"==(_result = FMA1(_blockB.s1, TRANSPOSE_1(_blockA, 1), _result); \ )==""\n"
352R"==(_result = FMA1(_blockB.s2, TRANSPOSE_1(_blockA, 2), _result); \ )==""\n"
353R"==(_result = FMA1(_blockB.s3, TRANSPOSE_1(_blockA, 3), _result); \ )==""\n"
354R"==(_result = FMA1(_blockB.s4, TRANSPOSE_1(_blockA, 4), _result); \ )==""\n"
355R"==(_result = FMA1(_blockB.s5, TRANSPOSE_1(_blockA, 5), _result); \ )==""\n"
356R"==(_result = FMA1(_blockB.s6, TRANSPOSE_1(_blockA, 6), _result); \ )==""\n"
357R"==(_result = FMA1(_blockB.s7, TRANSPOSE_1(_blockA, 7), _result); \ )==""\n"
358R"==(_result = FMA1(_blockB1.s0, TRANSPOSE_1(_blockA, 8), _result); \ )==""\n"
359R"==(_result = FMA1(_blockB1.s1, TRANSPOSE_1(_blockA, 9), _result); \ )==""\n"
360R"==(_result = FMA1(_blockB1.s2, TRANSPOSE_1(_blockA, 10), _result); \ )==""\n"
361R"==(_result = FMA1(_blockB1.s3, TRANSPOSE_1(_blockA, 11), _result); \ )==""\n"
362R"==(_result = FMA1(_blockB1.s4, TRANSPOSE_1(_blockA, 12), _result); \ )==""\n"
363R"==(_result = FMA1(_blockB1.s5, TRANSPOSE_1(_blockA, 13), _result); \ )==""\n"
364R"==(_result = FMA1(_blockB1.s6, TRANSPOSE_1(_blockA, 14), _result); \ )==""\n"
365R"==(_result = FMA1(_blockB1.s7, TRANSPOSE_1(_blockA, 15), _result); \ )==""\n"
366R"==(} )==""\n"
367R"==(DATA8_T blockB00 = AS_DATA8_T( )==""\n"
368R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)wei1)); )==""\n"
369R"==(DATA8_T blockB01 = AS_DATA8_T( )==""\n"
370R"==(BLOCK_READ8((const __global BLOCK_DATA_T *)(wei1 )==""\n"
371R"==(+ 8 * IC_BLOCK))); )==""\n"
372R"==(DATA_T blockA[IW_BLOCK] = {0}; )==""\n"
373R"==(#if KW == 1 && !HAS_PAD_W && SW == 1 )==""\n"
374R"==(int iw_bound = min(IW_BLOCK, IW - iw); )==""\n"
375R"==(int ow = iw; )==""\n"
376R"==(unrolled_read( )==""\n"
377R"==(iw_bound, &blockA[0], &(diff_dst1)[ow * OC_BLOCK]); )==""\n"
378R"==(__attribute__(( )==""\n"
379R"==(opencl_unroll_hint(IW_BLOCK))) )==""\n"
380R"==(for (int i = 0; i < IW_BLOCK; i++) { )==""\n"
381R"==(MULTIPLY_BLOCKS_8x8( )==""\n"
382R"==(blockC00[i], blockA[i], blockB00, blockB01); )==""\n"
383R"==(} )==""\n"
384R"==(#else )==""\n"
385R"==(__attribute__((opencl_unroll_hint(IW_BLOCK))) )==""\n"
386R"==(for (int i = 0; i < IW_BLOCK; i++) { )==""\n"
387R"==(#if KW != 1 )==""\n"
388R"==(if (iw + i + PW < kw * (1 + DW)) continue; )==""\n"
389R"==(int ow = iw + i - kw * (1 + DW) + PW; )==""\n"
390R"==(#else )==""\n"
391R"==(int ow = iw + i + PW; )==""\n"
392R"==(#endif )==""\n"
393R"==(#if SW != 1 )==""\n"
394R"==(if (ow % SW != 0) continue; )==""\n"
395R"==(ow /= SW; )==""\n"
396R"==(#endif )==""\n"
397R"==(if (ow >= OW) continue; )==""\n"
398R"==(blockA[i] = AS_DATA_T( )==""\n"
399R"==(BLOCK_READ((const __global BLOCK_DATA_T *)(&( )==""\n"
400R"==(diff_dst1)[ow * OC_BLOCK]))); )==""\n"
401R"==(} )==""\n"
402R"==(__attribute__((opencl_unroll_hint(IW_BLOCK))) )==""\n"
403R"==(for (int i = 0; i < IW_BLOCK; i++) { )==""\n"
404R"==(MULTIPLY_BLOCKS_8x8(blockC00[i], blockA[i], blockB00, blockB01); )==""\n"
405R"==(} )==""\n"
406R"==(#endif )==""\n"
407R"==(#undef TRANSPOSE_BLOCK_8 )==""\n"
408R"==(#undef MULTIPLY_BLOCKS_8x8 )==""\n"
409R"==(#if KH != 1 || KW != 1 || KD != 1 )==""\n"
410R"==(} )==""\n"
411R"==(} )==""\n"
412R"==(} )==""\n"
413R"==(#else )==""\n"
414R"==(#if SW != 1 || SH != 1 || SD != 1 || PH != 0 || PW != 0 || PD != 0 )==""\n"
415R"==(} )==""\n"
416R"==(#endif )==""\n"
417R"==(#endif )==""\n"
418R"==(diff_dst += OC_BLOCK * OD * OH * OW * MB_BLOCK; )==""\n"
419R"==(wei += IC * KD * KH * KW * OC_BLOCK; )==""\n"
420R"==(} )==""\n"
421R"==(__global DATA_T *src_write0 = diff_src + mb * IC * G * ID * IH * IW )==""\n"
422R"==(+ gic * ID * IH * IW * IC_BLOCK * MB_BLOCK )==""\n"
423R"==(+ g * IC * ID * IH * IW * MB_BLOCK )==""\n"
424R"==(+ id * IH * IW * IC_BLOCK * MB_BLOCK + ih * IW * IC_BLOCK * MB_BLOCK )==""\n"
425R"==(+ iw * IC_BLOCK * MB_BLOCK; )==""\n"
426R"==(for (int i = 0; i < IW_BLOCK; i++) { )==""\n"
427R"==(if (iw + i >= IW) continue; )==""\n"
428R"==(BLOCK_WRITE((__global BLOCK_DATA_T *)(&(src_write0)[i * IC_BLOCK]), )==""\n"
429R"==(AS_BLOCK_DATA_T(blockC00[i])); )==""\n"
430R"==(} )==""\n"
431R"==(#endif )==""\n"
432R"==(} )==""\n"
433R"==(#endif )==""\n"
434R"==()==";
435}
436}
437}
438}