gen9_pooling_kernel.cpp source code [oneDNN/build/src/gpu/ocl/gen9_pooling_kernel.cpp]

1	namespace dnnl {
2	namespace impl {
3	namespace gpu {
4	namespace ocl {
// OpenCL C source of the gen9 pooling kernels, kept as a single C string that
// is assembled from adjacent raw-string fragments, one fragment per kernel
// source line, each followed by an explicit newline fragment.
// The variable is a pointer to const char: a plain const char object cannot be
// initialized from a string literal.
5	const char *gen9_pooling_kernel = R"==(/****************************************************************************** )==""\n"
6	R"==(* Copyright 2020-2022 Intel Corporation )==""\n"
7	R"==(* )==""\n"
8	R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9	R"==(* you may not use this file except in compliance with the License. )==""\n"
10	R"==(* You may obtain a copy of the License at )==""\n"
11	R"==(* )==""\n"
12	R"==(* http: )==""\n"
13	R"==(* )==""\n"
14	R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15	R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16	R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17	R"==(* See the License for the specific language governing permissions and )==""\n"
18	R"==(* limitations under the License. )==""\n"
19	R"==(*******************************************************************************/ )==""\n"
// Kernel preamble: the two shared OpenCL headers, then forward declarations of
// the vector block read/write helpers that are defined after the kernels.
20	R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
21	R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
22	R"==(inline VECT_DATA_T read_vect_c_block(int idx, const __global DATA_T *ptr, int c, )==""\n"
23	R"==(int blocks_stride, int chunks_per_block); )==""\n"
24	R"==(inline VECT_INT_T read_vect_c_block_int(int idx, const __global int *ptr, int c, )==""\n"
25	R"==(int blocks_stride, int chunks_per_block); )==""\n"
26	R"==(inline void write_vect_c_block(int idx, __global DATA_T *ptr, int c, )==""\n"
27	R"==(int blocks_stride, int chunks_per_block, VECT_DATA_T block); )==""\n"
28	R"==(inline void write_vect_c_block_int(int idx, __global int *ptr, int c, )==""\n"
29	R"==(int blocks_stride, int chunks_per_block, VECT_INT_T block); )==""\n"
// USE_FLOATS selects float accumulators in the forward kernel: always for
// DT_BF16 or DT_F16, otherwise only for the two averaging algorithms.
// The conditions use the plain logical OR operator; an escaped spelling is not
// valid in a preprocessor expression.
30	R"==(#if DT_BF16 || DT_F16 )==""\n"
31	R"==(#define USE_FLOATS true )==""\n"
32	R"==(#else )==""\n"
33	R"==(#define USE_FLOATS (ALG_AVG_NP || ALG_AVG_P) )==""\n"
34	R"==(#endif )==""\n"
// gen9_pooling_fwd, emitted only when IS_FWD is set.
// Arguments: src (input), ws (workspace written for max pooling in training),
// dst (output), batch_id (scaled by MB_BLOCK_SIZE to form the batch index) and
// the POST_OP_ARGS list. src and ws are __global pointers: the body indexes
// them and hands their element addresses to the pointer-taking block helpers.
// The kernel takes its position (mb0, c, od, oh, ow) from the GWS_GET_* macros
// and produces two result vectors, D0 and D1. With UNROLL_MB the second vector
// belongs to batch index mb1 = mb0 + MB / 2; otherwise it is vector index 1 at
// the same offset as the first.
35	R"==(#if IS_FWD )==""\n"
36	R"==(KERNEL_ATTR )==""\n"
37	R"==(__kernel void gen9_pooling_fwd(__global DATA_T *src, __global int *ws, )==""\n"
38	R"==(__global DATA_T *dst, const int batch_id POST_OP_ARGS) { )==""\n"
39	R"==(const int mb0 = MB_BLOCK_SIZE * batch_id + GWS_GET_MB(); )==""\n"
40	R"==(#if UNROLL_MB )==""\n"
41	R"==(const int mb1 = mb0 + MB / 2; )==""\n"
42	R"==(#endif )==""\n"
43	R"==(const int c = GWS_GET_C(); )==""\n"
44	R"==(const int od = GWS_GET_OD(); )==""\n"
45	R"==(const int oh = GWS_GET_OH(); )==""\n"
46	R"==(const int ow = GWS_GET_OW(); )==""\n"
// Block stride and chunks-per-channel-block for src and dst, chosen by layout
// macro. The workspace reuses the dst values.
47	R"==(#if USE_MB_C_BLOCK )==""\n"
48	R"==(const int src_stride = (SRC_SB0 > 1) ? SRC_SB0 : SRC_S0; )==""\n"
49	R"==(const int dst_stride = (DST_SB0 > 1) ? DST_SB0 : DST_S0; )==""\n"
50	R"==(const int src_chunks_per_c_block = CHUNKS_PER_C_BLOCK; )==""\n"
51	R"==(const int dst_chunks_per_c_block = CHUNKS_PER_C_BLOCK; )==""\n"
52	R"==(#elif USE_ONLY_C_BLOCK )==""\n"
53	R"==(const int src_stride = (SRC_B1 > 1) ? SRC_S1 : SUB_GROUP_SIZE; )==""\n"
54	R"==(const int dst_stride = (DST_B1 > 1) ? DST_S1 : SUB_GROUP_SIZE; )==""\n"
55	R"==(const int src_chunks_per_c_block )==""\n"
56	R"==(= (SRC_B1 > 1) ? (SRC_B1 / SUB_GROUP_SIZE) : 1; )==""\n"
57	R"==(const int dst_chunks_per_c_block )==""\n"
58	R"==(= (DST_B1 > 1) ? (DST_B1 / SUB_GROUP_SIZE) : 1; )==""\n"
59	R"==(#endif )==""\n"
60	R"==(const int ws_stride = dst_stride; )==""\n"
61	R"==(const int ws_chunks_per_c_block = dst_chunks_per_c_block; )==""\n"
// Batch index at or beyond SRC_D0: write zeros to both dst vectors (and to ws
// for max pooling in training), then leave the kernel.
62	R"==(if (mb0 >= SRC_D0) { )==""\n"
63	R"==(VECT_DATA_T dst_zero = DATA_ZERO; )==""\n"
64	R"==(VECT_INT_T ws_zero = 0; )==""\n"
65	R"==(int off = DST_OFF(mb0, c, od, oh, ow); )==""\n"
66	R"==(write_vect_c_block( )==""\n"
67	R"==(0, &dst[off], c, dst_stride, dst_chunks_per_c_block, dst_zero); )==""\n"
68	R"==(write_vect_c_block( )==""\n"
69	R"==(1, &dst[off], c, dst_stride, dst_chunks_per_c_block, dst_zero); )==""\n"
70	R"==(#if ALG_MAX && IS_TRAINING )==""\n"
71	R"==(write_vect_c_block_int( )==""\n"
72	R"==(0, &ws[off], c, ws_stride, ws_chunks_per_c_block, ws_zero); )==""\n"
73	R"==(write_vect_c_block_int( )==""\n"
74	R"==(1, &ws[off], c, ws_stride, ws_chunks_per_c_block, ws_zero); )==""\n"
75	R"==(#endif )==""\n"
76	R"==(return; )==""\n"
77	R"==(} )==""\n"
// First input coordinate of the pooling window; the padding subtraction can
// make it negative, which the bounds checks in the loops below handle.
78	R"==(const int id = od * SD - PD; )==""\n"
79	R"==(const int ih = oh * SH - PH; )==""\n"
80	R"==(const int iw = ow * SW - PW; )==""\n"
// Accumulators start at DATA_MIN for max pooling and at zero otherwise.
81	R"==(#if USE_FLOATS )==""\n"
82	R"==(VECT_FLOAT_T D0 = ALG_MAX ? CONVERT_FLOAT_T(DATA_MIN) : 0.0f; )==""\n"
83	R"==(VECT_FLOAT_T D1 = ALG_MAX ? CONVERT_FLOAT_T(DATA_MIN) : 0.0f; )==""\n"
84	R"==(#else )==""\n"
85	R"==(VECT_DATA_T D0 = ALG_MAX ? DATA_MIN : DATA_ZERO; )==""\n"
86	R"==(VECT_DATA_T D1 = ALG_MAX ? DATA_MIN : DATA_ZERO; )==""\n"
87	R"==(#endif )==""\n"
88	R"==(VECT_INT_T WS0 = 0, WS1 = 0; )==""\n"
// Walk the KD x KH x KW window, skipping taps that fall outside the input.
89	R"==(for (int kd = 0; kd < KD; ++kd) { )==""\n"
90	R"==(if (id + kd < 0 || id + kd >= ID) continue; )==""\n"
91	R"==(for (int kh = 0; kh < KH; ++kh) { )==""\n"
92	R"==(if (ih + kh < 0 || ih + kh >= IH) continue; )==""\n"
93	R"==(for (int kw = 0; kw < KW; ++kw) { )==""\n"
94	R"==(if (iw + kw < 0 || iw + kw >= IW) continue; )==""\n"
95	R"==(int src_off0 = SRC_OFF(mb0, c, id + kd, ih + kh, iw + kw); )==""\n"
96	R"==(#if UNROLL_MB )==""\n"
97	R"==(int src_off1 = SRC_OFF(mb1, c, id + kd, ih + kh, iw + kw); )==""\n"
98	R"==(#endif )==""\n"
99	R"==(#if USE_FLOATS )==""\n"
100	R"==(VECT_FLOAT_T S0 = CONVERT_VECT_FLOAT_T(read_vect_c_block(0, )==""\n"
101	R"==(&src[src_off0], c, src_stride, src_chunks_per_c_block)); )==""\n"
102	R"==(#if UNROLL_MB )==""\n"
103	R"==(VECT_FLOAT_T S1 = CONVERT_VECT_FLOAT_T(read_vect_c_block(0, )==""\n"
104	R"==(&src[src_off1], c, src_stride, src_chunks_per_c_block)); )==""\n"
105	R"==(#else )==""\n"
106	R"==(VECT_FLOAT_T S1 = CONVERT_VECT_FLOAT_T(read_vect_c_block(1, )==""\n"
107	R"==(&src[src_off0], c, src_stride, src_chunks_per_c_block)); )==""\n"
108	R"==(#endif )==""\n"
109	R"==(#else )==""\n"
110	R"==(VECT_DATA_T S0 = read_vect_c_block(0, &src[src_off0], c, )==""\n"
111	R"==(src_stride, src_chunks_per_c_block); )==""\n"
112	R"==(#if UNROLL_MB )==""\n"
113	R"==(VECT_DATA_T S1 = read_vect_c_block(0, &src[src_off1], c, )==""\n"
114	R"==(src_stride, src_chunks_per_c_block); )==""\n"
115	R"==(#else )==""\n"
116	R"==(VECT_DATA_T S1 = read_vect_c_block(1, &src[src_off0], c, )==""\n"
117	R"==(src_stride, src_chunks_per_c_block); )==""\n"
118	R"==(#endif )==""\n"
119	R"==(#endif )==""\n"
// Max pooling keeps the larger value; in training it also stores the flattened
// window index kd * KH * KW + kh * KW + kw of the winning tap in WS0 and WS1.
// The averaging algorithms simply accumulate.
120	R"==(#if ALG_MAX )==""\n"
121	R"==(#if IS_TRAINING )==""\n"
122	R"==(VECT_INT_T CMP0 = isless(D0, S0); )==""\n"
123	R"==(WS0 = select(WS0, kd * KH * KW + kh * KW + kw, CMP0); )==""\n"
124	R"==(D0 = select(D0, S0, CMP0); )==""\n"
125	R"==(VECT_INT_T CMP1 = isless(D1, S1); )==""\n"
126	R"==(WS1 = select(WS1, kd * KH * KW + kh * KW + kw, CMP1); )==""\n"
127	R"==(D1 = select(D1, S1, CMP1); )==""\n"
128	R"==(#else )==""\n"
129	R"==(D0 = max(D0, S0); )==""\n"
130	R"==(D1 = max(D1, S1); )==""\n"
131	R"==(#endif )==""\n"
132	R"==(#else )==""\n"
133	R"==(D0 += S0; )==""\n"
134	R"==(D1 += S1; )==""\n"
135	R"==(#endif )==""\n"
136	R"==(} )==""\n"
137	R"==(} )==""\n"
138	R"==(} )==""\n"
// ALG_AVG_P divides by the full window size KD * KH * KW.
139	R"==(#if ALG_AVG_P )==""\n"
140	R"==(D0 = D0 / (KD * KH * KW); )==""\n"
141	R"==(D1 = D1 / (KD * KH * KW); )==""\n"
142	R"==(#endif )==""\n"
// ALG_AVG_NP divides by the number of window taps that lie inside the input.
143	R"==(#if ALG_AVG_NP )==""\n"
144	R"==(const int id_start = max(od * SD - PD, 0); )==""\n"
145	R"==(const int ih_start = max(oh * SH - PH, 0); )==""\n"
146	R"==(const int iw_start = max(ow * SW - PW, 0); )==""\n"
147	R"==(const int id_end = min(od * SD - PD + KD, ID); )==""\n"
148	R"==(const int ih_end = min(oh * SH - PH + KH, IH); )==""\n"
149	R"==(const int iw_end = min(ow * SW - PW + KW, IW); )==""\n"
150	R"==(const int num_summands )==""\n"
151	R"==(= (ih_end - ih_start) * (iw_end - iw_start) * (id_end - id_start); )==""\n"
152	R"==(D0 = D0 / num_summands; )==""\n"
153	R"==(D1 = D1 / num_summands; )==""\n"
154	R"==(#endif )==""\n"
155	R"==(int dst_off0 = DST_OFF(mb0, c, od, oh, ow); )==""\n"
156	R"==(#if UNROLL_MB )==""\n"
157	R"==(int dst_off1 = DST_OFF(mb1, c, od, oh, ow); )==""\n"
158	R"==(#endif )==""\n"
// sum0 and sum1 are loaded from the current dst contents only under WITH_SUM;
// they are passed to the post-op macro as the sum operand.
159	R"==(VECT_DATA_T sum0; )==""\n"
160	R"==(VECT_DATA_T sum1; )==""\n"
161	R"==(#if WITH_SUM )==""\n"
162	R"==(sum0 = read_vect_c_block( )==""\n"
163	R"==(0, &dst[dst_off0], c, dst_stride, dst_chunks_per_c_block); )==""\n"
164	R"==(#if UNROLL_MB )==""\n"
165	R"==(sum1 = read_vect_c_block( )==""\n"
166	R"==(0, &dst[dst_off1], c, dst_stride, dst_chunks_per_c_block); )==""\n"
167	R"==(#else )==""\n"
168	R"==(sum1 = read_vect_c_block( )==""\n"
169	R"==(1, &dst[dst_off0], c, dst_stride, dst_chunks_per_c_block); )==""\n"
170	R"==(#endif )==""\n"
171	R"==(#endif )==""\n"
// Post-ops are applied one scalar at a time through
// APPLY_POST_OPS_SERIAL_BINARY_2D. Elements whose channel index po_oc is at or
// beyond C_WO_PADDING are left untouched.
172	R"==(const int local_id = get_sub_group_local_id(); )==""\n"
173	R"==(#if VECT_DT_N == 1 )==""\n"
174	R"==(const int po_mb = mb0; )==""\n"
175	R"==(const int po_oc = c + local_id; )==""\n"
176	R"==(if (po_oc < C_WO_PADDING) { )==""\n"
177	R"==(POST_OP_DATA_T po_sum0 = DATA_TO_REF(sum0); )==""\n"
178	R"==(float po_D0 = USE_FLOATS ? D0 : CONVERT_FLOAT_T(D0); )==""\n"
179	R"==(APPLY_POST_OPS_SERIAL_BINARY_2D( )==""\n"
180	R"==(po_D0, float, po_sum0, POST_OP_DATA_T, po_mb, 1, po_oc, 1); )==""\n"
181	R"==(D0 = USE_FLOATS ? po_D0 : CONVERT_DATA_T(po_D0); )==""\n"
182	R"==(POST_OP_DATA_T po_sum1 = DATA_TO_REF(sum1); )==""\n"
183	R"==(float po_D1 = USE_FLOATS ? D1 : CONVERT_FLOAT_T(D1); )==""\n"
184	R"==(APPLY_POST_OPS_SERIAL_BINARY_2D( )==""\n"
185	R"==(po_D1, float, po_sum1, POST_OP_DATA_T, po_mb, 1, po_oc, 1); )==""\n"
186	R"==(D1 = USE_FLOATS ? po_D1 : CONVERT_DATA_T(po_D1); )==""\n"
187	R"==(} )==""\n"
188	R"==(#else )==""\n"
189	R"==(for (int idx = 0; idx < VECT_DT_N; ++idx) { )==""\n"
190	R"==(#if USE_MB_C_BLOCK )==""\n"
191	R"==(int c_sub_block_id = idx % CHUNKS_PER_C_BLOCK; )==""\n"
192	R"==(int mb_sub_block_id = idx / CHUNKS_PER_C_BLOCK; )==""\n"
193	R"==(const int po_oc = c + c_sub_block_id * SUB_GROUP_SIZE + local_id; )==""\n"
194	R"==(int po_mb = (mb0 + mb_sub_block_id) % MB; )==""\n"
195	R"==(#else )==""\n"
196	R"==(const int po_oc = c + idx * SUB_GROUP_SIZE + local_id; )==""\n"
197	R"==(int po_mb = mb0; )==""\n"
198	R"==(#endif )==""\n"
199	R"==(if (po_mb >= MB || po_oc >= C_WO_PADDING) continue; )==""\n"
200	R"==(float d0_i = USE_FLOATS ? D0[idx] : CONVERT_FLOAT_T(D0[idx]); )==""\n"
201	R"==(POST_OP_DATA_T sum0_i = DATA_TO_REF(sum0[idx]); )==""\n"
202	R"==(APPLY_POST_OPS_SERIAL_BINARY_2D( )==""\n"
203	R"==(d0_i, float, sum0_i, POST_OP_DATA_T, po_mb, 1, po_oc, 1); )==""\n"
204	R"==(D0[idx] = USE_FLOATS ? d0_i : CONVERT_DATA_T(d0_i); )==""\n"
205	R"==(float d1_i = USE_FLOATS ? D1[idx] : CONVERT_FLOAT_T(D1[idx]); )==""\n"
206	R"==(POST_OP_DATA_T sum1_i = DATA_TO_REF(sum1[idx]); )==""\n"
// NOTE(review): the second vector is attributed to batch index
// po_mb + VECT_DT_N here, in every layout branch - confirm against the layout.
207	R"==(po_mb += VECT_DT_N; )==""\n"
208	R"==(APPLY_POST_OPS_SERIAL_BINARY_2D( )==""\n"
209	R"==(d1_i, float, sum1_i, POST_OP_DATA_T, po_mb, 1, po_oc, 1); )==""\n"
210	R"==(D1[idx] = USE_FLOATS ? d1_i : CONVERT_DATA_T(d1_i); )==""\n"
211	R"==(} )==""\n"
212	R"==(#endif )==""\n"
// Convert float accumulators back to the storage type, then store both result
// vectors.
213	R"==(#if USE_FLOATS )==""\n"
214	R"==(VECT_DATA_T res0 = CONVERT_VECTOR_DATA_T(D0); )==""\n"
215	R"==(VECT_DATA_T res1 = CONVERT_VECTOR_DATA_T(D1); )==""\n"
216	R"==(#else )==""\n"
217	R"==(VECT_DATA_T res0 = D0; )==""\n"
218	R"==(VECT_DATA_T res1 = D1; )==""\n"
219	R"==(#endif )==""\n"
220	R"==(write_vect_c_block( )==""\n"
221	R"==(0, &dst[dst_off0], c, dst_stride, dst_chunks_per_c_block, res0); )==""\n"
222	R"==(#if UNROLL_MB )==""\n"
223	R"==(write_vect_c_block( )==""\n"
224	R"==(0, &dst[dst_off1], c, dst_stride, dst_chunks_per_c_block, res1); )==""\n"
225	R"==(#else )==""\n"
226	R"==(write_vect_c_block( )==""\n"
227	R"==(1, &dst[dst_off0], c, dst_stride, dst_chunks_per_c_block, res1); )==""\n"
228	R"==(#endif )==""\n"
// The workspace is written at the same offsets as dst.
229	R"==(#if ALG_MAX && IS_TRAINING )==""\n"
230	R"==(int ws_off0 = dst_off0; )==""\n"
231	R"==(#if UNROLL_MB )==""\n"
232	R"==(int ws_off1 = dst_off1; )==""\n"
233	R"==(#endif )==""\n"
234	R"==(write_vect_c_block_int( )==""\n"
235	R"==(0, &ws[ws_off0], c, ws_stride, ws_chunks_per_c_block, WS0); )==""\n"
236	R"==(#if UNROLL_MB )==""\n"
237	R"==(write_vect_c_block_int( )==""\n"
238	R"==(0, &ws[ws_off1], c, ws_stride, ws_chunks_per_c_block, WS1); )==""\n"
239	R"==(#else )==""\n"
240	R"==(write_vect_c_block_int( )==""\n"
241	R"==(1, &ws[ws_off0], c, ws_stride, ws_chunks_per_c_block, WS1); )==""\n"
242	R"==(#endif )==""\n"
243	R"==(#endif )==""\n"
244	R"==(} )==""\n"
245	R"==(#endif )==""\n"
// gen9_pooling_bwd, emitted only when IS_BWD is set.
// Arguments: diff_src (gradient written by this kernel), ws (workspace read
// for max pooling) and diff_dst (incoming gradient). diff_src and ws are
// __global pointers: the body indexes them and passes their element addresses
// to the pointer-taking block helpers.
// For the input position (mb0, c, id, ih, iw) taken from the GWS_GET_* macros
// the kernel sums the diff_dst values of every output position whose window
// reaches that input position and stores the sums to diff_src. With UNROLL_MB
// four batch indices spaced MB / 4 apart are processed; otherwise vector
// indices 0 and 1 at the same offset.
246	R"==(#if IS_BWD )==""\n"
247	R"==(KERNEL_ATTR )==""\n"
248	R"==(__kernel void gen9_pooling_bwd(__global DATA_T *diff_src, __global int *ws, )==""\n"
249	R"==(__global DATA_T *diff_dst) { )==""\n"
250	R"==(const int mb0 = GWS_GET_MB(); )==""\n"
251	R"==(#if UNROLL_MB )==""\n"
252	R"==(const int mb1 = mb0 + MB / 4; )==""\n"
253	R"==(const int mb2 = mb1 + MB / 4; )==""\n"
254	R"==(const int mb3 = mb2 + MB / 4; )==""\n"
255	R"==(#endif )==""\n"
256	R"==(const int c = GWS_GET_C(); )==""\n"
257	R"==(const int id = GWS_GET_ID(); )==""\n"
258	R"==(const int ih = GWS_GET_IH(); )==""\n"
259	R"==(const int iw = GWS_GET_IW(); )==""\n"
// Block stride and chunks-per-channel-block for src and dst, chosen by layout
// macro. The workspace reuses the dst values.
260	R"==(#if USE_MB_C_BLOCK )==""\n"
261	R"==(const int src_stride = (SRC_SB0 > 1) ? SRC_SB0 : SRC_S0; )==""\n"
262	R"==(const int dst_stride = (DST_SB0 > 1) ? DST_SB0 : DST_S0; )==""\n"
263	R"==(const int src_chunks_per_c_block = CHUNKS_PER_C_BLOCK; )==""\n"
264	R"==(const int dst_chunks_per_c_block = CHUNKS_PER_C_BLOCK; )==""\n"
265	R"==(#elif USE_ONLY_C_BLOCK )==""\n"
266	R"==(const int src_stride = (SRC_B1 > 1) ? SRC_S1 : SUB_GROUP_SIZE; )==""\n"
267	R"==(const int dst_stride = (DST_B1 > 1) ? DST_S1 : SUB_GROUP_SIZE; )==""\n"
268	R"==(const int src_chunks_per_c_block )==""\n"
269	R"==(= (SRC_B1 > 1) ? (SRC_B1 / SUB_GROUP_SIZE) : 1; )==""\n"
270	R"==(const int dst_chunks_per_c_block )==""\n"
271	R"==(= (DST_B1 > 1) ? (DST_B1 / SUB_GROUP_SIZE) : 1; )==""\n"
272	R"==(#endif )==""\n"
273	R"==(const int ws_stride = dst_stride; )==""\n"
274	R"==(const int ws_chunks_per_c_block = dst_chunks_per_c_block; )==""\n"
275	R"==(VECT_FLOAT_T S0 = 0, S1 = 0; )==""\n"
276	R"==(#if UNROLL_MB )==""\n"
277	R"==(VECT_FLOAT_T S2 = 0, S3 = 0; )==""\n"
278	R"==(#endif )==""\n"
// For each kernel tap, recover the output coordinate that reads this input
// coordinate. Taps are skipped when the stride does not divide the distance
// evenly or when the resulting output index is out of range.
279	R"==(for (int kd = 0; kd < KD; kd++) { )==""\n"
280	R"==(int od = (id + PD - kd); )==""\n"
281	R"==(if (od % SD != 0) continue; )==""\n"
282	R"==(od /= SD; )==""\n"
283	R"==(if (od < 0 || od >= OD) continue; )==""\n"
284	R"==(for (int kh = 0; kh < KH; kh++) { )==""\n"
285	R"==(int oh = (ih + PH - kh); )==""\n"
286	R"==(if (oh % SH != 0) continue; )==""\n"
287	R"==(oh /= SH; )==""\n"
288	R"==(if (oh < 0 || oh >= OH) continue; )==""\n"
289	R"==(for (int kw = 0; kw < KW; kw++) { )==""\n"
290	R"==(int ow = (iw + PW - kw); )==""\n"
291	R"==(if (ow % SW != 0) continue; )==""\n"
292	R"==(ow /= SW; )==""\n"
293	R"==(if (ow < 0 || ow >= OW) continue; )==""\n"
294	R"==(const int dst_off0 = DST_OFF(mb0, c, od, oh, ow); )==""\n"
295	R"==(#if UNROLL_MB )==""\n"
296	R"==(const int dst_off1 = DST_OFF(mb1, c, od, oh, ow); )==""\n"
297	R"==(const int dst_off2 = DST_OFF(mb2, c, od, oh, ow); )==""\n"
298	R"==(const int dst_off3 = DST_OFF(mb3, c, od, oh, ow); )==""\n"
299	R"==(#endif )==""\n"
300	R"==(VECT_FLOAT_T D0 = CONVERT_VECT_FLOAT_T( )==""\n"
301	R"==(read_vect_c_block(0, &diff_dst[dst_off0], c, dst_stride, )==""\n"
302	R"==(dst_chunks_per_c_block)); )==""\n"
303	R"==(#if UNROLL_MB )==""\n"
304	R"==(VECT_FLOAT_T D1 = CONVERT_VECT_FLOAT_T( )==""\n"
305	R"==(read_vect_c_block(0, &diff_dst[dst_off1], c, dst_stride, )==""\n"
306	R"==(dst_chunks_per_c_block)); )==""\n"
307	R"==(VECT_FLOAT_T D2 = CONVERT_VECT_FLOAT_T( )==""\n"
308	R"==(read_vect_c_block(0, &diff_dst[dst_off2], c, dst_stride, )==""\n"
309	R"==(dst_chunks_per_c_block)); )==""\n"
310	R"==(VECT_FLOAT_T D3 = CONVERT_VECT_FLOAT_T( )==""\n"
311	R"==(read_vect_c_block(0, &diff_dst[dst_off3], c, dst_stride, )==""\n"
312	R"==(dst_chunks_per_c_block)); )==""\n"
313	R"==(#else )==""\n"
314	R"==(VECT_FLOAT_T D1 = CONVERT_VECT_FLOAT_T( )==""\n"
315	R"==(read_vect_c_block(1, &diff_dst[dst_off0], c, dst_stride, )==""\n"
316	R"==(dst_chunks_per_c_block)); )==""\n"
317	R"==(#endif )==""\n"
// Max pooling: the workspace is read at the diff_dst offsets, and a gradient
// element is replaced by zero wherever the stored index differs from the
// flattened index kd * KH * KW + kh * KW + kw of the current tap.
318	R"==(#if ALG_MAX )==""\n"
319	R"==(VECT_INT_T WS0 = read_vect_c_block_int( )==""\n"
320	R"==(0, &ws[dst_off0], c, ws_stride, ws_chunks_per_c_block); )==""\n"
321	R"==(#if UNROLL_MB )==""\n"
322	R"==(VECT_INT_T WS1 = read_vect_c_block_int( )==""\n"
323	R"==(0, &ws[dst_off1], c, ws_stride, ws_chunks_per_c_block); )==""\n"
324	R"==(VECT_INT_T WS2 = read_vect_c_block_int( )==""\n"
325	R"==(0, &ws[dst_off2], c, ws_stride, ws_chunks_per_c_block); )==""\n"
326	R"==(VECT_INT_T WS3 = read_vect_c_block_int( )==""\n"
327	R"==(0, &ws[dst_off3], c, ws_stride, ws_chunks_per_c_block); )==""\n"
328	R"==(#else )==""\n"
329	R"==(VECT_INT_T WS1 = read_vect_c_block_int( )==""\n"
330	R"==(1, &ws[dst_off0], c, ws_stride, ws_chunks_per_c_block); )==""\n"
331	R"==(#endif )==""\n"
332	R"==(VECT_INT_T CMP0 = isnotequal( )==""\n"
333	R"==(AS_VECT_FLOAT_T(WS0 - kd * KH * KW - kh * KW - kw), )==""\n"
334	R"==((VECT_FLOAT_T)0); )==""\n"
335	R"==(D0 = select(D0, (VECT_FLOAT_T)0, CMP0); )==""\n"
336	R"==(VECT_INT_T CMP1 = isnotequal( )==""\n"
337	R"==(AS_VECT_FLOAT_T(WS1 - kd * KH * KW - kh * KW - kw), )==""\n"
338	R"==((VECT_FLOAT_T)0); )==""\n"
339	R"==(D1 = select(D1, (VECT_FLOAT_T)0, CMP1); )==""\n"
340	R"==(#if UNROLL_MB )==""\n"
341	R"==(VECT_INT_T CMP2 = isnotequal( )==""\n"
342	R"==(AS_VECT_FLOAT_T(WS2 - kd * KH * KW - kh * KW - kw), )==""\n"
343	R"==((VECT_FLOAT_T)0); )==""\n"
344	R"==(D2 = select(D2, (VECT_FLOAT_T)0, CMP2); )==""\n"
345	R"==(VECT_INT_T CMP3 = isnotequal( )==""\n"
346	R"==(AS_VECT_FLOAT_T(WS3 - kd * KH * KW - kh * KW - kw), )==""\n"
347	R"==((VECT_FLOAT_T)0); )==""\n"
348	R"==(D3 = select(D3, (VECT_FLOAT_T)0, CMP3); )==""\n"
349	R"==(#endif )==""\n"
350	R"==(#endif )==""\n"
// ALG_AVG_NP: scale by the clamped extent of the window computed below.
// NOTE(review): only D0 and D1 are divided here; D2 and D3 are not - confirm
// that UNROLL_MB is never combined with ALG_AVG_NP.
351	R"==(#if ALG_AVG_NP )==""\n"
352	R"==(const int id_start = max(id - kd, 0); )==""\n"
353	R"==(const int ih_start = max(ih - kh, 0); )==""\n"
354	R"==(const int iw_start = max(iw - kw, 0); )==""\n"
355	R"==(const int id_end = min(id - kd + KD, ID); )==""\n"
356	R"==(const int ih_end = min(ih - kh + KH, IH); )==""\n"
357	R"==(const int iw_end = min(iw - kw + KW, IW); )==""\n"
358	R"==(const int num_summands = (ih_end - ih_start) )==""\n"
359	R"==(* (iw_end - iw_start) * (id_end - id_start); )==""\n"
360	R"==(D0 /= num_summands; )==""\n"
361	R"==(D1 /= num_summands; )==""\n"
362	R"==(#endif )==""\n"
363	R"==(S0 += D0; )==""\n"
364	R"==(S1 += D1; )==""\n"
365	R"==(#if UNROLL_MB )==""\n"
366	R"==(S2 += D2; )==""\n"
367	R"==(S3 += D3; )==""\n"
368	R"==(#endif )==""\n"
369	R"==(} )==""\n"
370	R"==(} )==""\n"
371	R"==(} )==""\n"
// ALG_AVG_P divides the accumulated sums by the full window size.
372	R"==(#if ALG_AVG_P )==""\n"
373	R"==(S0 /= KD * KH * KW; )==""\n"
374	R"==(S1 /= KD * KH * KW; )==""\n"
375	R"==(#if UNROLL_MB )==""\n"
376	R"==(S2 /= KD * KH * KW; )==""\n"
377	R"==(S3 /= KD * KH * KW; )==""\n"
378	R"==(#endif )==""\n"
379	R"==(#endif )==""\n"
// Convert the float sums to the storage type and store them to diff_src.
380	R"==(int src_off0 = SRC_OFF(mb0, c, id, ih, iw); )==""\n"
381	R"==(#if UNROLL_MB )==""\n"
382	R"==(int src_off1 = SRC_OFF(mb1, c, id, ih, iw); )==""\n"
383	R"==(int src_off2 = SRC_OFF(mb2, c, id, ih, iw); )==""\n"
384	R"==(int src_off3 = SRC_OFF(mb3, c, id, ih, iw); )==""\n"
385	R"==(#endif )==""\n"
386	R"==(write_vect_c_block(0, &diff_src[src_off0], c, src_stride, )==""\n"
387	R"==(src_chunks_per_c_block, CONVERT_VECTOR_DATA_T(S0)); )==""\n"
388	R"==(#if UNROLL_MB )==""\n"
389	R"==(write_vect_c_block(0, &diff_src[src_off1], c, src_stride, )==""\n"
390	R"==(src_chunks_per_c_block, CONVERT_VECTOR_DATA_T(S1)); )==""\n"
391	R"==(write_vect_c_block(0, &diff_src[src_off2], c, src_stride, )==""\n"
392	R"==(src_chunks_per_c_block, CONVERT_VECTOR_DATA_T(S2)); )==""\n"
393	R"==(write_vect_c_block(0, &diff_src[src_off3], c, src_stride, )==""\n"
394	R"==(src_chunks_per_c_block, CONVERT_VECTOR_DATA_T(S3)); )==""\n"
395	R"==(#else )==""\n"
396	R"==(write_vect_c_block(1, &diff_src[src_off0], c, src_stride, )==""\n"
397	R"==(src_chunks_per_c_block, CONVERT_VECTOR_DATA_T(S1)); )==""\n"
398	R"==(#endif )==""\n"
399	R"==(} )==""\n"
400	R"==(#endif )==""\n"
// read_c_block: returns one DATA_T per caller from ptr for the channel chunk
// that starts at channel c.
// When C_W_PADDING is not a multiple of SUB_GROUP_SIZE, the value is read as
// ptr[local_id] only while local_id is below C_WO_PADDING - c; other callers
// get 0. Otherwise the value comes from a single BLOCK_READ on ptr.
401	R"==(inline DATA_T read_c_block(const __global DATA_T *ptr, int c) { )==""\n"
402	R"==(#if C_W_PADDING % SUB_GROUP_SIZE != 0 )==""\n"
403	R"==(int local_id = get_sub_group_local_id(); )==""\n"
404	R"==(int tail = C_WO_PADDING - c; )==""\n"
405	R"==(return (local_id < tail) ? ptr[local_id] : 0; )==""\n"
406	R"==(#else )==""\n"
407	R"==(return AS_DATA_T(BLOCK_READ((const __global BLOCK_DATA_T *)ptr)); )==""\n"
408	R"==(#endif )==""\n"
409	R"==(} )==""\n"
// CALC_VECT_LEN: statement-expression macro that yields how many vector
// elements read_vect_c_block fills one by one.
// The result is C_WO_PADDING / SUB_GROUP_SIZE + 1 when USE_ONLY_C_BLOCK is 1
// and that count is smaller than VECT_DT_N; otherwise it is VECT_DT_N.
410	R"==(#define CALC_VECT_LEN() \ )==""\n"
411	R"==(({ \ )==""\n"
412	R"==(int size; \ )==""\n"
413	R"==(if (USE_ONLY_C_BLOCK == 1 \ )==""\n"
414	R"==(&& VECT_DT_N > C_WO_PADDING / SUB_GROUP_SIZE + 1) \ )==""\n"
415	R"==(size = C_WO_PADDING / SUB_GROUP_SIZE + 1; \ )==""\n"
416	R"==(else \ )==""\n"
417	R"==(size = VECT_DT_N; \ )==""\n"
418	R"==(size; \ )==""\n"
419	R"==(}) )==""\n"
// read_vect_c_block: loads vector number idx (VECT_DT_N elements) relative to
// ptr for the channel position c.
// Returns 0 when idx is at or beyond NVECT.
// Fast path: if blocks_stride equals chunks_per_block * SUB_GROUP_SIZE and
// C_WO_PADDING is a multiple of that product, one VECT_BLOCK_READ at element
// offset idx * VECT_DT_N * SUB_GROUP_SIZE is used.
// Slow path: the first CALC_VECT_LEN() elements are read individually through
// read_c_block and any remaining elements are set to 0.
420	R"==(inline VECT_DATA_T read_vect_c_block(int idx, const __global DATA_T *ptr, int c, )==""\n"
421	R"==(int blocks_stride, int chunks_per_block) { )==""\n"
422	R"==(if (idx >= NVECT) return 0; )==""\n"
423	R"==(if ((blocks_stride == chunks_per_block * SUB_GROUP_SIZE) )==""\n"
424	R"==(&& (C_WO_PADDING % (chunks_per_block * SUB_GROUP_SIZE) == 0)) { )==""\n"
425	R"==(return AS_VECT_DATA_T(VECT_BLOCK_READ((const __global BLOCK_DATA_T *)ptr )==""\n"
426	R"==(+ idx * VECT_DT_N * SUB_GROUP_SIZE)); )==""\n"
427	R"==(} else { )==""\n"
428	R"==(VECT_DATA_T ret; )==""\n"
429	R"==(for (int i = 0; i < CALC_VECT_LEN(); i++) { )==""\n"
// offset_index is the chunk number counted across all vectors. It is split
// into a chunk position inside a block and a block number, which are scaled by
// SUB_GROUP_SIZE and blocks_stride to form the pointer offset.
430	R"==(const int offset_index = (idx * VECT_DT_N + i); )==""\n"
431	R"==(const int local_c_block_index = offset_index % chunks_per_block; )==""\n"
432	R"==(const int global_c_block_index = offset_index / chunks_per_block; )==""\n"
433	R"==(const int ptr_offset = local_c_block_index * SUB_GROUP_SIZE )==""\n"
434	R"==(+ global_c_block_index * blocks_stride; )==""\n"
// Channel offset handed to read_c_block for its tail check.
435	R"==(const int c_off )==""\n"
436	R"==(= (USE_ONLY_C_BLOCK ? offset_index * SUB_GROUP_SIZE )==""\n"
437	R"==(: local_c_block_index * SUB_GROUP_SIZE); )==""\n"
438	R"==(#if VECT_DT_N == 1 )==""\n"
439	R"==(ret = read_c_block(ptr + ptr_offset, c + c_off); )==""\n"
440	R"==(#else )==""\n"
441	R"==(ret[i] = read_c_block(ptr + ptr_offset, c + c_off); )==""\n"
442	R"==(#endif )==""\n"
443	R"==(} )==""\n"
444	R"==(#if VECT_DT_N > 1 )==""\n"
445	R"==(for (int i = CALC_VECT_LEN(); i < VECT_DT_N; ++i) { )==""\n"
446	R"==(ret[i] = 0; )==""\n"
447	R"==(} )==""\n"
448	R"==(#endif )==""\n"
449	R"==(return ret; )==""\n"
450	R"==(} )==""\n"
451	R"==(} )==""\n"
// read_c_block_int: int counterpart of read_c_block.
// When C_W_PADDING is not a multiple of SUB_GROUP_SIZE, the value is read as
// ptr[local_id] only while local_id is below C_WO_PADDING - c; other callers
// get 0. Otherwise the value comes from intel_sub_group_block_read on ptr
// viewed as uint, reinterpreted with as_int.
452	R"==(inline int read_c_block_int(const __global int *ptr, int c) { )==""\n"
453	R"==(#if C_W_PADDING % SUB_GROUP_SIZE != 0 )==""\n"
454	R"==(int local_id = get_sub_group_local_id(); )==""\n"
455	R"==(int tail = C_WO_PADDING - c; )==""\n"
456	R"==(return (local_id < tail) ? ptr[local_id] : 0; )==""\n"
457	R"==(#else )==""\n"
458	R"==(return as_int(intel_sub_group_block_read((const __global uint *)ptr)); )==""\n"
459	R"==(#endif )==""\n"
460	R"==(} )==""\n"
// read_vect_c_block_int: int counterpart of read_vect_c_block; loads vector
// number idx (VECT_DT_N ints) relative to ptr for the channel position c.
// Returns 0 when idx is at or beyond NVECT.
// Fast path: if blocks_stride equals chunks_per_block * SUB_GROUP_SIZE and
// C_WO_PADDING is a multiple of that product, one VECT_UINT_READ is issued on
// ptr cast to a __global uint pointer, at element offset
// idx * VECT_DT_N * SUB_GROUP_SIZE (same addressing as read_vect_c_block).
// Slow path: all VECT_DT_N elements are read individually through
// read_c_block_int.
461	R"==(inline VECT_INT_T read_vect_c_block_int(int idx, const __global int *ptr, int c, )==""\n"
462	R"==(int blocks_stride, int chunks_per_block) { )==""\n"
463	R"==(if (idx >= NVECT) return 0; )==""\n"
464	R"==(if ((blocks_stride == chunks_per_block * SUB_GROUP_SIZE) )==""\n"
465	R"==(&& (C_WO_PADDING % (chunks_per_block * SUB_GROUP_SIZE) == 0)) { )==""\n"
466	R"==(return AS_VECT_INT_T(VECT_UINT_READ( )==""\n"
467	R"==((const __global uint *)ptr + idx * VECT_DT_N * SUB_GROUP_SIZE)); )==""\n"
468	R"==(} else { )==""\n"
469	R"==(VECT_INT_T ret; )==""\n"
470	R"==(for (int i = 0; i < VECT_DT_N; i++) { )==""\n"
// offset_index is the chunk number counted across all vectors. It is split
// into a chunk position inside a block and a block number, which are scaled by
// SUB_GROUP_SIZE and blocks_stride to form the pointer offset.
471	R"==(const int offset_index = (idx * VECT_DT_N + i); )==""\n"
472	R"==(const int local_c_block_index = offset_index % chunks_per_block; )==""\n"
473	R"==(const int global_c_block_index = offset_index / chunks_per_block; )==""\n"
474	R"==(const int ptr_offset = local_c_block_index * SUB_GROUP_SIZE )==""\n"
475	R"==(+ global_c_block_index * blocks_stride; )==""\n"
// Channel offset handed to read_c_block_int for its tail check.
476	R"==(const int c_off )==""\n"
477	R"==(= (USE_ONLY_C_BLOCK ? offset_index * SUB_GROUP_SIZE )==""\n"
478	R"==(: local_c_block_index * SUB_GROUP_SIZE); )==""\n"
479	R"==(#if VECT_DT_N == 1 )==""\n"
480	R"==(ret = read_c_block_int(ptr + ptr_offset, c + c_off); )==""\n"
481	R"==(#else )==""\n"
482	R"==(ret[i] = read_c_block_int(ptr + ptr_offset, c + c_off); )==""\n"
483	R"==(#endif )==""\n"
484	R"==(} )==""\n"
485	R"==(return ret; )==""\n"
486	R"==(} )==""\n"
487	R"==(} )==""\n"
// write_c_block: stores one DATA_T value per caller to ptr for the channel
// chunk that starts at channel c.
// When C_W_PADDING is not a multiple of SUB_GROUP_SIZE, only callers with
// local_id below C_WO_PADDING - c store, each to ptr[local_id].
// Otherwise the store is a BLOCK_WRITE: if C_WO_PADDING is not a multiple of
// SUB_GROUP_SIZE, values in the range between C_WO_PADDING - c and
// C_W_PADDING - c are replaced by 0 first, and a chunk that starts at or
// beyond C_WO_PADDING is written as DATA_ZERO.
488	R"==(inline void write_c_block(__global DATA_T *ptr, int c, DATA_T value) { )==""\n"
489	R"==(#if C_W_PADDING % SUB_GROUP_SIZE != 0 )==""\n"
490	R"==(int local_id = get_sub_group_local_id(); )==""\n"
491	R"==(int tail = C_WO_PADDING - c; )==""\n"
492	R"==(if (local_id < tail) ptr[local_id] = value; )==""\n"
493	R"==(#else )==""\n"
494	R"==(#if C_WO_PADDING % SUB_GROUP_SIZE != 0 )==""\n"
495	R"==(int local_id = get_sub_group_local_id(); )==""\n"
496	R"==(if (local_id >= C_WO_PADDING - c && local_id < C_W_PADDING - c) value = 0; )==""\n"
497	R"==(#endif )==""\n"
498	R"==(if (c >= C_WO_PADDING) { )==""\n"
499	R"==(BLOCK_WRITE((__global BLOCK_DATA_T *)ptr, )==""\n"
500	R"==(AS_BLOCK_DATA_T(CONVERT_DATA_T(DATA_ZERO))); )==""\n"
501	R"==(return; )==""\n"
502	R"==(} )==""\n"
503	R"==(BLOCK_WRITE((__global BLOCK_DATA_T *)ptr, AS_BLOCK_DATA_T(value)); )==""\n"
504	R"==(#endif )==""\n"
505	R"==(} )==""\n"
// write_vect_c_block: stores vector number idx (VECT_DT_N elements of block)
// relative to ptr for the channel position c.
// Does nothing when idx is at or beyond NVECT.
// Fast path: if blocks_stride equals chunks_per_block * SUB_GROUP_SIZE and
// C_WO_PADDING is a multiple of that product, one VECT_BLOCK_WRITE is issued
// on ptr cast to a __global BLOCK_DATA_T pointer, at element offset
// idx * VECT_DT_N * SUB_GROUP_SIZE (same addressing as read_vect_c_block).
// Slow path: every element is stored individually through write_c_block.
506	R"==(inline void write_vect_c_block(int idx, __global DATA_T *ptr, int c, )==""\n"
507	R"==(int blocks_stride, int chunks_per_block, VECT_DATA_T block) { )==""\n"
508	R"==(if (idx >= NVECT) return; )==""\n"
509	R"==(if ((blocks_stride == chunks_per_block * SUB_GROUP_SIZE) )==""\n"
510	R"==(&& (C_WO_PADDING % (chunks_per_block * SUB_GROUP_SIZE) == 0)) { )==""\n"
511	R"==(VECT_BLOCK_WRITE( )==""\n"
512	R"==((__global BLOCK_DATA_T *)ptr + idx * VECT_DT_N * SUB_GROUP_SIZE, )==""\n"
513	R"==(AS_VECT_BLOCK_DATA_T(block)); )==""\n"
514	R"==(} else { )==""\n"
515	R"==(for (int i = 0; i < VECT_DT_N; i++) { )==""\n"
// offset_index is the chunk number counted across all vectors. It is split
// into a chunk position inside a block and a block number, which are scaled by
// SUB_GROUP_SIZE and blocks_stride to form the pointer offset.
516	R"==(const int offset_index = (idx * VECT_DT_N + i); )==""\n"
517	R"==(const int local_c_block_index = offset_index % chunks_per_block; )==""\n"
518	R"==(const int global_c_block_index = offset_index / chunks_per_block; )==""\n"
519	R"==(const int ptr_offset = local_c_block_index * SUB_GROUP_SIZE )==""\n"
520	R"==(+ global_c_block_index * blocks_stride; )==""\n"
// Channel offset handed to write_c_block for its tail handling.
521	R"==(const int c_off )==""\n"
522	R"==(= (USE_ONLY_C_BLOCK ? offset_index * SUB_GROUP_SIZE )==""\n"
523	R"==(: local_c_block_index * SUB_GROUP_SIZE); )==""\n"
524	R"==(#if VECT_DT_N == 1 )==""\n"
525	R"==(write_c_block(ptr + ptr_offset, c + c_off, block); )==""\n"
526	R"==(#else )==""\n"
527	R"==(write_c_block(ptr + ptr_offset, c + c_off, block[i]); )==""\n"
528	R"==(#endif )==""\n"
529	R"==(} )==""\n"
530	R"==(} )==""\n"
531	R"==(} )==""\n"
// write_c_block_int: int counterpart of write_c_block.
// The branch here is selected on C_WO_PADDING (write_c_block selects on
// C_W_PADDING). When C_WO_PADDING is not a multiple of SUB_GROUP_SIZE, callers
// with local_id below C_WO_PADDING - c store value to ptr[local_id], callers
// below C_W_PADDING - c store 0, and the rest store nothing.
// Otherwise the store is intel_sub_group_block_write on ptr viewed as uint,
// writing 0 for a chunk that starts at or beyond C_WO_PADDING.
532	R"==(inline void write_c_block_int(__global int *ptr, int c, int value) { )==""\n"
533	R"==(#if C_WO_PADDING % SUB_GROUP_SIZE != 0 )==""\n"
534	R"==(int local_id = get_sub_group_local_id(); )==""\n"
535	R"==(int tail = C_WO_PADDING - c; )==""\n"
536	R"==(if (local_id < tail) )==""\n"
537	R"==(ptr[local_id] = value; )==""\n"
538	R"==(else if (local_id < C_W_PADDING - c) { )==""\n"
539	R"==(ptr[local_id] = 0; )==""\n"
540	R"==(} else )==""\n"
541	R"==(return; )==""\n"
542	R"==(#else )==""\n"
543	R"==(if (c >= C_WO_PADDING) { )==""\n"
544	R"==(intel_sub_group_block_write((__global uint *)ptr, 0); )==""\n"
545	R"==(return; )==""\n"
546	R"==(} )==""\n"
547	R"==(intel_sub_group_block_write((__global uint *)ptr, as_uint(value)); )==""\n"
548	R"==(#endif )==""\n"
549	R"==(} )==""\n"
// write_vect_c_block_int: int counterpart of write_vect_c_block; stores vector
// number idx (VECT_DT_N ints of block) relative to ptr for channel position c.
// Does nothing when idx is at or beyond NVECT.
// Fast path: if blocks_stride equals chunks_per_block * SUB_GROUP_SIZE and
// C_WO_PADDING is a multiple of that product, one VECT_UINT_WRITE is issued on
// ptr cast to a __global uint pointer, at element offset
// idx * VECT_DT_N * SUB_GROUP_SIZE (same addressing as read_vect_c_block).
// Slow path: every element is stored individually through write_c_block_int.
550	R"==(inline void write_vect_c_block_int(int idx, __global int *ptr, int c, )==""\n"
551	R"==(int blocks_stride, int chunks_per_block, VECT_INT_T block) { )==""\n"
552	R"==(if (idx >= NVECT) return; )==""\n"
553	R"==(if ((blocks_stride == chunks_per_block * SUB_GROUP_SIZE) )==""\n"
554	R"==(&& (C_WO_PADDING % (chunks_per_block * SUB_GROUP_SIZE) == 0)) { )==""\n"
555	R"==(VECT_UINT_WRITE((__global uint *)ptr + idx * VECT_DT_N * SUB_GROUP_SIZE, )==""\n"
556	R"==(AS_VECT_UINT_T(block)); )==""\n"
557	R"==(} else { )==""\n"
558	R"==(for (int i = 0; i < VECT_DT_N; i++) { )==""\n"
// offset_index is the chunk number counted across all vectors. It is split
// into a chunk position inside a block and a block number, which are scaled by
// SUB_GROUP_SIZE and blocks_stride to form the pointer offset.
559	R"==(const int offset_index = (idx * VECT_DT_N + i); )==""\n"
560	R"==(const int local_c_block_index = offset_index % chunks_per_block; )==""\n"
561	R"==(const int global_c_block_index = offset_index / chunks_per_block; )==""\n"
562	R"==(const int ptr_offset = local_c_block_index * SUB_GROUP_SIZE )==""\n"
563	R"==(+ global_c_block_index * blocks_stride; )==""\n"
// Channel offset handed to write_c_block_int for its tail handling.
564	R"==(const int c_off )==""\n"
565	R"==(= (USE_ONLY_C_BLOCK ? offset_index * SUB_GROUP_SIZE )==""\n"
566	R"==(: local_c_block_index * SUB_GROUP_SIZE); )==""\n"
567	R"==(#if VECT_DT_N == 1 )==""\n"
568	R"==(write_c_block_int(ptr + ptr_offset, c + c_off, block); )==""\n"
569	R"==(#else )==""\n"
570	R"==(write_c_block_int(ptr + ptr_offset, c + c_off, block[i]); )==""\n"
571	R"==(#endif )==""\n"
572	R"==(} )==""\n"
573	R"==(} )==""\n"
574	R"==(} )==""\n"
// Empty closing fragment: ends the concatenated kernel string and terminates
// the gen9_pooling_kernel declaration. The braces close the four namespaces.
575	R"==()==";
576	} // namespace ocl
577	} // namespace gpu
578	} // namespace impl
579	} // namespace dnnl

Browse the source code of oneDNN/build/src/gpu/ocl/gen9_pooling_kernel.cpp