1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *ref_layer_normalization_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
21R"==(#undef SRC_OFF )==""\n"
22R"==(#undef DST_OFF )==""\n"
23R"==(#define SRC_OFF(x0, x1, x2, x3, x4, x5) OFF_MD(SRC, x0, x1, x2, x3, x4, x5) )==""\n"
24R"==(#define DST_OFF(x0, x1, x2, x3, x4, x5) OFF_MD(DST, x0, x1, x2, x3, x4, x5) )==""\n"
25R"==(#define STAT_OFF(x0, x1, x2, x3, x4, x5) OFF_MD(STAT, x0, x1, x2, x3, x4, x5) )==""\n"
26R"==(#if IS_FWD )==""\n"
27R"==(#if VECTORIZE_CALC_STATS == 1 )==""\n"
28R"==(KERNEL_ATTR )==""\n"
29R"==(__kernel void ref_lnorm_fwd(__global DATA_T *src, __global float *mean, )==""\n"
30R"==(__global float *variance, __global DATA_T *dst, __global float *scale, )==""\n"
31R"==(__global float *shift, float eps) { )==""\n"
32R"==(int x[6] = {0}; )==""\n"
33R"==(x[0] = GWS_GET_X0(); )==""\n"
34R"==(x[1] = GWS_GET_X1(); )==""\n"
35R"==(x[2] = GWS_GET_X2(); )==""\n"
36R"==(x[3] = GWS_GET_X3(); )==""\n"
37R"==(if (x[0] >= DST_D0 || x[1] >= DST_D1 || x[2] >= DST_D2 || x[3] >= DST_D3) { )==""\n"
38R"==(int local_id = get_sub_group_local_id(); )==""\n"
39R"==(for (int c = 0; c < C; c += SUB_GROUP_SIZE) { )==""\n"
40R"==(x[NDIMS - 1] = c + local_id; )==""\n"
41R"==(int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
42R"==(dst[dst_off] = CONVERT_DATA_T(0.f); )==""\n"
43R"==(} )==""\n"
44R"==(return; )==""\n"
45R"==(} )==""\n"
46R"==(int s_off = STAT_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
47R"==(float v_mean = CALCULATE_STATS ? 0 : mean[s_off]; )==""\n"
48R"==(float v_variance = CALCULATE_STATS ? 0 : variance[s_off]; )==""\n"
49R"==(if (CALCULATE_STATS) { )==""\n"
50R"==(VECT_FLOAT_T v_acc = 0; )==""\n"
51R"==(for (int c = 0; c < C; c += SUB_GROUP_SIZE * VECT_DT_N) { )==""\n"
52R"==(x[NDIMS - 1] = c; )==""\n"
53R"==(int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
54R"==(v_acc += CONVERT_VECT_FLOAT_T(AS_VECT_DATA_T(VECT_BLOCK_READ( )==""\n"
55R"==((const __global BLOCK_DATA_T *)&src[src_off]))); )==""\n"
56R"==(} )==""\n"
57R"==(#if VECT_DT_N == 1 )==""\n"
58R"==(v_mean = v_acc; )==""\n"
59R"==(#else )==""\n"
60R"==(v_mean = 0; )==""\n"
61R"==(for (int i = 0; i < VECT_DT_N; ++i) { )==""\n"
62R"==(v_mean += v_acc[i]; )==""\n"
63R"==(} )==""\n"
64R"==(#endif )==""\n"
65R"==(float total_sum = sub_group_reduce_add(v_mean); )==""\n"
66R"==(v_mean = total_sum / C; )==""\n"
67R"==(v_acc = 0; )==""\n"
68R"==(VECT_FLOAT_T m = 0; )==""\n"
69R"==(for (int c = 0; c < C; c += SUB_GROUP_SIZE * VECT_DT_N) { )==""\n"
70R"==(x[NDIMS - 1] = c; )==""\n"
71R"==(int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
72R"==(m = CONVERT_VECT_FLOAT_T(AS_VECT_DATA_T(VECT_BLOCK_READ( )==""\n"
73R"==((const __global BLOCK_DATA_T *)&src[src_off]))); )==""\n"
74R"==(m -= v_mean; )==""\n"
75R"==(v_acc += m * m; )==""\n"
76R"==(} )==""\n"
77R"==(#if VECT_DT_N == 1 )==""\n"
78R"==(v_variance = v_acc; )==""\n"
79R"==(#else )==""\n"
80R"==(v_variance = 0; )==""\n"
81R"==(for (int i = 0; i < VECT_DT_N; ++i) { )==""\n"
82R"==(v_variance += v_acc[i]; )==""\n"
83R"==(} )==""\n"
84R"==(#endif )==""\n"
85R"==(total_sum = sub_group_reduce_add(v_variance); )==""\n"
86R"==(v_variance = total_sum / C; )==""\n"
87R"==(} )==""\n"
88R"==(float sqrt_variance = sqrt(v_variance + eps); )==""\n"
89R"==(int local_id = get_sub_group_local_id(); )==""\n"
90R"==(for (int c = 0; c < C; c += SUB_GROUP_SIZE) { )==""\n"
91R"==(float sm = (scale ? scale[c + local_id] : 1.0f) / sqrt_variance; )==""\n"
92R"==(float sv = shift ? shift[c + local_id] : 0.0f; )==""\n"
93R"==(x[NDIMS - 1] = c + local_id; )==""\n"
94R"==(int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
95R"==(int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
96R"==(dst[dst_off] = TO_DST(sm * (SRC_TO_REF(src[src_off]) - v_mean) + sv); )==""\n"
97R"==(} )==""\n"
98R"==(if (CALCULATE_STATS) { )==""\n"
99R"==(if (SAVE_STATS) { )==""\n"
100R"==(mean[s_off] = v_mean; )==""\n"
101R"==(variance[s_off] = v_variance; )==""\n"
102R"==(} )==""\n"
103R"==(} )==""\n"
104R"==(} )==""\n"
105R"==(#else )==""\n"
106R"==(KERNEL_ATTR )==""\n"
107R"==(__kernel void ref_lnorm_fwd(__global DATA_T *src, __global float *mean, )==""\n"
108R"==(__global float *variance, __global DATA_T *dst, __global float *scale, )==""\n"
109R"==(__global float *shift, float eps) { )==""\n"
110R"==(int x[6] = {0}; )==""\n"
111R"==(x[0] = GWS_GET_X0(); )==""\n"
112R"==(x[1] = GWS_GET_X1(); )==""\n"
113R"==(x[2] = GWS_GET_X2(); )==""\n"
114R"==(x[3] = GWS_GET_X3(); )==""\n"
115R"==(if (x[0] >= DST_D0 || x[1] >= DST_D1 || x[2] >= DST_D2 || x[3] >= DST_D3) { )==""\n"
116R"==(for (int c = 0; c < C; ++c) { )==""\n"
117R"==(x[NDIMS - 1] = c; )==""\n"
118R"==(int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
119R"==(dst[dst_off] = CONVERT_DATA_T(0.f); )==""\n"
120R"==(} )==""\n"
121R"==(return; )==""\n"
122R"==(} )==""\n"
123R"==(int s_off = STAT_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
124R"==(float v_mean = CALCULATE_STATS ? 0 : mean[s_off]; )==""\n"
125R"==(float v_variance = CALCULATE_STATS ? 0 : variance[s_off]; )==""\n"
126R"==(if (CALCULATE_STATS) { )==""\n"
127R"==(for (int c = 0; c < C; ++c) { )==""\n"
128R"==(x[NDIMS - 1] = c; )==""\n"
129R"==(int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
130R"==(v_mean += SRC_TO_REF(src[src_off]); )==""\n"
131R"==(} )==""\n"
132R"==(v_mean /= C; )==""\n"
133R"==(for (int c = 0; c < C; ++c) { )==""\n"
134R"==(x[NDIMS - 1] = c; )==""\n"
135R"==(int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
136R"==(float m = SRC_TO_REF(src[src_off]) - v_mean; )==""\n"
137R"==(v_variance += m * m; )==""\n"
138R"==(} )==""\n"
139R"==(v_variance /= C; )==""\n"
140R"==(} )==""\n"
141R"==(float sqrt_variance = sqrt(v_variance + eps); )==""\n"
142R"==(for (int c = 0; c < C; ++c) { )==""\n"
143R"==(float sm = (scale ? scale[c] : 1.0f) / sqrt_variance; )==""\n"
144R"==(float sv = shift ? shift[c] : 0.0f; )==""\n"
145R"==(x[NDIMS - 1] = c; )==""\n"
146R"==(int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
147R"==(int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
148R"==(dst[dst_off] = TO_DST(sm * (SRC_TO_REF(src[src_off]) - v_mean) + sv); )==""\n"
149R"==(} )==""\n"
150R"==(if (CALCULATE_STATS) { )==""\n"
151R"==(if (SAVE_STATS) { )==""\n"
152R"==(mean[s_off] = v_mean; )==""\n"
153R"==(variance[s_off] = v_variance; )==""\n"
154R"==(} )==""\n"
155R"==(} )==""\n"
156R"==(} )==""\n"
157R"==(#endif )==""\n"
158R"==(#endif )==""\n"
159R"==(#if IS_BWD )==""\n"
160R"==(#if USE_SCALE || USE_SHIFT )==""\n"
161R"==(#if VECTORIZE_BWD_SCALESHIFT )==""\n"
162R"==(#if VECTOR_SIZE_SCALESHIFT == 1 )==""\n"
163R"==(#define VECTORIZED_VERSION(x) x )==""\n"
164R"==(#define vector_load(x) (x); )==""\n"
165R"==(#else )==""\n"
166R"==(#define VECTORIZED_VERSION(x) CONCAT2(x, VECTOR_SIZE_SCALESHIFT) )==""\n"
167R"==(#define vector_load(x) CONCAT2(vload, VECTOR_SIZE_SCALESHIFT)(0, &x); )==""\n"
168R"==(#endif )==""\n"
169R"==(#if DT_BF16 == 1 )==""\n"
170R"==(#define convert_vector_to_float cvt_bf16_to_f32 )==""\n"
171R"==(#else )==""\n"
172R"==(#define convert_vector_to_float VECTORIZED_VERSION(convert_float) )==""\n"
173R"==(#endif )==""\n"
174R"==(#define as_vector_data_t VECTORIZED_VERSION(AS_DATA_T) )==""\n"
175R"==(#define sub_group_read VECTORIZED_VERSION(BLOCK_READ) )==""\n"
176R"==(#define vector_float VECTORIZED_VERSION(float) )==""\n"
177R"==(NAMED_KERNEL_ATTR(SCALESHIFT) )==""\n"
178R"==(__kernel void ref_lnorm_bwd_scaleshift(__global DATA_T *src, )==""\n"
179R"==(__global float *mean, __global float *variance, )==""\n"
180R"==(__global DATA_T *diff_dst, __global float *diff_scale, )==""\n"
181R"==(__global float *diff_shift, float eps) { )==""\n"
182R"==(const int c = GWS_GET_C(); )==""\n"
183R"==(const int n_chunk_idx = GWS_GET_N(); )==""\n"
184R"==(const int n_start = n_chunk_idx * N_CHUNK_SIZE; )==""\n"
185R"==(const int n_end = n_start + N_CHUNK_SIZE; )==""\n"
186R"==(const int shift_off = N_CHUNKS * C; )==""\n"
187R"==(diff_shift += shift_off; )==""\n"
188R"==(vector_float diff_gamma_vect = 0; )==""\n"
189R"==(vector_float diff_beta_vect = 0; )==""\n"
190R"==(for (int n_off = n_start; n_off < n_end; n_off += VECTOR_SIZE_SCALESHIFT) { )==""\n"
191R"==(const vector_float mean_vect = vector_load(mean[n_off]); )==""\n"
192R"==(const vector_float variance_vect = vector_load(variance[n_off]); )==""\n"
193R"==(const vector_float inv_sqrt_variance = 1.0f / sqrt(variance_vect + eps); )==""\n"
194R"==(#if NDIMS == 2 )==""\n"
195R"==(const int src_off = SRC_OFF(n_off, c, 0, 0, 0, 0); )==""\n"
196R"==(const int dst_off = DST_OFF(n_off, c, 0, 0, 0, 0); )==""\n"
197R"==(#else )==""\n"
198R"==(const int src_off = SRC_OFF(0, n_off, c, 0, 0, 0); )==""\n"
199R"==(const int dst_off = DST_OFF(0, n_off, c, 0, 0, 0); )==""\n"
200R"==(#endif )==""\n"
201R"==(const vector_float src_vect = convert_vector_to_float(as_vector_data_t( )==""\n"
202R"==(sub_group_read((const __global BLOCK_DATA_T *)&src[src_off]))); )==""\n"
203R"==(const vector_float diff_dst_vect )==""\n"
204R"==(= convert_vector_to_float(as_vector_data_t(sub_group_read( )==""\n"
205R"==((const __global BLOCK_DATA_T *)&diff_dst[dst_off]))); )==""\n"
206R"==(diff_gamma_vect )==""\n"
207R"==(+= (src_vect - mean_vect) * diff_dst_vect * inv_sqrt_variance; )==""\n"
208R"==(diff_beta_vect += diff_dst_vect; )==""\n"
209R"==(} )==""\n"
210R"==(float diff_gamma = 0, diff_beta = 0; )==""\n"
211R"==(#if VECTOR_SIZE_SCALESHIFT == 1 )==""\n"
212R"==(diff_gamma = diff_gamma_vect; )==""\n"
213R"==(diff_beta = diff_beta_vect; )==""\n"
214R"==(#else )==""\n"
215R"==(for (int elem_idx = 0; elem_idx < VECTOR_SIZE_SCALESHIFT; elem_idx++) { )==""\n"
216R"==(diff_gamma += diff_gamma_vect[elem_idx]; )==""\n"
217R"==(diff_beta += diff_beta_vect[elem_idx]; )==""\n"
218R"==(} )==""\n"
219R"==(#endif )==""\n"
220R"==(const int result_offset = n_chunk_idx * C + c; )==""\n"
221R"==(if (USE_SCALE) )==""\n"
222R"==(intel_sub_group_block_write((__global uint *)&diff_scale[result_offset], )==""\n"
223R"==(as_uint(diff_gamma)); )==""\n"
224R"==(if (USE_SHIFT) )==""\n"
225R"==(intel_sub_group_block_write((__global uint *)&diff_shift[result_offset], )==""\n"
226R"==(as_uint(diff_beta)); )==""\n"
227R"==(} )==""\n"
228R"==(NAMED_KERNEL_ATTR(SCALESHIFT_FINALIZE) )==""\n"
229R"==(__kernel void ref_lnorm_bwd_scaleshift_final(__global float *tmp_reduce_mem, )==""\n"
230R"==(__global float *diff_scale, __global float *diff_shift) { )==""\n"
231R"==(const int c = GWS_GET_C_finalize(); )==""\n"
232R"==(const int diff_shift_off = N_CHUNKS * C; )==""\n"
233R"==(__global float *tmp_diff_scale = tmp_reduce_mem; )==""\n"
234R"==(__global float *tmp_diff_shift = tmp_reduce_mem + diff_shift_off; )==""\n"
235R"==(float diff_gamma = 0; )==""\n"
236R"==(float diff_beta = 0; )==""\n"
237R"==(for (int n_chunk_idx = 0; n_chunk_idx < N_CHUNKS; n_chunk_idx++) { )==""\n"
238R"==(const int result_off = n_chunk_idx * C + c; )==""\n"
239R"==(diff_gamma += tmp_diff_scale[result_off]; )==""\n"
240R"==(diff_beta += tmp_diff_shift[result_off]; )==""\n"
241R"==(} )==""\n"
242R"==(if (diff_scale) diff_scale[c] = diff_gamma; )==""\n"
243R"==(if (diff_shift) diff_shift[c] = diff_beta; )==""\n"
244R"==(} )==""\n"
245R"==(#else )==""\n"
246R"==(NAMED_KERNEL_ATTR(SCALESHIFT) )==""\n"
247R"==(__kernel void ref_lnorm_bwd_scaleshift(__global DATA_T *src, )==""\n"
248R"==(__global float *mean, __global float *variance, )==""\n"
249R"==(__global DATA_T *diff_dst, __global float *diff_scale, )==""\n"
250R"==(__global float *diff_shift, float eps) { )==""\n"
251R"==(const int c = GWS_GET_C(); )==""\n"
252R"==(int x[6] = {0}; )==""\n"
253R"==(float diff_gamma = 0; )==""\n"
254R"==(float diff_beta = 0; )==""\n"
255R"==(for (x[0] = 0; x[0] < max(1, STAT_D0); ++x[0]) { )==""\n"
256R"==(for (x[1] = 0; x[1] < max(1, STAT_D1); ++x[1]) { )==""\n"
257R"==(for (x[2] = 0; x[2] < max(1, STAT_D2); ++x[2]) { )==""\n"
258R"==(for (x[3] = 0; x[3] < max(1, STAT_D3); ++x[3]) { )==""\n"
259R"==(x[NDIMS - 1] = 0; )==""\n"
260R"==(const int s_off )==""\n"
261R"==(= STAT_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
262R"==(x[NDIMS - 1] = c; )==""\n"
263R"==(const int src_off )==""\n"
264R"==(= SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
265R"==(const int dst_off )==""\n"
266R"==(= DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
267R"==(const float inv_sqrt_variance )==""\n"
268R"==(= 1.0f / sqrt(variance[s_off] + eps); )==""\n"
269R"==(const float dd = DST_TO_REF(diff_dst[dst_off]); )==""\n"
270R"==(diff_gamma += (SRC_TO_REF(src[src_off]) - mean[s_off]) * dd )==""\n"
271R"==(* inv_sqrt_variance; )==""\n"
272R"==(diff_beta += dd; )==""\n"
273R"==(} )==""\n"
274R"==(} )==""\n"
275R"==(} )==""\n"
276R"==(} )==""\n"
277R"==(if (diff_scale) diff_scale[c] = diff_gamma; )==""\n"
278R"==(if (diff_shift) diff_shift[c] = diff_beta; )==""\n"
279R"==(} )==""\n"
280R"==(#endif )==""\n"
281R"==(#endif )==""\n"
282R"==(#if VECTORIZE_BWD )==""\n"
283R"==(KERNEL_ATTR )==""\n"
284R"==(__kernel void ref_lnorm_bwd(__global DATA_T *src, __global float *mean, )==""\n"
285R"==(__global float *variance, __global DATA_T *diff_dst, )==""\n"
286R"==(__global float *scale, __global DATA_T *diff_src, float eps) { )==""\n"
287R"==(int x[6] = {0}; )==""\n"
288R"==(x[0] = GWS_GET_X0(); )==""\n"
289R"==(x[1] = GWS_GET_X1(); )==""\n"
290R"==(x[2] = GWS_GET_X2(); )==""\n"
291R"==(x[3] = GWS_GET_X3(); )==""\n"
292R"==(const int s_off = STAT_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
293R"==(const float mean_val = mean[s_off]; )==""\n"
294R"==(const float inv_sqrt_variance = 1.0f / sqrt(variance[s_off] + eps); )==""\n"
295R"==(float dd_gamma = 0, dd_gamma_x = 0; )==""\n"
296R"==(VECT_FLOAT_T dd_gamma_vect = 0; )==""\n"
297R"==(VECT_FLOAT_T dd_gamma_x_vect = 0; )==""\n"
298R"==(if (CALCULATE_STATS) { )==""\n"
299R"==(for (int c = 0; c < C; c += VECT_DT_N * SUB_GROUP_SIZE) { )==""\n"
300R"==(VECT_FLOAT_T gamma = 1.0f; )==""\n"
301R"==(if (scale) { )==""\n"
302R"==(gamma = AS_VECT_FLOAT_T( )==""\n"
303R"==(VECT_UINT_READ((const __global uint *)&scale[c])); )==""\n"
304R"==(} )==""\n"
305R"==(x[NDIMS - 1] = c; )==""\n"
306R"==(const int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
307R"==(const int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
308R"==(const VECT_FLOAT_T src_vect )==""\n"
309R"==(= CONVERT_VECT_FLOAT_T(AS_VECT_DATA_T(VECT_BLOCK_READ( )==""\n"
310R"==((const __global BLOCK_DATA_T *)&src[src_off]))); )==""\n"
311R"==(const VECT_FLOAT_T dst_vect )==""\n"
312R"==(= CONVERT_VECT_FLOAT_T(AS_VECT_DATA_T(VECT_BLOCK_READ(( )==""\n"
313R"==(const __global BLOCK_DATA_T *)&diff_dst[dst_off]))); )==""\n"
314R"==(dd_gamma_vect += dst_vect * gamma; )==""\n"
315R"==(dd_gamma_x_vect += dst_vect * gamma * (src_vect - mean_val); )==""\n"
316R"==(} )==""\n"
317R"==(#if VECT_DT_N == 1 )==""\n"
318R"==(dd_gamma = dd_gamma_vect; )==""\n"
319R"==(dd_gamma_x = dd_gamma_x_vect; )==""\n"
320R"==(#else )==""\n"
321R"==(for (int i = 0; i < VECT_DT_N; ++i) { )==""\n"
322R"==(dd_gamma += dd_gamma_vect[i]; )==""\n"
323R"==(dd_gamma_x += dd_gamma_x_vect[i]; )==""\n"
324R"==(} )==""\n"
325R"==(#endif )==""\n"
326R"==(dd_gamma = sub_group_reduce_add(dd_gamma); )==""\n"
327R"==(dd_gamma_x = sub_group_reduce_add(dd_gamma_x); )==""\n"
328R"==(dd_gamma_x *= inv_sqrt_variance; )==""\n"
329R"==(} )==""\n"
330R"==(for (int c = 0; c < C; c += VECT_DT_N * SUB_GROUP_SIZE) { )==""\n"
331R"==(VECT_FLOAT_T gamma = 1.0f; )==""\n"
332R"==(if (scale) { )==""\n"
333R"==(gamma = AS_VECT_FLOAT_T( )==""\n"
334R"==(VECT_UINT_READ((const __global uint *)&scale[c])); )==""\n"
335R"==(} )==""\n"
336R"==(x[NDIMS - 1] = c; )==""\n"
337R"==(const int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
338R"==(const int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
339R"==(const VECT_FLOAT_T src_vect = CONVERT_VECT_FLOAT_T(AS_VECT_DATA_T( )==""\n"
340R"==(VECT_BLOCK_READ((const __global BLOCK_DATA_T *)&src[src_off]))); )==""\n"
341R"==(VECT_FLOAT_T v_diff_src_vect )==""\n"
342R"==(= CONVERT_VECT_FLOAT_T(AS_VECT_DATA_T(VECT_BLOCK_READ( )==""\n"
343R"==((const __global BLOCK_DATA_T *)&diff_dst[dst_off]))); )==""\n"
344R"==(v_diff_src_vect *= gamma; )==""\n"
345R"==(if (CALCULATE_STATS) { )==""\n"
346R"==(v_diff_src_vect -= dd_gamma / C )==""\n"
347R"==(+ (src_vect - mean_val) * dd_gamma_x * inv_sqrt_variance )==""\n"
348R"==(/ C; )==""\n"
349R"==(} )==""\n"
350R"==(v_diff_src_vect *= inv_sqrt_variance; )==""\n"
351R"==(VECT_BLOCK_WRITE((__global BLOCK_DATA_T *)&diff_src[src_off], )==""\n"
352R"==(AS_VECT_BLOCK_DATA_T(CONVERT_VECTOR_DATA_T(v_diff_src_vect))); )==""\n"
353R"==(} )==""\n"
354R"==(} )==""\n"
355R"==(#else )==""\n"
356R"==(KERNEL_ATTR )==""\n"
357R"==(__kernel void ref_lnorm_bwd(__global DATA_T *src, __global float *mean, )==""\n"
358R"==(__global float *variance, __global DATA_T *diff_dst, )==""\n"
359R"==(__global float *scale, __global DATA_T *diff_src, float eps) { )==""\n"
360R"==(int x[6] = {0}; )==""\n"
361R"==(x[0] = GWS_GET_X0(); )==""\n"
362R"==(x[1] = GWS_GET_X1(); )==""\n"
363R"==(x[2] = GWS_GET_X2(); )==""\n"
364R"==(x[3] = GWS_GET_X3(); )==""\n"
365R"==(const int s_off = STAT_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
366R"==(const float mean_val = mean[s_off]; )==""\n"
367R"==(const float inv_sqrt_variance = 1.0f / sqrt(variance[s_off] + eps); )==""\n"
368R"==(float dd_gamma = 0; )==""\n"
369R"==(float dd_gamma_x = 0; )==""\n"
370R"==(if (CALCULATE_STATS) { )==""\n"
371R"==(for (int c = 0; c < C; ++c) { )==""\n"
372R"==(const float gamma = scale ? scale[c] : 1.0f; )==""\n"
373R"==(x[NDIMS - 1] = c; )==""\n"
374R"==(const int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
375R"==(const int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
376R"==(const float dd = DST_TO_REF(diff_dst[dst_off]); )==""\n"
377R"==(dd_gamma += dd * gamma; )==""\n"
378R"==(dd_gamma_x += dd * gamma * (SRC_TO_REF(src[src_off]) - mean_val); )==""\n"
379R"==(} )==""\n"
380R"==(dd_gamma_x *= inv_sqrt_variance; )==""\n"
381R"==(} )==""\n"
382R"==(for (int c = 0; c < C; ++c) { )==""\n"
383R"==(const float gamma = scale ? scale[c] : 1.0f; )==""\n"
384R"==(x[NDIMS - 1] = c; )==""\n"
385R"==(const int src_off = SRC_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
386R"==(const int dst_off = DST_OFF(x[0], x[1], x[2], x[3], x[4], x[5]); )==""\n"
387R"==(float v_diff_src = DST_TO_REF(diff_dst[dst_off]) * gamma; )==""\n"
388R"==(if (CALCULATE_STATS) { )==""\n"
389R"==(v_diff_src -= dd_gamma / C )==""\n"
390R"==(+ (SRC_TO_REF(src[src_off]) - mean_val) * dd_gamma_x )==""\n"
391R"==(* inv_sqrt_variance / C; )==""\n"
392R"==(} )==""\n"
393R"==(v_diff_src *= inv_sqrt_variance; )==""\n"
394R"==(diff_src[src_off] = TO_SRC(v_diff_src); )==""\n"
395R"==(} )==""\n"
396R"==(} )==""\n"
397R"==(#endif )==""\n"
398R"==(#endif )==""\n"
399R"==()==";
400}
401}
402}
403}