/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>
#include <math.h>

#include "common/c_types_map.hpp"
#include "common/dnnl_thread.hpp"
#include "common/reorder.hpp"
#include "common/type_helpers.hpp"

#include "cpu/cpu_primitive.hpp"
#include "cpu/ref_io_helper.hpp"

#include "cpu/simple_layer_normalization.hpp"

namespace dnnl {
namespace impl {
namespace cpu {

using namespace memory_tracking::names;
using namespace data_type;

status_t simple_layer_normalization_fwd_t::pd_t::init(engine_t *engine) {
    using namespace data_type;
    using skip_mask_t = primitive_attr_t::skip_mask_t;
    const memory_desc_wrapper src_d(src_md());

    const bool ok = is_fwd() && !has_zero_dim_memory()
            && utils::one_of(src_md()->data_type, f32, bf16, f16, s8, u8)
            && utils::one_of(dst_md()->data_type, f32, bf16, f16, s8, u8)
            && platform::has_data_type_support(src_md()->data_type)
            && platform::has_data_type_support(dst_md()->data_type)
            && stat_md()->data_type == f32 && check_scale_shift_data_type()
            && attr()->has_default_values(skip_mask_t::scales_runtime)
            && attr_scales_ok() && set_default_formats_common()
            && src_d.is_blocking_desc()
            // plain format, last logical dim is last physical
            && src_d.blocking_desc().strides[ndims() - 1] == 1;
    if (!ok) return status::unimplemented;

    CHECK(fill_compatible_stats_md(*src_md(), reordered_stat_md_));

    if (reordered_stat_md_ != *stat_md() && !stats_are_tmp()) {
        CHECK(reorder_primitive_desc_create(reorder_pd_, engine,
                stats_are_src() ? stat_md() : &reordered_stat_md_,
                stats_are_src() ? &reordered_stat_md_ : stat_md()));
    }

    init_scratchpad();
    return status::success;
}

status_t simple_layer_normalization_fwd_t::execute_forward(
        const exec_ctx_t &ctx) const {
    const bool use_scale = pd()->use_scale();
    const bool use_shift = pd()->use_shift();

    auto scratchpad = ctx.get_scratchpad_grantor();
    const auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
    auto dst = CTX_OUT_MEM(void *, DNNL_ARG_DST);

    auto scale = CTX_IN_MEM(const float *, DNNL_ARG_SCALE);
    auto shift = CTX_IN_MEM(const float *, DNNL_ARG_SHIFT);

    float *mean, *variance;
    if (pd()->use_tmp_stats()) {
        mean = scratchpad.template get<float>(key_lnorm_tmp_mean);
        variance = scratchpad.template get<float>(key_lnorm_tmp_var);
    } else {
        mean = pd()->stats_are_src()
                ? const_cast<float *>(CTX_IN_MEM(const float *, DNNL_ARG_MEAN))
                : CTX_OUT_MEM(float *, DNNL_ARG_MEAN);
        variance = pd()->stats_are_src()
                ? const_cast<float *>(
                        CTX_IN_MEM(const float *, DNNL_ARG_VARIANCE))
                : CTX_OUT_MEM(float *, DNNL_ARG_VARIANCE);
    }

    DEFINE_ARG_SCALES_BUFFER(src_scales, DNNL_ARG_SRC);
    DEFINE_ARG_SCALES_BUFFER(dst_scales, DNNL_ARG_DST);

    const memory_desc_wrapper src_d(pd()->src_md());
    const memory_desc_wrapper dst_d(pd()->dst_md());

    const dim_t N = pd()->across_axis();
    const dim_t C = pd()->norm_axis();
    const dim_t C_padded = src_d.padded_dims()[pd()->ndims() - 1];

    const auto calculate_stats = !pd()->stats_are_src();
    const auto src_dt = pd()->src_md()->data_type;
    const auto dst_dt = pd()->dst_md()->data_type;
    const auto eps = pd()->desc()->layer_norm_epsilon;
    const auto save_stats = pd()->is_training();

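    // The tensor is viewed as [N, C]: each of the N rows is normalized
    // independently across its C elements,
    //   dst[n, c] = scale[c] * (src[n, c] - mean[n]) / sqrt(var[n] + eps)
    //             + shift[c],
    // optionally followed by the src/dst quantization scales.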
    parallel(0, [&](const int ithr, const int nthr) {
        dim_t N_start = 0, N_end = 0;
        balance211(N, nthr, ithr, N_start, N_end);
        const char *const __restrict src_ptr
                = reinterpret_cast<const char *>(src)
                + N_start * C_padded * src_d.data_type_size();
        char *const __restrict dst_ptr = reinterpret_cast<char *>(dst)
                + N_start * C_padded * dst_d.data_type_size();
        float *const __restrict mean_ptr = &mean[N_start];
        float *const __restrict var_ptr = &variance[N_start];
        const size_t block_size = N_end - N_start;
        // Note: manual unrolling for scale and shift due to clang issue.
        // see: CLANG_WA_01_SAFE_TO_USE_OMP_SIMD
        for (size_t offset = 0; offset < block_size; offset++) {
            float v_mean = 0, v_variance = 0;
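            // Two-pass statistics: first reduce the row to its mean, then
            // reduce the squared deviations from that mean to the variance.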
            if (calculate_stats) {
                PRAGMA_OMP_SIMD(reduction(+ : v_mean))
                for (dim_t c = 0; c < C; ++c) {
                    float s = io::load_float_value(
                            src_dt, src_ptr, c + C * offset);
                    v_mean += s;
                }
                v_mean /= C;

                PRAGMA_OMP_SIMD(reduction(+ : v_variance))
                for (dim_t c = 0; c < C; ++c) {
                    float s = io::load_float_value(
                            src_dt, src_ptr, c + C * offset);
                    float src_sub_mean = s - v_mean;
                    v_variance += src_sub_mean * src_sub_mean;
                }
                v_variance /= C;
            } else {
                v_mean = mean_ptr[offset];
                v_variance = var_ptr[offset];
            }

            const float inv_sqrtvar = 1.f / sqrtf(v_variance + eps);
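            // Fold 1/sqrt(var + eps) into the per-channel multiplier so the
            // inner loops reduce to one multiply-add per element.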
            if (use_scale && use_shift) {
                PRAGMA_OMP_SIMD()
                for (dim_t c = 0; c < C; ++c) {
                    const float sm = scale[c] * inv_sqrtvar;
                    const float sv = shift[c];
                    const size_t off = c + C * offset;
                    float s = io::load_float_value(src_dt, src_ptr, off);
                    float d = sm * (s - v_mean) + sv;
                    d *= src_scales[0] * dst_scales[0];
                    io::store_float_value(dst_dt, d, dst_ptr, off);
                }
            } else if (use_scale) {
                PRAGMA_OMP_SIMD()
                for (dim_t c = 0; c < C; ++c) {
                    const float sm = scale[c] * inv_sqrtvar;
                    const size_t off = c + C * offset;
                    float s = io::load_float_value(src_dt, src_ptr, off);
                    float d = sm * (s - v_mean);
                    d *= src_scales[0] * dst_scales[0];
                    io::store_float_value(dst_dt, d, dst_ptr, off);
                }
            } else if (use_shift) {
                PRAGMA_OMP_SIMD()
                for (dim_t c = 0; c < C; ++c) {
                    const float sm = inv_sqrtvar;
                    const float sv = shift[c];
                    const size_t off = c + C * offset;
                    float s = io::load_float_value(src_dt, src_ptr, off);
                    float d = sm * (s - v_mean) + sv;
                    d *= src_scales[0] * dst_scales[0];
                    io::store_float_value(dst_dt, d, dst_ptr, off);
                }
            } else {
                PRAGMA_OMP_SIMD()
                for (dim_t c = 0; c < C; ++c) {
                    const float sm = inv_sqrtvar;
                    const size_t off = c + C * offset;
                    float s = io::load_float_value(src_dt, src_ptr, off);
                    float d = sm * (s - v_mean);
                    d *= src_scales[0] * dst_scales[0];
                    io::store_float_value(dst_dt, d, dst_ptr, off);
                }
            }
            if (calculate_stats && save_stats) {
                mean_ptr[offset] = v_mean;
                var_ptr[offset] = v_variance;
            }
        }
    });
    return status::success;
}

status_t simple_layer_normalization_bwd_t::pd_t::init(engine_t *engine) {
    using namespace data_type;
    const memory_desc_wrapper src_d(src_md());

    const bool ok = is_bwd() && !has_zero_dim_memory()
            && utils::one_of(src_md()->data_type, f32, bf16, f16)
            && utils::one_of(diff_dst_md()->data_type, f32, bf16, f16)
            && utils::one_of(diff_src_md()->data_type, f32, bf16, f16)
            && platform::has_data_type_support(src_md()->data_type)
            && platform::has_data_type_support(diff_dst_md()->data_type)
            && platform::has_data_type_support(diff_src_md()->data_type)
            && stat_md()->data_type == f32 && check_scale_shift_data_type()
            && attr()->has_default_values() && set_default_formats_common()
            && src_d.is_blocking_desc()
            // plain format, last logical dim is last physical
            && src_d.blocking_desc().strides[ndims() - 1] == 1;
    if (!ok) return status::unimplemented;

    CHECK(fill_compatible_stats_md(*src_md(), reordered_stat_md_));

    if (reordered_stat_md_ != *stat_md()) {
        CHECK(reorder_primitive_desc_create(
                reorder_pd_, engine, stat_md(), &reordered_stat_md_));
    }

    nthr_ = dnnl_get_max_threads();
    init_scratchpad();
    return status::success;
}

status_t simple_layer_normalization_bwd_t::execute_backward(
        const exec_ctx_t &ctx) const {
    status_t status = status::success;

    const bool use_scale = pd()->use_scale();

    auto scratchpad = ctx.get_scratchpad_grantor();
    auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
    auto diff_dst = CTX_IN_MEM(const void *, DNNL_ARG_DIFF_DST);
    auto scale = CTX_IN_MEM(const float *, DNNL_ARG_SCALE);
    auto diff_src = CTX_OUT_CLEAN_MEM(void *, DNNL_ARG_DIFF_SRC, status);
    CHECK(status);

    auto diff_scale = CTX_OUT_CLEAN_MEM(float *, DNNL_ARG_DIFF_SCALE, status);
    CHECK(status);
    auto diff_shift = CTX_OUT_CLEAN_MEM(float *, DNNL_ARG_DIFF_SHIFT, status);
    CHECK(status);

    const float *mean, *variance;
    if (pd()->use_tmp_stats()) {
        mean = scratchpad.template get<float>(key_lnorm_tmp_mean);
        variance = scratchpad.template get<float>(key_lnorm_tmp_var);
    } else {
        mean = CTX_IN_MEM(const float *, DNNL_ARG_MEAN);
        variance = CTX_IN_MEM(const float *, DNNL_ARG_VARIANCE);
    }

    float *const inv_sqrtvar
            = scratchpad.template get<float>(key_lnorm_inv_sqrtvar);

    const memory_desc_wrapper src_d(pd()->src_md());
    const memory_desc_wrapper diff_dst_d(pd()->diff_dst_md());
    const memory_desc_wrapper diff_src_d(pd()->diff_src_md());

    const dim_t N = pd()->across_axis();
    const dim_t C = pd()->norm_axis();
    const dim_t C_padded = src_d.padded_dims()[pd()->ndims() - 1];

    float *reduce = scratchpad.template get<float>(key_lnorm_reduction);
    if (diff_scale == nullptr)
        diff_scale = scratchpad.template get<float>(key_lnorm_tmp_diff_ss);
    if (diff_shift == nullptr)
        diff_shift = scratchpad.template get<float>(key_lnorm_tmp_diff_ss);

    const int max_nthr = pd()->nthr_;

    const auto src_dt = pd()->src_md()->data_type;
    const auto diff_dst_dt = pd()->diff_dst_md()->data_type;
    const auto diff_src_dt = pd()->diff_src_md()->data_type;
    const auto eps = pd()->desc()->layer_norm_epsilon;
    const auto calculate_diff_stats = !pd()->stats_are_src();

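    // Backward pass in three steps:
    //   1) per-thread partial reductions of
    //        diff_scale[c] += diff_dst[n, c] * (src[n, c] - mean[n])
    //                * inv_sqrtvar[n]
    //        diff_shift[c] += diff_dst[n, c]
    //   2) accumulation of the partials into the final diff_scale/diff_shift
    //   3) computation of diff_src using the reduced statistics.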
    parallel(max_nthr, [&](int ithr, int nthr) {
        dim_t N_start = 0, N_end = 0;
        balance211(N, nthr, ithr, N_start, N_end);
        const size_t block_size = N_end - N_start;
        const char *const __restrict src_ptr
                = reinterpret_cast<const char *>(src)
                + N_start * C_padded * src_d.data_type_size();
        const char *const __restrict diff_dst_ptr
                = reinterpret_cast<const char *>(diff_dst)
                + N_start * C_padded * diff_dst_d.data_type_size();
        const float *mean_ptr = &mean[N_start];
        const float *var_ptr = &variance[N_start];
        float *const inv_sqrtvar_ptr = &inv_sqrtvar[N_start];

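        // The `reduce` scratchpad holds the per-thread partials: C floats of
        // diff_gamma per thread, followed by C floats of diff_beta per thread.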
        float *my_diff_gamma = reduce + C * ithr;
        float *my_diff_beta = reduce + C * nthr + C * ithr;

        PRAGMA_OMP_SIMD()
        for (dim_t c = 0; c < C; c++) {
            my_diff_gamma[c] = 0.f;
            my_diff_beta[c] = 0.f;
        }

        for (size_t offset = 0; offset < block_size; offset++) {
            inv_sqrtvar_ptr[offset] = 1.f / sqrtf(var_ptr[offset] + eps);

            PRAGMA_OMP_SIMD()
            for (dim_t c = 0; c < C; c++) {
                const size_t off = c + C * offset;
                float s = io::load_float_value(src_dt, src_ptr, off);
                float dd = io::load_float_value(diff_dst_dt, diff_dst_ptr, off);
                my_diff_gamma[c] += (s - mean_ptr[offset]) * dd
                        * inv_sqrtvar_ptr[offset];
                my_diff_beta[c] += dd;
            }
        }
    });

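    // Accumulate the per-thread partials into the final diff_scale/diff_shift.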
    parallel_nd(C, [&](dim_t c) {
        float diff_gamma = 0, diff_beta = 0;
        for (dim_t n = 0; n < max_nthr; n++) {
            diff_gamma += reduce[C * n + c];
            diff_beta += reduce[C * max_nthr + C * n + c];
        }
        diff_scale[c] = diff_gamma;
        diff_shift[c] = diff_beta;
    });

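    // Final pass: with gamma folded in, g[n, c] = diff_dst[n, c] * scale[c],
    //   diff_src[n, c] = inv_sqrtvar[n] * (g[n, c]
    //           - mean_c(g[n, :])
    //           - (src[n, c] - mean[n]) * inv_sqrtvar[n]
    //                   * mean_c(g[n, :] * (src[n, :] - mean[n])
    //                           * inv_sqrtvar[n]))
    // where both mean_c terms are skipped when the stats are external inputs
    // and hence constant with respect to src.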
    parallel(max_nthr, [&](int ithr, int nthr) {
        dim_t N_start = 0, N_end = 0;
        balance211(N, nthr, ithr, N_start, N_end);
        const size_t block_size = N_end - N_start;
        const char *const __restrict src_ptr
                = reinterpret_cast<const char *>(src)
                + N_start * C_padded * src_d.data_type_size();
        const char *const __restrict diff_dst_ptr
                = reinterpret_cast<const char *>(diff_dst)
                + N_start * C_padded * diff_dst_d.data_type_size();
        char *const __restrict diff_src_ptr = reinterpret_cast<char *>(diff_src)
                + N_start * C_padded * diff_src_d.data_type_size();
        const float *mean_ptr = &mean[N_start];
        float *const inv_sqrtvar_ptr = &inv_sqrtvar[N_start];

        // Note: manual unrolling for scale and shift due to clang issue.
        // see: CLANG_WA_01_SAFE_TO_USE_OMP_SIMD
        float dd_gamma, dd_gamma_x;
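        // dd_gamma accumulates sum_c(diff_dst * scale) and dd_gamma_x
        // accumulates sum_c(diff_dst * scale * (src - mean)); they feed the
        // mean_c terms of the diff_src formula above (scale is taken as 1
        // when it is not used).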
        for (size_t offset = 0; offset < block_size; offset++) {
            // reduce gamma
            dd_gamma = dd_gamma_x = 0;
            if (calculate_diff_stats) {
                if (use_scale) {
                    PRAGMA_OMP_SIMD(reduction(+ : dd_gamma, dd_gamma_x))
                    for (dim_t c = 0; c < C; c++) {
                        const size_t off = c + C * offset;
                        float s = io::load_float_value(src_dt, src_ptr, off);
                        float dd = io::load_float_value(
                                diff_dst_dt, diff_dst_ptr, off);
                        dd_gamma += dd * scale[c];
                        dd_gamma_x += dd * scale[c] * (s - mean_ptr[offset]);
                    }
                } else {
                    PRAGMA_OMP_SIMD(reduction(+ : dd_gamma, dd_gamma_x))
                    for (dim_t c = 0; c < C; c++) {
                        const size_t off = c + C * offset;
                        float s = io::load_float_value(src_dt, src_ptr, off);
                        float dd = io::load_float_value(
                                diff_dst_dt, diff_dst_ptr, off);
                        dd_gamma += dd;
                        dd_gamma_x += dd * (s - mean_ptr[offset]);
                    }
                }
                dd_gamma_x *= inv_sqrtvar_ptr[offset];
            }

            // calculate diff_src
            if (use_scale) {
                PRAGMA_OMP_SIMD()
                for (dim_t c = 0; c < C; c++) {
                    const size_t off = c + C * offset;
                    float dd = io::load_float_value(
                            diff_dst_dt, diff_dst_ptr, off);
                    float ds = dd * scale[c];
                    if (calculate_diff_stats) {
                        float s = io::load_float_value(src_dt, src_ptr, off);
                        ds -= dd_gamma / C;
                        ds -= (s - mean_ptr[offset]) * dd_gamma_x
                                * inv_sqrtvar_ptr[offset] / C;
                    }
                    ds *= inv_sqrtvar_ptr[offset];
                    io::store_float_value(diff_src_dt, ds, diff_src_ptr, off);
                }
            } else {
                PRAGMA_OMP_SIMD()
                for (dim_t c = 0; c < C; c++) {
                    const size_t off = c + C * offset;
                    float dd = io::load_float_value(
                            diff_dst_dt, diff_dst_ptr, off);
                    float ds = dd;
                    if (calculate_diff_stats) {
                        float s = io::load_float_value(src_dt, src_ptr, off);
                        ds -= dd_gamma / C;
                        ds -= (s - mean_ptr[offset]) * dd_gamma_x
                                * inv_sqrtvar_ptr[offset] / C;
                    }
                    ds *= inv_sqrtvar_ptr[offset];
                    io::store_float_value(diff_src_dt, ds, diff_src_ptr, off);
                }
            }
        }
    });
    return status::success;
}

} // namespace cpu
} // namespace impl
} // namespace dnnl