/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/prelu/jit_prelu_base_kernel.hpp"

#include <cmath>

#include "common/dnnl_thread.hpp"
#include "common/nstl.hpp"
#include "common/utils.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

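// Shared code-generation skeleton for the PReLU JIT kernels: the SIMD width
// is derived from the vector length in bytes, and the tail size from the
// tensor shape and broadcast strategy.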
jit_prelu_base_kernel_t::jit_prelu_base_kernel_t(const cpu_isa_t &isa, int vlen,
        const prelu::bcast &bcast, const memory_desc_wrapper &tensor_md,
        size_t number_vmm_single_compute, const char *name)
    : jit_generator(name, nullptr, MAX_CODE_SIZE, true, isa)
    , isa_(isa)
    , simd_w_(vlen / sizeof(float))
    , bcast_(bcast)
    , tail_size_(calc_tail_size(tensor_md))
    , tensor_md_(tensor_md)
    , number_vmm_single_compute_(number_vmm_single_compute) {}

size_t jit_prelu_base_kernel_t::simd_w() const noexcept {
    return simd_w_;
}

prelu::bcast jit_prelu_base_kernel_t::get_bcast() const noexcept {
    return bcast_;
}

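// Emits the kernel body as three consecutive loops over reg_data_size_
// elements:
//   1) unroll_loop      - unrolling_factor * simd_w_ elements per iteration,
//   2) unroll_loop_tail - one full vector (simd_w_ elements) per iteration,
//   3) nelems_tail      - a single pass over the remaining (< simd_w_)
//                         elements with tail handling enabled.
// reg_offset_ tracks how many elements have been consumed; the derived
// kernels provide compute_dst() and finalize().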
void jit_prelu_base_kernel_t::generate() {
    Xbyak::Label unroll_loop, unroll_loop_tail, nelems_tail, end;
    const auto unrolling_factor = calc_unrolling_factor();

    preamble();
    load_kernel_call_params();
    prepare_kernel_const_vars();

    xor_(reg_offset_, reg_offset_);
    L(unroll_loop);
    {
        const size_t offt = unrolling_factor * simd_w_;
        cmp(reg_data_size_, offt);
        jl(unroll_loop_tail, T_NEAR);

        compute_dst(unrolling_factor, false /*tail*/);
        sub(reg_data_size_, offt);
        add(reg_offset_, offt);
        jmp(unroll_loop);
    }

    static constexpr size_t single_unrolling = 1u;
    L(unroll_loop_tail);
    {
        cmp(reg_data_size_, simd_w_);
        jl(nelems_tail, T_NEAR);

        compute_dst(single_unrolling, false /*tail*/);
        sub(reg_data_size_, simd_w_);
        add(reg_offset_, simd_w_);
        jmp(unroll_loop_tail);
    }

    L(nelems_tail);
    {
        cmp(reg_data_size_, 1);
        jl(end, T_NEAR);

        compute_dst(single_unrolling, true /*tail*/);
    }

    L(end);
    finalize();

    postamble();
}

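// Number of trailing elements that do not fill a whole vector register.
// Which elements are counted depends on the broadcast strategy: the full
// tensor for bcast::full, the channel dimension for per_oc_n_spatial_c, and
// the spatial volume (dims[2..ndims)) for per_oc_n_c_spatial.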
size_t jit_prelu_base_kernel_t::calc_tail_size(
        const memory_desc_wrapper &tensor_md) const noexcept {

    const auto ndims = tensor_md.ndims();
    dim_t nelems = 0;
    if (bcast_ == prelu::bcast::full)
        nelems = tensor_md.nelems();
    else if (bcast_ == prelu::bcast::per_oc_n_spatial_c)
        nelems = tensor_md.dims()[1];
    else if (bcast_ == prelu::bcast::per_oc_n_c_spatial && ndims >= 3)
        nelems = utils::array_product(tensor_md.dims() + 2, ndims - 2);

    return nelems % simd_w_;
}

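// Hands out the next vmm index for values that must stay live for the whole
// kernel (e.g. the constants set up in prepare_kernel_const_vars()); these
// registers are excluded from the per-unroll compute pool.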
int jit_prelu_base_kernel_t::reserve_vmm() {
    return number_reserved_vmms_++;
}

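// Total number of vmm registers unavailable to the unrolled computation:
// the explicitly reserved ones, plus four extra registers when bf16 data
// has to be processed through emulation on avx512_core.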
size_t jit_prelu_base_kernel_t::get_number_reserved_vmms() const noexcept {
    static constexpr size_t number_vmm_reserved_bf16_process = 4u;

    const bool process_bf16_with_emu
            = any_tensor_bf16() && isa_ == avx512_core;

    return number_reserved_vmms_
            + (process_bf16_with_emu ? number_vmm_reserved_bf16_process : 0);
}

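// Maps (base_idx, unroll_group) to a vmm index in the compute area that
// starts right after the reserved registers; each unroll group owns
// number_vmm_single_compute_ consecutive registers.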
int jit_prelu_base_kernel_t::get_compute_vmm(
        size_t base_idx, size_t unroll_group) const {
    return number_reserved_vmms_ + base_idx
            + unroll_group * number_vmm_single_compute_;
}

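// Chooses how many vector groups the main loop processes per iteration:
// at most what the free vmm budget allows, and no more than the number of
// vectors a single thread is estimated to process for the given broadcast
// strategy, so that short per-thread workloads are not over-unrolled.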
size_t jit_prelu_base_kernel_t::calc_unrolling_factor() const noexcept {
    const auto n_vregs = prelu::get_n_vregs(isa_);
    const size_t number_of_available_regs
            = n_vregs - get_number_reserved_vmms();
    const size_t max_unrolling_factor
            = number_of_available_regs / number_vmm_single_compute_;

    size_t single_thread_estimated_elems = 0;
    const auto &dims = tensor_md_.dims();
    const auto ndims = tensor_md_.ndims();
    const dim_t D = ndims >= 5 ? dims[ndims - 3] : 1;
    const dim_t H = ndims >= 4 ? dims[ndims - 2] : 1;
    const dim_t W = ndims >= 3 ? dims[ndims - 1] : 1;
    const dim_t SP = D * H * W;

    if (bcast_ == prelu::bcast::full) {
        const size_t nelems = tensor_md_.nelems();
        single_thread_estimated_elems = nelems / dnnl_get_max_threads();
    } else if (bcast_ == prelu::bcast::per_oc_n_spatial_c) {
        single_thread_estimated_elems = tensor_md_.dims()[1];
    } else if (bcast_ == prelu::bcast::per_oc_blocked) {
        single_thread_estimated_elems = SP * simd_w_;
    } else if (bcast_ == prelu::bcast::per_oc_n_c_spatial) {
        single_thread_estimated_elems = SP;
    }

    const size_t estimated_vectors_used = nstl::max(
            static_cast<size_t>(
                    std::floor(single_thread_estimated_elems / simd_w_)),
            static_cast<size_t>(1));

    return nstl::min(max_unrolling_factor, estimated_vectors_used);
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl