/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "cpu/x64/prelu/jit_prelu_base_kernel.hpp"

#include <cmath>

#include "common/dnnl_thread.hpp"
#include "common/nstl.hpp"
#include "common/utils.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

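// Shared code-generation skeleton for the PReLU JIT kernels: the SIMD width
// is derived from the vector length in bytes, and the tail size from the
// tensor shape and broadcast strategy.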
jit_prelu_base_kernel_t::jit_prelu_base_kernel_t(const cpu_isa_t &isa, int vlen,
        const prelu::bcast &bcast, const memory_desc_wrapper &tensor_md,
        size_t number_vmm_single_compute, const char *name)
    : jit_generator(name, nullptr, MAX_CODE_SIZE, true, isa)
    , isa_(isa)
    , simd_w_(vlen / sizeof(float))
    , bcast_(bcast)
    , tail_size_(calc_tail_size(tensor_md))
    , tensor_md_(tensor_md)
    , number_vmm_single_compute_(number_vmm_single_compute) {}

size_t jit_prelu_base_kernel_t::simd_w() const noexcept {
    return simd_w_;
}

prelu::bcast jit_prelu_base_kernel_t::get_bcast() const noexcept {
    return bcast_;
}

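// Emits the kernel body as three consecutive loops over reg_data_size_
// elements:
//   1) unroll_loop      - unrolling_factor * simd_w_ elements per iteration,
//   2) unroll_loop_tail - one full vector (simd_w_ elements) per iteration,
//   3) nelems_tail      - a single pass over the remaining (< simd_w_)
//                         elements with tail handling enabled.
// reg_offset_ tracks how many elements have been consumed; the derived
// kernels provide compute_dst() and finalize().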
void jit_prelu_base_kernel_t::generate() {
    Xbyak::Label unroll_loop, unroll_loop_tail, nelems_tail, end;
    const auto unrolling_factor = calc_unrolling_factor();

    preamble();
    load_kernel_call_params();
    prepare_kernel_const_vars();

    xor_(reg_offset_, reg_offset_);
    L(unroll_loop);
    {
        const size_t offt = unrolling_factor * simd_w_;
        cmp(reg_data_size_, offt);
        jl(unroll_loop_tail, T_NEAR);

        compute_dst(unrolling_factor, false /*tail*/);
        sub(reg_data_size_, offt);
        add(reg_offset_, offt);
        jmp(unroll_loop);
    }

    static constexpr size_t single_unrolling = 1u;
    L(unroll_loop_tail);
    {
        cmp(reg_data_size_, simd_w_);
        jl(nelems_tail, T_NEAR);

        compute_dst(single_unrolling, false /*tail*/);
        sub(reg_data_size_, simd_w_);
        add(reg_offset_, simd_w_);
        jmp(unroll_loop_tail);
    }

    L(nelems_tail);
    {
        cmp(reg_data_size_, 1);
        jl(end, T_NEAR);

        compute_dst(single_unrolling, true /*tail*/);
    }

    L(end);
    finalize();

    postamble();
}

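// Number of trailing elements that do not fill a whole vector register.
// Which elements are counted depends on the broadcast strategy: the full
// tensor for bcast::full, the channel dimension for per_oc_n_spatial_c, and
// the spatial volume (dims[2..ndims)) for per_oc_n_c_spatial.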
size_t jit_prelu_base_kernel_t::calc_tail_size(
        const memory_desc_wrapper &tensor_md) const noexcept {

    const auto ndims = tensor_md.ndims();
    dim_t nelems = 0;
    if (bcast_ == prelu::bcast::full)
        nelems = tensor_md.nelems();
    else if (bcast_ == prelu::bcast::per_oc_n_spatial_c)
        nelems = tensor_md.dims()[1];
    else if (bcast_ == prelu::bcast::per_oc_n_c_spatial && ndims >= 3)
        nelems = utils::array_product(tensor_md.dims() + 2, ndims - 2);

    return nelems % simd_w_;
}

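// Hands out the next vmm index for values that must stay live for the whole
// kernel (e.g. the constants set up in prepare_kernel_const_vars()); these
// registers are excluded from the per-unroll compute pool.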
int jit_prelu_base_kernel_t::reserve_vmm() {
    return number_reserved_vmms_++;
}

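// Total number of vmm registers unavailable to the unrolled computation:
// the explicitly reserved ones, plus four extra registers when bf16 data
// has to be processed through emulation on avx512_core.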
size_t jit_prelu_base_kernel_t::get_number_reserved_vmms() const noexcept {
    static constexpr size_t number_vmm_reserved_bf16_process = 4u;

    const bool process_bf16_with_emu
            = any_tensor_bf16() && isa_ == avx512_core;

    return number_reserved_vmms_
            + (process_bf16_with_emu ? number_vmm_reserved_bf16_process : 0);
}

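// Maps (base_idx, unroll_group) to a vmm index in the compute area that
// starts right after the reserved registers; each unroll group owns
// number_vmm_single_compute_ consecutive registers.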
int jit_prelu_base_kernel_t::get_compute_vmm(
        size_t base_idx, size_t unroll_group) const {
    return number_reserved_vmms_ + base_idx
            + unroll_group * number_vmm_single_compute_;
}

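// Chooses how many vector groups the main loop processes per iteration:
// at most what the free vmm budget allows, and no more than the number of
// vectors a single thread is estimated to process for the given broadcast
// strategy, so that short per-thread workloads are not over-unrolled.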
size_t jit_prelu_base_kernel_t::calc_unrolling_factor() const noexcept {
    const auto n_vregs = prelu::get_n_vregs(isa_);
    const size_t number_of_available_regs
            = n_vregs - get_number_reserved_vmms();
    const size_t max_unrolling_factor
            = number_of_available_regs / number_vmm_single_compute_;

    size_t single_thread_estimated_elems = 0;
    const auto &dims = tensor_md_.dims();
    const auto ndims = tensor_md_.ndims();
    const dim_t D = ndims >= 5 ? dims[ndims - 3] : 1;
    const dim_t H = ndims >= 4 ? dims[ndims - 2] : 1;
    const dim_t W = ndims >= 3 ? dims[ndims - 1] : 1;
    const dim_t SP = D * H * W;

    if (bcast_ == prelu::bcast::full) {
        const size_t nelems = tensor_md_.nelems();
        single_thread_estimated_elems = nelems / dnnl_get_max_threads();
    } else if (bcast_ == prelu::bcast::per_oc_n_spatial_c) {
        single_thread_estimated_elems = tensor_md_.dims()[1];
    } else if (bcast_ == prelu::bcast::per_oc_blocked) {
        single_thread_estimated_elems = SP * simd_w_;
    } else if (bcast_ == prelu::bcast::per_oc_n_c_spatial) {
        single_thread_estimated_elems = SP;
    }

    const size_t estimated_vectors_used = nstl::max(
            static_cast<size_t>(
                    std::floor(single_thread_estimated_elems / simd_w_)),
            static_cast<size_t>(1));

    return nstl::min(max_unrolling_factor, estimated_vectors_used);
}

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl