1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include "cpu/x64/prelu/jit_prelu_base_kernel.hpp" |
18 | #include "common/dnnl_thread.hpp" |
19 | |
20 | namespace dnnl { |
21 | namespace impl { |
22 | namespace cpu { |
23 | namespace x64 { |
24 | |
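// Common base for the PReLU JIT kernels: derives the SIMD width (fp32 lanes)
// from the vector register length in bytes and precomputes the tail size for
// the given broadcast strategy.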
25 | jit_prelu_base_kernel_t::jit_prelu_base_kernel_t(const cpu_isa_t &isa, int vlen, |
26 | const prelu::bcast &bcast, const memory_desc_wrapper &tensor_md, |
27 | size_t number_vmm_single_compute, const char *name) |
28 | : jit_generator(name, nullptr, MAX_CODE_SIZE, true, isa) |
29 | , isa_(isa) |
30 | , simd_w_(vlen / sizeof(float)) |
31 | , bcast_(bcast) |
32 | , tail_size_(calc_tail_size(tensor_md)) |
33 | , tensor_md_(tensor_md) |
34 | , number_vmm_single_compute_(number_vmm_single_compute) {} |
35 | |
36 | size_t jit_prelu_base_kernel_t::simd_w() const noexcept { |
37 | return simd_w_; |
38 | } |
39 | |
40 | prelu::bcast jit_prelu_base_kernel_t::get_bcast() const noexcept { |
41 | return bcast_; |
42 | } |
43 | |
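// Emits the kernel body as three consecutive loops: a main loop unrolled by
// calc_unrolling_factor(), a loop handling the remaining full vectors one at
// a time, and a final masked iteration for the sub-vector remainder.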
44 | void jit_prelu_base_kernel_t::generate() { |
45 | Xbyak::Label unroll_loop, unroll_loop_tail, nelems_tail, end; |
46 | const auto unrolling_factor = calc_unrolling_factor(); |
47 | |
48 | preamble(); |
49 | load_kernel_call_params(); |
50 | prepare_kernel_const_vars(); |
51 | |
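    // reg_offset_ counts the elements already processed; reg_data_size_
    // holds the number of elements still to go.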
52 | xor_(reg_offset_, reg_offset_); |
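    // Main loop: unrolling_factor full vectors per iteration.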
53 | L(unroll_loop); |
54 | { |
55 | const size_t offt = unrolling_factor * simd_w_; |
56 | cmp(reg_data_size_, offt); |
57 | jl(unroll_loop_tail, T_NEAR); |
58 | |
59 | compute_dst(unrolling_factor, false /*tail*/); |
60 | sub(reg_data_size_, offt); |
61 | add(reg_offset_, offt); |
62 | jmp(unroll_loop); |
63 | } |
64 | |
65 | static constexpr size_t single_unrolling = 1u; |
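    // Process the remaining full vectors one at a time.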
66 | L(unroll_loop_tail); |
67 | { |
68 | cmp(reg_data_size_, simd_w_); |
69 | jl(nelems_tail, T_NEAR); |
70 | |
71 | compute_dst(single_unrolling, false /*tail*/); |
72 | sub(reg_data_size_, simd_w_); |
73 | add(reg_offset_, simd_w_); |
74 | jmp(unroll_loop_tail); |
75 | } |
76 | |
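    // Fewer than simd_w_ elements left: one masked (tail) computation.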
77 | L(nelems_tail); |
78 | { |
79 | cmp(reg_data_size_, 1); |
80 | jl(end, T_NEAR); |
81 | |
82 | compute_dst(single_unrolling, true /*tail*/); |
83 | } |
84 | |
85 | L(end); |
86 | finalize(); |
87 | |
88 | postamble(); |
89 | } |
90 | |
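// The tail spans the elements the kernel actually iterates over, which
// depends on the broadcast strategy; policies not listed below (e.g.
// per_oc_blocked) work on full simd blocks and thus have no tail.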
91 | size_t jit_prelu_base_kernel_t::calc_tail_size( |
92 | const memory_desc_wrapper &tensor_md) const noexcept { |
93 | |
94 | const auto &ndims = tensor_md.ndims(); |
95 | dim_t nelems = 0; |
96 | if (bcast_ == prelu::bcast::full) |
97 | nelems = tensor_md.nelems(); |
98 | else if (bcast_ == prelu::bcast::per_oc_n_spatial_c) |
99 | nelems = tensor_md.dims()[1]; |
100 | else if (bcast_ == prelu::bcast::per_oc_n_c_spatial && ndims >= 3) |
101 | nelems = utils::array_product(tensor_md.dims() + 2, ndims - 2); |
102 | |
103 | return nelems % simd_w_; |
104 | } |
105 | |
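// Hands out vmm indices for kernel-lifetime constants; compute vmms are
// allocated after these reserved ones (see get_compute_vmm()).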
106 | int jit_prelu_base_kernel_t::reserve_vmm() { |
107 | return number_reserved_vmms_++; |
108 | } |
109 | |
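// bf16 emulation on plain avx512_core (no native bf16 instructions) needs
// four extra scratch vmms.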
110 | size_t jit_prelu_base_kernel_t::get_number_reserved_vmms() const noexcept { |
111 | static constexpr size_t number_vmm_reserved_bf16_process = 4u; |
112 | |
113 | const bool process_bf16_with_emu = any_tensor_bf16() && isa_ == avx512_core; |
114 | |
115 | return number_reserved_vmms_ |
116 | + (process_bf16_with_emu ? number_vmm_reserved_bf16_process : 0); |
117 | } |
118 | |
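// Compute vmms follow the reserved ones, grouped by unroll iteration: each
// unroll group owns number_vmm_single_compute_ consecutive registers.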
119 | int jit_prelu_base_kernel_t::get_compute_vmm( |
120 | size_t base_idx, size_t unroll_group) const { |
121 | return number_reserved_vmms_ + base_idx |
122 | + unroll_group * number_vmm_single_compute_; |
123 | } |
124 | |
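// The unrolling factor is capped both by the number of free vector registers
// and by an estimate of how many full vectors a single thread will process,
// so small problems do not pay for unused unrolling.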
125 | size_t jit_prelu_base_kernel_t::calc_unrolling_factor() const noexcept { |
126 | const auto n_vregs = prelu::get_n_vregs(isa_); |
127 | const size_t number_of_available_regs |
128 | = n_vregs - get_number_reserved_vmms(); |
129 | const size_t max_unrolling_factor |
130 | = number_of_available_regs / number_vmm_single_compute_; |
131 | |
132 | size_t single_thread_estimated_elems = 0; |
133 | const auto &dims = tensor_md_.dims(); |
134 | const auto &ndims = tensor_md_.ndims(); |
135 | const dim_t D = ndims >= 5 ? dims[ndims - 3] : 1; |
136 | const dim_t H = ndims >= 4 ? dims[ndims - 2] : 1; |
137 | const dim_t W = ndims >= 3 ? dims[ndims - 1] : 1; |
138 | const dim_t SP = D * H * W; |
139 | |
140 | if (bcast_ == prelu::bcast::full) { |
141 | const size_t nelems = tensor_md_.nelems(); |
142 | single_thread_estimated_elems = nelems / dnnl_get_max_threads(); |
143 | } else if (bcast_ == prelu::bcast::per_oc_n_spatial_c) { |
144 | single_thread_estimated_elems = tensor_md_.dims()[1]; |
145 | } else if (bcast_ == prelu::bcast::per_oc_blocked) { |
146 | single_thread_estimated_elems = SP * simd_w_; |
147 | } else if (bcast_ == prelu::bcast::per_oc_n_c_spatial) { |
148 | single_thread_estimated_elems = SP; |
149 | } |
150 | |
    const size_t estimated_vectors_used = nstl::max(
            single_thread_estimated_elems / simd_w_, static_cast<size_t>(1));
155 | |
156 | return nstl::min(max_unrolling_factor, estimated_vectors_used); |
157 | } |
158 | |
159 | } // namespace x64 |
160 | } // namespace cpu |
161 | } // namespace impl |
162 | } // namespace dnnl |
163 | |