1/*******************************************************************************
2* Copyright 2021-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef CPU_X64_JIT_BRGEMM_CONV_HPP
18#define CPU_X64_JIT_BRGEMM_CONV_HPP
19
20#include "common/c_types_map.hpp"
21#include "common/dnnl_thread.hpp"
22#include "common/memory_tracking.hpp"
23#include "common/primitive.hpp"
24#include "common/utils.hpp"
25
26#include "cpu/cpu_convolution_pd.hpp"
27#include "cpu/platform.hpp"
28
29#include "cpu/x64/amx_tile_configure.hpp"
30#include "cpu/x64/brgemm/brgemm.hpp"
31#include "cpu/x64/cpu_barrier.hpp"
32#include "cpu/x64/cpu_reducer.hpp"
33#include "cpu/x64/jit_brgemm_conv_comp_pad_kernel.hpp"
34#include "cpu/x64/jit_brgemm_conv_trans_kernel.hpp"
35#include "cpu/x64/jit_brgemm_conv_utils.hpp"
36#include "cpu/x64/jit_brgemm_post_ops.hpp"
37
38namespace dnnl {
39namespace impl {
40namespace cpu {
41namespace x64 {
42
// Forward convolution primitive implemented on top of the BRGEMM (batch-reduce
// GEMM) micro-kernels. `isa` selects the JIT target; when `use_inversion` is
// true the spatial kernel indices are mirrored (see maybe_invert) —
// presumably to reuse this implementation for deconvolution; confirm against
// the callers that instantiate it with use_inversion = true.
template <cpu_isa_t isa, bool use_inversion = false>
struct brgemm_convolution_fwd_t : public primitive_t {

    // Primitive descriptor: validates the convolution problem/attributes and
    // owns the pre-created BRGEMM descriptors shared with the primitive.
    struct pd_t : public cpu_convolution_fwd_pd_t {
        pd_t(const convolution_desc_t *adesc, const primitive_attr_t *attr,
                const typename pd_t::hint_class *hint_fwd_pd)
            : cpu_convolution_fwd_pd_t(adesc, attr, hint_fwd_pd)
            , with_sum(false) {}

        ~pd_t() = default;

        // ------- DECLARE_COMMON_PD_t -----
        // Hand-expanded version of the DECLARE_COMMON_PD_T macro: a custom
        // clone() is needed to (re)copy the brgs_ / bd_masks vectors of
        // shared pointers into the new descriptor.
        pd_t *clone() const override {
            auto new_pd = utils::make_unique<pd_t>(*this);
            if (!new_pd->is_initialized()) return nullptr;
            new_pd->brgs_.resize(brgs_sz_);
            for (int i = 0; i < brgs_sz_; i++) {
                new_pd->brgs_[i] = brgs_[i];
                new_pd->bd_masks[i] = bd_masks[i];
            }
            return new_pd.release();
        }

        status_t create_primitive(
                std::pair<std::shared_ptr<primitive_t>, bool> &primitive,
                engine_t *engine,
                const cache_blob_t &cache_blob) const override {
            return primitive_t::create_primitive_common<
                    brgemm_convolution_fwd_t, pd_t>(
                    primitive, this, engine, false, cache_blob);
        }

        const char *name() const override {
            return JIT_IMPL_NAME_HELPER("brgconv:", isa, "");
        }
        // ---------------------------------

        status_t init(engine_t *engine);

        int brgs_sz_; // number of entries in brgs_ / bd_masks
        std::vector<std::shared_ptr<brgemm_t>> brgs_; // BRGEMM descriptors
        std::vector<std::shared_ptr<std::vector<char>>> bd_masks;
        bool with_sum; // presumably set in init() when post-ops contain sum
        jit_brgemm_conv_conf_t jcp_;

        int ic_chunks;
        bool need_postwork;

        // batch sizes info for unrolled kernels
        int bs_c, first_bs;
        std::vector<int> batchsizes;
        // Flattens (bs, m, do_initialization, is_N_tail, is_K_tail) into a
        // linear index into brgs_: three trailing binary dimensions on top of
        // an (m, bs_idx) pair. The batch-size dimension is only used by the
        // unrolled kernel (jcp_.use_uker); otherwise it collapses to 0.
        int get_brg_idx(int bs, int m, bool do_initialization, bool is_N_tail,
                bool is_K_tail) const {
            auto bs_idx = jcp_.use_uker ? batchsizes[bs] : 0;
            assert(bs_idx >= 0);
            return (((m * bs_c + bs_idx) * 2
                            + static_cast<int>(do_initialization))
                           * 2
                    + static_cast<int>(is_N_tail))
                    * 2
                    + static_cast<int>(is_K_tail);
        }

    protected:
        // Accepts only the supported scales configuration: src scales must be
        // common (mask == 0); weights scales either common or per-output-
        // channel (mask == 1 << with_groups()); no other args may have scales.
        bool arg_scales_ok() const {
            std::vector<int> supported_args = {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS};
            const int with_g = static_cast<int>(with_groups());
            bool ok = true;
            ok = ok && attr()->scales_.has_default_values(supported_args);
            for (int arg : supported_args) {
                const auto &mask = attr()->scales_.get(arg).mask_;
                if (arg == DNNL_ARG_WEIGHTS)
                    ok = ok && (mask == 0 || mask == (1 << with_g));
                else
                    ok = ok && (mask == 0);
            }
            return ok;
        }
        // Only common zero points are supported -> mask should only be 0;
        // weights zero points must be absent (default values).
        bool zero_points_ok() const {
            int mask_src = 0, mask_dst = 0;
            attr()->zero_points_.get(DNNL_ARG_SRC, &mask_src);
            attr()->zero_points_.get(DNNL_ARG_DST, &mask_dst);
            return attr()->zero_points_.has_default_values(DNNL_ARG_WEIGHTS)
                    && mask_src == 0 && mask_dst == 0;
        }
    };

    brgemm_convolution_fwd_t(const pd_t *apd);

    ~brgemm_convolution_fwd_t() = default;

    status_t execute(const exec_ctx_t &ctx) const override;

protected:
    status_t init(engine_t *engine) override;

private:
    // Fixed-size wrapper for one AMX tile-configuration palette so palettes
    // can be stored by value in a std::vector (see brg_kernel_palettes_).
    struct S_t {
        char a[AMX_PALETTE_SIZE];
    };

    // brgemm convolution execution context: caches the raw argument pointers
    // and the prepared binary post-op arguments for one execute() call.
    struct brgemm_exec_ctx_t {
        brgemm_exec_ctx_t(const exec_ctx_t &ctx, const pd_t *pd)
            : src(CTX_IN_MEM(const char *, DNNL_ARG_SRC))
            , weights(CTX_IN_MEM(const char *, DNNL_ARG_WEIGHTS))
            , bias(CTX_IN_MEM(const char *, DNNL_ARG_BIAS))
            , dst(CTX_OUT_MEM(char *, DNNL_ARG_DST))
            , post_ops_binary_rhs_arg_vec(binary_injector::prepare_binary_args(
                      pd->attr()->post_ops_, ctx)) {}
        const char *const __restrict src;
        const char *const __restrict weights;
        const char *const __restrict bias;
        char *const __restrict dst;
        const std::vector<const void *> post_ops_binary_rhs_arg_vec;
    };

    struct brgemm_thread_ctx_t;

    // Flat index for the post-op kernels: (m, do_postwork, is_N_tail) packed
    // with two trailing binary dimensions.
    static int get_ker_po_idx(int m, bool do_postwork, bool is_N_tail) {
        return (m * 2 + static_cast<int>(do_postwork)) * 2
                + static_cast<int>(is_N_tail);
    }

    // Input extent required to compute `dst_size` outputs for a filter of
    // size `k` with the given stride/dilation, clamped to `max_src_size`.
    static int get_inp_size(
            int max_src_size, int dst_size, int k, int stride, int dilate) {
        const auto res = nstl::min(max_src_size,
                calculate_end_padding(0, dst_size, 0, stride,
                        calculate_extended_filter_size(k, dilate)));
        return res;
    }

    // Mirrors kernel index k within [0, K) when use_inversion is enabled;
    // identity otherwise.
    int maybe_invert(int k, int K) const {
        return use_inversion ? K - 1 - k : k;
    };
    // Computes the kw range ([kw_s, kw_e), full sub-range [kw_full_s,
    // kw_full_e)) contributing to output column `ow` — defined in the .cpp.
    void get_kw_range(
            int ow, int &kw_s, int &kw_full_s, int &kw_full_e, int &kw_e) const;
    // Output-column range [ow_s, ow_e) covered by filter column `kw`.
    void get_ow_range(int ow, int kw, int &ow_s, int &ow_e) const;

    // Per-thread kernel drivers (defined in the .cpp): plain, with input
    // transposition buffer, and with vertical-padding handling respectively.
    void ker_base(brgemm_thread_ctx_t &btc) const;
    void ker_trans(brgemm_thread_ctx_t &btc, char *inp_buffer) const;
    void ker_vpad(brgemm_thread_ctx_t &btc) const;

    void perform_outwork(char *dst_base, char *dst, char *c_buffer,
            const char *bias_w, int od, int oh, int ow, int g_oc,
            bool is_oc_tail, int ker_ow_s, int ker_ow_f, int kd_l, int kh_l,
            const void *post_ops_binary_rhs_arg_vec, const float *oscales,
            int32_t src_zp_vals, int32_t *src_zp_ptr, int32_t *dst_zp_ptr,
            int32_t *s8s8_compensation, bool maybe_do_init, bool do_postwork,
            bool do_post_comp) const;

    void call_brgemm_kernel(brgemm_thread_ctx_t &btc, int brg_idx,
            int batch_size, char *ptr_C, char *ptr_D, const char *bias_w,
            int g_oc, bool do_postops, const void *binary_post_ops_rhs,
            int32_t src_zp_vals, int32_t *src_zp_ptr, int32_t *dst_zp_ptr,
            int32_t *s8s8_comp, bool do_only_comp) const;

    // Conditionally copies the needed part of src into the per-thread input
    // buffer; the `last_*` arguments track what the buffer already holds
    // (inp_buffer_mask) to skip redundant copies.
    void maybe_conv_inp(int ithr, const char *__restrict src,
            char *__restrict inp_buffer, uint8_t *__restrict inp_buffer_mask,
            int g, int n, int icc, int odb, int ohb, int owb, int last_g,
            int last_n, int last_icc, int last_odb, int last_ohb,
            int last_owb) const;

    // Kernel-creation helpers used by init() — defined in the .cpp.
    status_t add_po_kernel(brgemm_t *bcfg, int ker_idx, bool is_init);
    void add_po_kernels(int i_N, int init_bcast_dim, int po_bcast_dim);
    status_t add_brg_kernel(int bs, int M, int i_N, int i_K, int i_init);

    // Zero-point / s8s8 compensation helpers.
    status_t cal_compensation(const char *__restrict weights,
            int32_t *src_zp_buffer, int32_t *s8s8_comp_buffer) const;
    int get_comp_ker_idx(const int kd_b, const int kd_e, const int kh_b,
            const int kh_e, const int kw_b, const int kw_e) const;
    int get_comp_offset(const int g, const int ocb, const int ow,
            const int kd_b, const int kd_e, const int kh_b, const int kh_e,
            const int kw_b, const int kw_e) const;
    const pd_t *pd() const {
        return static_cast<const pd_t *>(primitive_t::pd().get());
    }

    // Generated kernels, indexed by get_brg_idx() / get_ker_po_idx().
    std::vector<std::unique_ptr<brgemm_kernel_t>> brg_kernels_;
    std::vector<std::unique_ptr<jit_brgemm_kernel_post_ops<isa>>> kernels_po_;
    std::unique_ptr<jit_avx512_core_brgemm_conv_trans_kernel::
                    jit_avx512_core_brgemm_conv_trans_kernel_t>
            copy_to_pbuffer_;
    std::unique_ptr<jit_avx512_core_brgemm_conv_comp_pad_kernel::
                    jit_avx512_core_brgemm_conv_comp_pad_kernel_t>
            comp_vpad_pbuffer_;
    // One AMX palette per brgemm kernel (only meaningful when is_amx).
    std::vector<S_t> brg_kernel_palettes_;

    // Data-type sizes in bytes: accumulator, bias, src, weights, dst.
    size_t acc_dsz, bia_dsz, src_dsz, wei_dsz, dst_dsz;

    const memory_desc_wrapper bias_d;

    // pre - calculated values
    std::vector<dim_t> owb_kw_top_vpads;
    std::vector<dim_t> owb_kw_bottom_vpads;
    std::vector<dim_t> kd_bs, kd_es, kh_bs, kh_es, kw_bs, kw_es;

    // Cached problem geometry: kernel sizes (K*), extended (dilated) kernel
    // sizes (EXT_K*), blocking (K*_BLOCK), input/padded-input/output spatial
    // dims, strides (S*), front/top/left padding (FP/TP/LP), dilations (D*).
    int KD, KH, KW, EXT_KD, EXT_KH, EXT_KW, KS, KD_BLOCK, KH_BLOCK, KW_BLOCK,
            KD_BLOCK_PAD, KH_BLOCK_PAD, ID, IH, IW, IDP, IHP, IWP, OD, OH, OW,
            SD, SH, SW, FP, TP, LP, DD, DH, DW;
    // Pre-computed strides (in elements) through src/dst/weights tensors.
    dim_t src_w_sz, src_h_sz, src_d_sz, dst_w_sz, dst_h_sz, dst_d_sz, wei_ic_sz,
            wei_kw_sz, wei_kh_sz, wei_kd_sz, wei_ocb_sz;
    // Strides through the packed input buffer used by ker_trans.
    dim_t pbuf_w_sz, pbuf_h_sz, pbuf_d_sz;
    // Sizes of the compensation buffers.
    dim_t ker_vpad_sz, comp_ocb_sz, comp_ker_sz, comp_kw_sz;

    bool need_compensation;
    bool is_amx; // true when `isa` targets Intel AMX (tile) instructions
};
252
253} // namespace x64
254} // namespace cpu
255} // namespace impl
256} // namespace dnnl
257
258#endif
259
260// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s
261