/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef CPU_X64_RNN_JIT_UNI_LSTM_CELL_POSTGEMM_FWD_HPP
#define CPU_X64_RNN_JIT_UNI_LSTM_CELL_POSTGEMM_FWD_HPP

#include <memory>
#include "common/utils.hpp"
#include "cpu/x64/rnn/jit_uni_lstm_cell_postgemm.hpp"
#include "cpu/x64/rnn/jit_uni_rnn_common_postgemm.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

template <cpu_isa_t isa, impl::data_type_t src_data_t,
        impl::data_type_t scratch_data_t>
struct jit_uni_lstm_cell_postgemm_fwd
    : public jit_uni_rnn_postgemm,
      public jit_uni_lstm_cell_postgemm_t<isa> {
    DECLARE_CPU_JIT_AUX_FUNCTIONS(jit_uni_lstm_cell_postgemm_fwd)

    jit_uni_lstm_cell_postgemm_fwd(
            const rnn_utils::rnn_conf_t &rnn, const rnn_pd_t *pd)
        : jit_uni_rnn_postgemm(rnn, pd, jit_name())
        , jit_uni_lstm_cell_postgemm_t<isa>(this,
                get_last_preserved_vmm_idx(1) + 1,
                // Using jit_uni_rnn_postgemm::bf16_emu_ to detect the bf16
                // emulation case is not valid here: bf16_emu_ is created in
                // jit_uni_rnn_postgemm::init(), not in the constructor, so it
                // is still nullptr at this point.
                src_data_t == data_type::bf16 && !mayiuse(avx512_core_bf16)) {
    }

    ~jit_uni_lstm_cell_postgemm_fwd() = default;

    status_t init(data_type_t sdt) override {
        jit_uni_rnn_postgemm::init(src_data_t);
        // We use rax for both constant tables and load the corresponding
        // table label into it when calling the corresponding injector.
        sigmoid_injector_ = utils::make_unique<injector_t>(
                this, alg_kind::eltwise_logistic, 0.0f, 0.0f, 1.0f, true, rax);
        tanh_injector_ = utils::make_unique<injector_t>(
                this, alg_kind::eltwise_tanh, 0.0f, 0.0f, 1.0f, true, rax);
        return create_kernel();
    }

protected:
    using injector_t = typename jit_uni_lstm_cell_postgemm_t<isa>::injector_t;
    using Vmm = typename jit_uni_lstm_cell_postgemm_t<isa>::Vmm;

    std::unique_ptr<injector_t> sigmoid_injector_;
    std::unique_ptr<injector_t> tanh_injector_;

    // register size in bytes
    static constexpr size_t vlen_ = cpu_isa_traits<isa>::vlen;
    static constexpr size_t qscale_dt_size = sizeof(float);
    static constexpr size_t weights_peephole_dt_size_ = sizeof(float);
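    // byte size of one full vector's worth of elements when stored in the
    // dst, bias, or c_state data type (which may be narrower than f32)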
    const size_t vlen_dst_
            = vlen_ / (sizeof(float) / types::data_type_size(src_data_t));
    const size_t vlen_bias_ = vlen_ / (sizeof(float) / bias_dt_size_);
    const size_t vlen_c_states_ = vlen_ / (sizeof(float) / cstate_dt_size_);
    const size_t hstate_dt_size_ = types::data_type_size(src_data_t);
    const size_t gate_dt_size_ = types::data_type_size(src_data_t);
    const size_t scratch_dt_size_ = types::data_type_size(scratch_data_t);

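    // Each loop unroll iteration owns a contiguous block of 5 preserved vmm
    // registers starting at index 1: G0, G1, G2, G3 and the c_states
    // accumulator. get_vmm_idx() maps (unroll_idx, type_shift) to the
    // corresponding register index.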
    int get_vmm_idx(int unroll_idx, int type_shift) const {
        const int preserved_vmm_start_idx = 1;
        // G0, G1, G2, G3, c_states
        const int num_preserved_regs_for_loop_iter = 5;
        assert(type_shift < num_preserved_regs_for_loop_iter);
        const int unroll_idx_start = preserved_vmm_start_idx
                + num_preserved_regs_for_loop_iter * unroll_idx;
        return unroll_idx_start + type_shift;
    }

    int G0_idx(int unroll_idx) const { return get_vmm_idx(unroll_idx, 0); }
    int G1_idx(int unroll_idx) const { return get_vmm_idx(unroll_idx, 1); }
    int G2_idx(int unroll_idx) const { return get_vmm_idx(unroll_idx, 2); }
    int G3_idx(int unroll_idx) const { return get_vmm_idx(unroll_idx, 3); }
    int c_states_idx(int unroll_idx) const {
        return get_vmm_idx(unroll_idx, 4);
    }
    int get_last_preserved_vmm_idx(int current_loop_unroll) const {
        return c_states_idx(current_loop_unroll - 1);
    }

    dim_t scale_off(int gate_idx, int unroll_idx) const {
        const size_t vlen_qscale_elem = vlen_ / qscale_dt_size;
        return gate_idx * rnn_.dhc + unroll_idx * vlen_qscale_elem;
    }

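    // Generates the LSTM post-GEMM kernel. Using the conventional i/f/g/o
    // naming for G0..G3, each output element is computed as
    //     i = sigmoid(G0 + b0),  f = sigmoid(G1 + b1),
    //     g = tanh(G2 + b2),     o = sigmoid(G3 + b3),
    //     c_t = f * c_{t-1} + i * g,
    //     h_t = o * tanh(c_t),
    // with optional peephole contributions added to the i, f and o gates, and
    // with the activated gates written back to the workspace when training.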
    void generate() override {
        using namespace Xbyak;

        const auto is_training
                = (pd_->desc()->prop_kind == prop_kind::forward_training);

        const int mask = pd_->attr()->rnn_weights_qparams_.mask_;
        float *const weights_scales = pd_->attr()->rnn_weights_qparams_.scales_;

        // Register map
        const Reg64 loop_cnt(rbx); // loop counter

        // We start code generation here
        preamble();

        const Reg64 n_step_reg(rbp);

        // extract addresses passed as parameters
        const auto addr_ws_gates_reg = abi_param1;
        const auto addr_scratch_gates_reg = abi_param2;
        const auto addr_weights_peephole_reg = r11;
        const auto addr_bias_reg = abi_param3;
        const auto addr_states_t_l_reg = abi_param4;
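        // The remaining arguments do not fit into the ABI parameter registers
        // (4 on Windows x64, 6 on System V), so they are read from the stack.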
#ifdef _WIN32
        const auto addr_states_t_l_copy_reg = r10;
        const auto addr_c_states_tm1_l_reg = rdi;
        const auto addr_c_states_t_l_reg = rsi;
        // Here we cannot use rbp to keep the initial stack pointer, so we
        // use rsp and offset it by the size of the registers pushed in the
        // preamble.
        const auto base_args = get_stack_params_address();
        mov(addr_states_t_l_copy_reg, ptr[base_args]);
        mov(addr_c_states_tm1_l_reg, ptr[base_args + 8]);
        mov(addr_c_states_t_l_reg, ptr[base_args + 16]);
        mov(addr_weights_peephole_reg, ptr[base_args + 24]);
        mov(n_step_reg, ptr[base_args + 40]);
#else
        const auto addr_states_t_l_copy_reg = abi_param5;
        const auto addr_c_states_tm1_l_reg = abi_param6;
        const auto addr_c_states_t_l_reg = r10;
        const auto base_args = get_stack_params_address();
        mov(addr_c_states_t_l_reg, ptr[base_args]);
        mov(addr_weights_peephole_reg, ptr[base_args + 8]);
        mov(n_step_reg, ptr[base_args + 24]);
#endif

        // helper lambdas to address the gates, biases and peephole weights
        const auto sg_addr = [&](int i, int j = 0) {
            return ptr[addr_scratch_gates_reg + i * rnn_.dhc * scratch_dt_size_
                    + j * vlen_];
        };

        const auto wg_addr = [&](int i, int j = 0) {
            return ptr[addr_ws_gates_reg + i * rnn_.dhc * gate_dt_size_
                    + j * vlen_dst_];
        };

        const auto B_addr = [&](int i, int j = 0) {
            return ptr[addr_bias_reg + i * rnn_.dhc * bias_dt_size_
                    + j * vlen_bias_];
        };

        const auto weights_peephole_addr = [&](int i, int j = 0) {
            return ptr[addr_weights_peephole_reg
                    + i * rnn_.dhc * weights_peephole_dt_size_ + j * vlen_];
        };

        const auto loop_len = rnn_.dhc * scratch_dt_size_;
        const auto loop_tail = loop_len % vlen_;

        // initialize registers with addresses and constants
        init_regs(weights_scales, vlen_, loop_tail / scratch_dt_size_);
        sigmoid_injector_->load_table_addr();
        tanh_injector_->load_table_addr();
        if (rnn_.is_brgemm && !rnn_.unfused_post_gemm)
            mov(loop_cnt, n_step_reg);
        else
            mov(loop_cnt, loop_len);

        int loop_unroll = 1;
        int loop_unroll_tail = 0;

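        // Pick the largest unroll factor (up to 4 full vectors per iteration
        // on avx512, 1 otherwise) that the loop/block length can accommodate;
        // any leftover full vectors are handled by a non-unrolled vector loop
        // before the masked (avx512) or element-wise tail.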
        const int loop_unroll_max = is_avx512 ? 4 : 1;
        if (rnn_.is_brgemm && !rnn_.unfused_post_gemm) {
            const auto block_loop_len = rnn_.n_block * scratch_dt_size_;
            for (loop_unroll = loop_unroll_max; loop_unroll > 1;
                    loop_unroll--) {
                if (block_loop_len % (loop_unroll * vlen_) == 0) break;
            }
            if (loop_unroll > 1 && rnn_.n_tail > 0
                    && rnn_.n_tail * scratch_dt_size_ - loop_tail > 0)
                loop_unroll_tail = 1;
        } else {
            for (loop_unroll = loop_unroll_max; loop_unroll > 1;
                    loop_unroll--) {
                if (loop_len >= (loop_unroll * vlen_)) break;
            }
            if (loop_unroll > 1
                    && (loop_len - loop_tail) % (loop_unroll * vlen_) > 0)
                loop_unroll_tail = 1;
        }

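        // compute_loop emits one loop flavor: current_vlen is the number of
        // bytes of scratch data processed per vector (a full vlen_ or the
        // tail size) and current_unroll_len is the number of vectors handled
        // per loop iteration.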
        auto compute_loop = [=](size_t current_vlen, int current_unroll_len) {
            this->reset_tmp_vmm_idx_range(
                    get_last_preserved_vmm_idx(current_unroll_len) + 1,
                    this->get_max_allowed_tmp_vmm_allowed_idx());

            injector_utils::vmm_index_set_t vmm_idxs;

            const bool single_tail_loop_iter
                    = current_vlen < vlen_ && current_vlen == loop_tail;
            const bool need_increment_regs = !single_tail_loop_iter;
            const auto iter_size = current_unroll_len * current_vlen;

            Label loop_start_label, loop_skip_label;
            cmp(loop_cnt, iter_size);
            jl(loop_skip_label, T_NEAR);

            L_aligned(loop_start_label, 64);
            {
                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    const Vmm G0(G0_idx(ur_idx)), G1(G1_idx(ur_idx)),
                            G2(G2_idx(ur_idx)), G3(G3_idx(ur_idx)),
                            tmp_c_states(c_states_idx(ur_idx));
                    // load G0, G1, G2, G3
                    load(G0, sg_addr(0, ur_idx), scratch_data_t, current_vlen);
                    load(G1, sg_addr(1, ur_idx), scratch_data_t, current_vlen);
                    load(G2, sg_addr(2, ur_idx), scratch_data_t, current_vlen);
                    load(G3, sg_addr(3, ur_idx), scratch_data_t, current_vlen);

                    // dequantize the gates from s32 to f32 if needed, add bias
                    deq_w(src_data_t, G0, this->get_next_tmp_vmm(),
                            this->get_next_tmp_vmm(), scale_off(0, ur_idx),
                            mask, current_vlen);
                    const auto bias_g0_vmm = this->get_next_tmp_vmm();
                    to_float(bias_g0_vmm, B_addr(0, ur_idx), rnn_.bias_dt,
                            current_vlen);
                    compute_vaddps(G0, G0, bias_g0_vmm, current_vlen);

                    deq_w(src_data_t, G1, this->get_next_tmp_vmm(),
                            this->get_next_tmp_vmm(), scale_off(1, ur_idx),
                            mask, current_vlen);
                    const auto bias_g1_vmm = this->get_next_tmp_vmm();
                    to_float(bias_g1_vmm, B_addr(1, ur_idx), rnn_.bias_dt,
                            current_vlen);
                    compute_vaddps(G1, G1, bias_g1_vmm, current_vlen);

                    deq_w(src_data_t, G2, this->get_next_tmp_vmm(),
                            this->get_next_tmp_vmm(), scale_off(2, ur_idx),
                            mask, current_vlen);
                    const auto bias_g2_vmm = this->get_next_tmp_vmm();
                    to_float(bias_g2_vmm, B_addr(2, ur_idx), rnn_.bias_dt,
                            current_vlen);
                    compute_vaddps(G2, G2, bias_g2_vmm, current_vlen);

                    deq_w(src_data_t, G3, this->get_next_tmp_vmm(),
                            this->get_next_tmp_vmm(), scale_off(3, ur_idx),
                            mask, current_vlen);
                    const auto bias_g3_vmm = this->get_next_tmp_vmm();
                    to_float(bias_g3_vmm, B_addr(3, ur_idx), rnn_.bias_dt,
                            current_vlen);
                    compute_vaddps(G3, G3, bias_g3_vmm, current_vlen);

                    to_float(tmp_c_states,
                            ptr[addr_c_states_tm1_l_reg
                                    + ur_idx * vlen_c_states_],
                            rnn_.src_iter_c_dt, current_vlen);

                    // add peephole
                    if (rnn_.is_lstm_peephole) {
                        compute_vfmadd231ps(G0, tmp_c_states,
                                weights_peephole_addr(0, ur_idx), current_vlen,
                                this->maybe_get_next_tmp_vmm_for_below_avx2_isa());
                        compute_vfmadd231ps(G1, tmp_c_states,
                                weights_peephole_addr(1, ur_idx), current_vlen,
                                this->maybe_get_next_tmp_vmm_for_below_avx2_isa());
                    }

                    vmm_idxs.emplace(G0.getIdx());
                    vmm_idxs.emplace(G1.getIdx());
                    if (!rnn_.is_lstm_peephole) vmm_idxs.emplace(G3.getIdx());
                }

                // inject eltwise code
                sigmoid_injector_->load_table_addr();
                sigmoid_injector_->compute_vector_range(vmm_idxs);
                vmm_idxs.clear();

                if (is_training) {
                    for (int ur_idx = 0; ur_idx < current_unroll_len;
                            ur_idx++) {
                        to_src(wg_addr(0, ur_idx), Vmm(G0_idx(ur_idx)),
                                src_data_t, current_vlen);
                        to_src(wg_addr(1, ur_idx), Vmm(G1_idx(ur_idx)),
                                src_data_t, current_vlen);
                        if (!rnn_.is_lstm_peephole)
                            to_src(wg_addr(3, ur_idx), Vmm(G3_idx(ur_idx)),
                                    src_data_t, current_vlen);
                    }
                }
                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    vmm_idxs.emplace(G2_idx(ur_idx));
                }
                tanh_injector_->load_table_addr();
                tanh_injector_->compute_vector_range(vmm_idxs);
                vmm_idxs.clear();

                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    const Vmm G0(G0_idx(ur_idx)), G1(G1_idx(ur_idx)),
                            G2(G2_idx(ur_idx)),
                            tmp_c_states(c_states_idx(ur_idx));
                    if (is_training) {
                        to_src(wg_addr(2, ur_idx), G2, src_data_t,
                                current_vlen);
                    }

                    // compute c_states_t_l = G1 * c_tm1_l + G0 * G2
                    compute_vmulps(
                            tmp_c_states, tmp_c_states, G1, current_vlen);
                    compute_vfmadd231ps(tmp_c_states, this->vmm_backup(G0), G2,
                            current_vlen);
                    to_src(ptr[addr_c_states_t_l_reg + ur_idx * vlen_c_states_],
                            tmp_c_states, rnn_.dst_iter_c_dt, current_vlen);
                }

                // add peephole
                if (rnn_.is_lstm_peephole) {
                    for (int ur_idx = 0; ur_idx < current_unroll_len;
                            ur_idx++) {
                        const int cur_g3_idx = G3_idx(ur_idx);
                        compute_vfmadd231ps(Vmm(cur_g3_idx),
                                Vmm(c_states_idx(ur_idx)),
                                weights_peephole_addr(2, ur_idx), current_vlen,
                                this->maybe_get_next_tmp_vmm_for_below_avx2_isa());
                        vmm_idxs.emplace(cur_g3_idx);
                    }
                    sigmoid_injector_->load_table_addr();
                    sigmoid_injector_->compute_vector_range(vmm_idxs);
                    vmm_idxs.clear();

                    // if training we write back the gates
                    if (is_training) {
                        for (int ur_idx = 0; ur_idx < current_unroll_len;
                                ur_idx++) {
                            to_src(wg_addr(3, ur_idx), Vmm(G3_idx(ur_idx)),
                                    src_data_t, current_vlen);
                        }
                    }
                }

                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    vmm_idxs.emplace(c_states_idx(ur_idx));
                }
                // states_t_l = G3 * tanh(c_states_t_l)
                tanh_injector_->load_table_addr();
                tanh_injector_->compute_vector_range(vmm_idxs);
                vmm_idxs.clear();

                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    const Vmm G3(G3_idx(ur_idx)),
                            tmp_c_states(c_states_idx(ur_idx));
                    compute_vmulps(
                            tmp_c_states, tmp_c_states, G3, current_vlen);
                }

                // downconvert/quantize and write back the state
                Label loop_inc_regs_label,
                        update_single_states_tensor_only_label;
                cmp(addr_states_t_l_copy_reg, 0);
                je(update_single_states_tensor_only_label, T_NEAR);
                // if states_t_l_copy is a non-null ptr, we write the output
                // to both tensors
                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    const Vmm tmp_c_states(c_states_idx(ur_idx));
                    to_src(ptr[addr_states_t_l_reg + ur_idx * vlen_dst_],
                            tmp_c_states, src_data_t, current_vlen);
                    // Since the next to_src call uses write_only=true, for
                    // bf16 src_dt it must execute right after the to_src call
                    // above (write_only=false) for the same Vmm.
                    to_src(ptr[addr_states_t_l_copy_reg + ur_idx * vlen_dst_],
                            tmp_c_states, src_data_t, current_vlen, true);
                }
                const size_t hstate_shift = current_vlen < vlen_
                        ? hstate_dt_size_
                        : current_unroll_len * vlen_dst_;
                if (need_increment_regs)
                    add(addr_states_t_l_copy_reg, hstate_shift);
                jmp(loop_inc_regs_label, T_NEAR);

                L_aligned(update_single_states_tensor_only_label);
                for (int ur_idx = 0; ur_idx < current_unroll_len; ur_idx++) {
                    to_src(ptr[addr_states_t_l_reg + ur_idx * vlen_dst_],
                            Vmm(c_states_idx(ur_idx)), src_data_t,
                            current_vlen);
                }

                // increment address pointers
                L_aligned(loop_inc_regs_label);
                if (need_increment_regs) {
                    const size_t scratch_shift = current_vlen < vlen_
                            ? scratch_dt_size_
                            : current_unroll_len * vlen_;
                    add(addr_scratch_gates_reg, scratch_shift);
                    if (rnn_.is_lstm_peephole) {
                        const size_t wpeephole_shift = current_vlen < vlen_
                                ? weights_peephole_dt_size_
                                : current_unroll_len * vlen_;
                        add(addr_weights_peephole_reg, wpeephole_shift);
                    }
                    const size_t bias_shift = current_vlen < vlen_
                            ? bias_dt_size_
                            : current_unroll_len * vlen_bias_;
                    add(addr_bias_reg, bias_shift);
                    add(addr_states_t_l_reg, hstate_shift);
                    const size_t cstate_shift = current_vlen < vlen_
                            ? cstate_dt_size_
                            : current_unroll_len * vlen_c_states_;
                    add(addr_c_states_tm1_l_reg, cstate_shift);
                    add(addr_c_states_t_l_reg, cstate_shift);
                    if (is_training) {
                        const size_t gate_shift = current_vlen < vlen_
                                ? gate_dt_size_
                                : current_unroll_len * vlen_dst_;
                        add(addr_ws_gates_reg, gate_shift);
                    }
                    const size_t qscale_shift = current_vlen < vlen_
                            ? qscale_dt_size
                            : current_unroll_len * vlen_;
                    inc_regs(mask, qscale_shift);
                }

                // decrement loop counter
                sub(loop_cnt, iter_size);
                cmp(loop_cnt, iter_size);
                jge(loop_start_label, T_NEAR);
            }
            L_aligned(loop_skip_label, 64);
        };

        if (loop_unroll > 0) {
            // unrolled vector loop
            compute_loop(vlen_, loop_unroll);
        }

        if (loop_unroll_tail > 0) {
            // non-unrolled vector loop, if required
            compute_loop(vlen_, loop_unroll_tail);
        }

        if (loop_tail > 0) {
            // tail processing
            compute_loop(is_avx512 ? loop_tail : scratch_dt_size_, 1);
        }

        postamble();

        sigmoid_injector_->prepare_table(true);
        tanh_injector_->prepare_table(true);

        init_table(vlen_);
    }
};

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl

#endif