/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <assert.h>

#include "common/float16.hpp"

#include "cpu/x64/cpu_isa_traits.hpp"
#include "cpu/x64/jit_avx512_core_fp16cvt.hpp"
#include "cpu/x64/jit_generator.hpp"

namespace dnnl {
namespace impl {
namespace cpu {
namespace x64 {

using namespace Xbyak;

#define GET_OFF(field) offsetof(f16_support::jit_call_t, field)
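
// Overview: this generator emits an AVX-512 kernel that adds two f32 arrays
// elementwise and converts the sums to f16. The semantics are equivalent to
// the scalar loop below (a sketch for documentation, not part of the build;
// float16_t conversion follows common/float16.hpp):
//
//     for (size_t i = 0; i < p->nelems; ++i)
//         p->out[i] = float16_t(p->inp[i] + p->add[i]);
//
// where `p` is the f16_support::jit_call_t passed to the kernel.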

void jit_avx512_core_fp16_add_cvt_ps_to_f16_t::generate() {
    preamble();

    // Load simd_w_ f32 elements from each input (masked, zeroing inactive
    // lanes), add them, convert the f32 sums to f16 with rounding taken from
    // MXCSR (_op_mxcsr sets imm8 bit 2 of vcvtps2ph), and store the f16
    // results under the same mask.
    auto add_cvt = [&](size_t idx, Xbyak::Opmask ktail_mask) {
        vmovups(fp32_inp | ktail_mask | T_z,
                ptr[reg_inp + sizeof(float) * (idx)]);
        vaddps(fp32_inp | ktail_mask | T_z, fp32_inp,
                ptr[reg_add + sizeof(float) * (idx)]);

        vcvtps2ph(f16_out, fp32_inp, _op_mxcsr);

        vmovdqu16(yword[reg_out + sizeof(float16_t) * (idx)] | ktail_mask,
                f16_out);
    };

    // Load the kernel arguments from the jit_call_t passed in abi_param1.
    mov(reg_inp, ptr[abi_param1 + GET_OFF(inp)]);
    mov(reg_add, ptr[abi_param1 + GET_OFF(add)]);
    mov(reg_out, ptr[abi_param1 + GET_OFF(out)]);
    mov(reg_nelems, ptr[abi_param1 + GET_OFF(nelems)]);

    // Start with a full mask: all 16 (simd_w_) f32 lanes of a zmm active.
    mov(reg32_tail, 0xffff);
    kmovw(ktail_mask, reg32_tail);

    constexpr int n_unroll = 2; // unroll by powers of 2, from 2^n_unroll down to 2^0
    Xbyak::Label l_simd_loop[n_unroll + 2], l_simd_notail;
    for (int i = n_unroll; i >= 0; i--) {
        const int unroll = 1 << i; // unroll factor: 4, 2, then 1
        L(l_simd_loop[i + 1]);
        {
            // While at least simd_w_ * unroll elements remain, process them
            // in full-width chunks; otherwise fall through to the loop with
            // the next smaller unroll factor.
            cmp(reg_nelems, simd_w_ * unroll);
            jl(l_simd_loop[i], T_NEAR);
            for (int j = 0; j < simd_w_ * unroll; j += simd_w_) {
                add_cvt(j, ktail_mask);
            }
            add(reg_inp, simd_w_ * unroll * sizeof(float));
            add(reg_add, simd_w_ * unroll * sizeof(float));
            add(reg_out, simd_w_ * unroll * sizeof(float16_t));

            sub(reg_nelems, simd_w_ * unroll);
            jmp(l_simd_loop[i + 1], T_NEAR);
        }
    }
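    // Tail: at this point 0 <= reg_nelems < simd_w_, so at most one masked
    // vector iteration remains.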
    L(l_simd_loop[0]);
    test(reg_nelems, reg_nelems);
    jz(l_simd_notail);
    // JIT of `tail_mask_ = (1 << (nelems_ % simd_w_)) - 1;`; the modulo is
    // implicit here since reg_nelems is already < simd_w_.
    mov(reg32_mask, 1);
    // Copy the remaining element count into reg64_tail so that its low byte
    // (reg8_mask_shift) can serve as the variable shift count.
    mov(reg64_tail, reg_nelems);
    shl(reg32_mask, reg8_mask_shift);
    sub(reg32_mask, 1);
    kmovd(ktail_mask, reg32_mask);
    add_cvt(0, ktail_mask);
    L(l_simd_notail);

    postamble();
}
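
// Usage sketch (hypothetical, for illustration only): following the pattern
// of the analogous bf16 cvt kernels, a caller would fill an
// f16_support::jit_call_t and invoke the generated code, roughly:
//
//     jit_avx512_core_fp16_add_cvt_ps_to_f16_t cvt;
//     cvt.create_kernel(); // standard jit_generator entry point
//     f16_support::jit_call_t p;
//     p.inp = src;   // const float *
//     p.add = acc;   // const float *
//     p.out = dst;   // float16_t *
//     p.nelems = n;
//     cvt(&p);       // assumes an operator()(jit_call_t *) wrapper
//
// The exact invocation API is declared in jit_avx512_core_fp16cvt.hpp.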
#undef GET_OFF

} // namespace x64
} // namespace cpu
} // namespace impl
} // namespace dnnl