binary_format.cpp source code [oneDNN/src/gpu/jit/binary_format.cpp]

1	/*******************************************************************************
2	* Copyright 2019-2022 Intel Corporation
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*******************************************************************************/
16
17	#include "gpu/jit/binary_format.hpp"
18
19	#include "common/utils.hpp"
20	#include "gpu/compute/compute_engine.hpp"
21	#include "gpu/compute/compute_stream.hpp"
22	#include "gpu/jit/jit_generator.hpp"
23
// Sentinel values passed from the host as the scalar kernel arguments
// "src0".."src5"; the probe kernel compares each argument it receives against
// the same constant. The 64-bit ones are compared as two 32-bit halves.
#define MAGIC0 0xBEEFCAFEu
#define MAGIC1 0x3141592653589793ull
#define MAGIC2 0xBEAD
#define MAGIC3 0xFACE
#define MAGIC4 0x0123456789ABCDEFull
#define MAGIC5 0xFEDCBA9876543210ull
// Sentinel stored by the host in the buffer passed as "src_ptr"; the kernel
// loads it back through that pointer and compares.
#define MAGICPTR 0xABADFEEDu

// Work sizes used for both the global and the local range of the probe
// dispatch; the kernel compares its local size against these.
#define MAGICSIZEX 4
#define MAGICSIZEY 2
#define MAGICSIZEZ 1
35
36	namespace dnnl {
37	namespace impl {
38	namespace gpu {
39	namespace jit {
40
41	using namespace ngen;
42
43	template <HW hw>
44	class binary_format_kernel_t : public jit_generator<hw> {
45	NGEN_FORWARD_OPENCL(hw);
46
47	public:
48	binary_format_kernel_t() {
49
50	auto low_half = [](uint64_t q) -> uint32_t { return q & `0xFFFFFFFF`; };
51	auto high_half = [](uint64_t q) -> uint32_t { return q >> `32`; };
52
53	newArgument("src0", DataType::ud); // r5.4:ud
54	newArgument("src1", DataType::uq); // r5.3:uq
55	newArgument("src2", DataType::uw); // r6.0:uw
56	newArgument("src3", DataType::uw); // r6.2:uw
57	newArgument("src4", DataType::uq); // r6.1:uq
58	newArgument("src5", DataType::uq); // r6.2:uq
59	newArgument("src_ptr", ExternalArgumentType::GlobalPtr);
60	newArgument("ok", ExternalArgumentType::GlobalPtr);
61
62	setDefaultAutoSWSB();
63	requireSIMD((GRF::bytes(hw) == `64`) ? `16` : `8`);
64	requireLocalID(`3`); // r1-r3
65	requireLocalSize(); // r7.0-2:ud
66	finalizeInterface();
67
68	Label doWrite;
69
70	auto src0 = getArgument("src0");
71	auto src1 = getArgument("src1");
72	auto src2 = getArgument("src2");
73	auto src3 = getArgument("src3");
74	auto src4 = getArgument("src4");
75	auto src5 = getArgument("src5");
76	auto src_ptr = getArgument("src_ptr");
77	auto ok_surface = Surface(getArgumentSurface("ok"));
78
79	auto data = r30;
80	auto data2 = r31;
81	auto ok = data.ud(`0`);
82	auto header = r64;
83
84	prologue();
85	setDefaultNoMask();
86
87	// Default: test failure.
88	mov(`1`, ok, uint16_t(`0`));
89
90	// Validate scalar arguments
91	cmp(`1` \| eq \| f0[`0`], null.ud(), src0, uint32_t(MAGIC0));
92	jmpi(`1` \| ~f0[`0`], doWrite);
93	cmp(`1` \| eq \| f0[`0`], null.ud(), src1.ud(`0`), low_half(MAGIC1));
94	jmpi(`1` \| ~f0[`0`], doWrite);
95	cmp(`1` \| eq \| f0[`0`], null.ud(), src1.ud(`1`), high_half(MAGIC1));
96	jmpi(`1` \| ~f0[`0`], doWrite);
97	cmp(`1` \| eq \| f0[`0`], null.uw(), src2, uint16_t(MAGIC2));
98	jmpi(`1` \| ~f0[`0`], doWrite);
99	cmp(`1` \| eq \| f0[`0`], null.uw(), src3, uint16_t(MAGIC3));
100	jmpi(`1` \| ~f0[`0`], doWrite);
101	cmp(`1` \| eq \| f0[`0`], null.ud(), src4.ud(`0`), low_half(MAGIC4));
102	jmpi(`1` \| ~f0[`0`], doWrite);
103	cmp(`1` \| eq \| f0[`0`], null.ud(), src4.ud(`1`), high_half(MAGIC4));
104	jmpi(`1` \| ~f0[`0`], doWrite);
105	cmp(`1` \| eq \| f0[`0`], null.ud(), src5.ud(`0`), low_half(MAGIC5));
106	jmpi(`1` \| ~f0[`0`], doWrite);
107	cmp(`1` \| eq \| f0[`0`], null.ud(), src5.ud(`1`), high_half(MAGIC5));
108	jmpi(`1` \| ~f0[`0`], doWrite);
109
110	// Validate A64 pointer argument.
111	mov<uint32_t>(`2`, header[`0`](`1`), src_ptr.ud(`0`)(`1`));
112	load(`1` \| SWSB(sb0, `1`), data2, scattered_dword (), A64, header);
113	cmp(`1` \| eq \| f0[`0`] \| sb0.dst, null.ud(), data2.ud(`0`),
114	uint32_t(MAGICPTR));
115	jmpi(`1` \| ~f0[`0`], doWrite);
116
117	// Validate OCL local size arguments
118	cmp(`1` \| eq \| f0[`0`], null.ud(), getLocalSize(`0`), uint32_t(MAGICSIZEX));
119	jmpi(`1` \| ~f0[`0`], doWrite);
120	cmp(`1` \| eq \| f0[`0`], null.ud(), getLocalSize(`1`), uint32_t(MAGICSIZEY));
121	jmpi(`1` \| ~f0[`0`], doWrite);
122	cmp(`1` \| eq \| f0[`0`], null.ud(), getLocalSize(`2`), uint32_t(MAGICSIZEZ));
123	jmpi(`1` \| ~f0[`0`], doWrite);
124
125	// Test passed.
126	mov(`1`, ok, uint16_t(`1`));
127
128	mark(doWrite);
129
130	// Write out results.
131	if (hw >= HW::XeHPC) {
132	// stateless
133	mov<uint32_t>(`2`, header, getArgument("ok").d(`0`)(`1`));
134	store.ugm(`1` \| SWSB(sb2, `1`), D32(`1`), A64, header, data);
135	} else {
136	// bti surface
137	mov<uint32_t>(`1`, header, uint16_t(`0`));
138	store(`1` \| SWSB(sb2, `1`), scattered_dword (), ok_surface, header,
139	data);
140	}
141
142	if (hw >= HW::XeHP) memfence(sb2, header);
143	mov<uint32_t>(`8`, r127, r0);
144	threadend(SWSB(sb2, `1`), r127);
145	}
146
147	static compute::kernel_t make_kernel(compute::compute_engine_t *engine) {
148	compute::kernel_t kernel;
149
150	if (hw != HW::Unknown) {
151	binary_format_kernel_t<hw> binary_format_kernel;
152
153	auto status
154	= engine->create_kernel(&kernel, &binary_format_kernel, {});
155	if (status != status::success) return nullptr;
156	} else {
157	switch (engine->device_info()->gpu_arch()) {
158	case compute::gpu_arch_t::gen9:
159	kernel = binary_format_kernel_t<HW::Gen9>::make_kernel(
160	engine);
161	break;
162	case compute::gpu_arch_t::gen11:
163	kernel = binary_format_kernel_t<HW::Gen11>::make_kernel(
164	engine);
165	break;
166	case compute::gpu_arch_t::xe_lp:
167	kernel = binary_format_kernel_t<HW::XeLP>::make_kernel(
168	engine);
169	break;
170	case compute::gpu_arch_t::xe_hp:
171	kernel = binary_format_kernel_t<HW::XeHP>::make_kernel(
172	engine);
173	break;
174	case compute::gpu_arch_t::xe_hpg:
175	kernel = binary_format_kernel_t<HW::XeHPG>::make_kernel(
176	engine);
177	break;
178	case compute::gpu_arch_t::xe_hpc:
179	kernel = binary_format_kernel_t<HW::XeHPC>::make_kernel(
180	engine);
181	break;
182	case compute::gpu_arch_t::unknown: kernel = nullptr; break;
183	}
184	}
185	return kernel;
186	}
187	};
188
189	status_t gpu_supports_binary_format(bool ok, engine_t engine) {
190	ok = false*;
191
192	auto gpu_engine = utils::downcast<compute::compute_engine_t *>(engine);
193	if (!gpu_engine) return status::invalid_arguments;
194
195	stream_t *stream_generic;
196	auto status = gpu_engine->get_service_stream(stream_generic);
197	if (status != status::success) return status::runtime_error;
198
199	auto stream = utils::downcast<compute::compute_stream_t *>(stream_generic);
200	if (!stream) return status::invalid_arguments;
201
202	auto kernel = binary_format_kernel_t<HW::Unknown>::make_kernel(gpu_engine);
203	if (!kernel) return status::success;
204
205	compute::kernel_t realized_kernel;
206	CHECK(kernel.realize(&realized_kernel, engine, nullptr));
207
208	// Binary kernel check.
209	uint32_t magic0 = MAGIC0;
210	uint64_t magic1 = MAGIC1;
211	uint16_t magic2 = MAGIC2;
212	uint16_t magic3 = MAGIC3;
213	uint64_t magic4 = MAGIC4;
214	uint64_t magic5 = MAGIC5;
215	uint32_t magic_ptr = MAGICPTR;
216
217	size_t gws[`3`] = {MAGICSIZEX, MAGICSIZEY, MAGICSIZEZ};
218	size_t lws[`3`] = {MAGICSIZEX, MAGICSIZEY, MAGICSIZEZ};
219
220	memory_storage_t storage = nullptr*;
221	std::unique_ptr<memory_storage_t> magic_buf, result_buf;
222
223	status = engine->create_memory_storage(&storage, sizeof(int32_t));
224	if (status != status::success) return status::runtime_error;
225	magic_buf.reset(storage);
226
227	status = engine->create_memory_storage(&storage, sizeof(int32_t));
228	if (status != status::success) return status::runtime_error;
229	result_buf.reset(storage);
230
231	void magic_host = nullptr*;
232	magic_buf->map_data(&magic_host, nullptr, sizeof(int32_t));
233	if (!magic_host) return status::runtime_error;
234
235	*reinterpret_cast<uint32_t *>(magic_host) = magic_ptr;
236
237	magic_buf->unmap_data(magic_host, nullptr);
238
239	void result_host = nullptr*;
240	result_buf->map_data(&result_host, nullptr, sizeof(int32_t));
241	if (!result_host) return status::runtime_error;
242
243	*reinterpret_cast<uint32_t *>(result_host) = `0`;
244
245	result_buf->unmap_data(result_host, nullptr);
246
247	compute::kernel_arg_list_t arg_list;
248	arg_list.set(`0`, magic0);
249	arg_list.set(`1`, magic1);
250	arg_list.set(`2`, magic2);
251	arg_list.set(`3`, magic3);
252	arg_list.set(`4`, magic4);
253	arg_list.set(`5`, magic5);
254	arg_list.set(`6`, *magic_buf.get());
255	arg_list.set(`7`, *result_buf.get());
256
257	auto nd_range = compute::nd_range_t (gws, lws);
258
259	status = stream->parallel_for(nd_range, realized_kernel, arg_list);
260
261	if (status != status::success) return status::runtime_error;
262
263	status = stream->wait();
264	if (status != status::success) return status::runtime_error;
265
266	result_host = nullptr;
267	result_buf->map_data(&result_host, nullptr, sizeof(int32_t));
268	if (!result_host) return status::runtime_error;
269
270	auto result = *reinterpret_cast<uint32_t *>(result_host);
271
272	result_buf->unmap_data(result_host, nullptr);
273
274	*ok = (result != `0`);
275
276	return status::success;
277	}
278
279	} // namespace jit
280	} // namespace gpu
281	} // namespace impl
282	} // namespace dnnl
283

Browse the source code of oneDNN/src/gpu/jit/binary_format.cpp