1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#include "gpu/jit/binary_format.hpp"
18
19#include "common/utils.hpp"
20#include "gpu/compute/compute_engine.hpp"
21#include "gpu/compute/compute_stream.hpp"
22#include "gpu/jit/jit_generator.hpp"
23
// Expected values of the scalar kernel arguments src0..src5. The host passes
// them as arguments 0..5 and the generated kernel compares what it received
// against the same constants.
#define MAGIC0 0xBEEFCAFEu
#define MAGIC1 0x3141592653589793ull
#define MAGIC2 0xBEAD
#define MAGIC3 0xFACE
#define MAGIC4 0x0123456789ABCDEFull
#define MAGIC5 0xFEDCBA9876543210ull
// Value the host writes into the buffer passed as the "src_ptr" argument; the
// kernel loads one dword through that pointer and compares it to this.
#define MAGICPTR 0xABADFEEDu

// Work-group dimensions used for the test launch (also used as the global
// sizes); the kernel compares its local sizes against them.
#define MAGICSIZEX 4
#define MAGICSIZEY 2
#define MAGICSIZEZ 1
35
36namespace dnnl {
37namespace impl {
38namespace gpu {
39namespace jit {
40
41using namespace ngen;
42
43template <HW hw>
44class binary_format_kernel_t : public jit_generator<hw> {
45 NGEN_FORWARD_OPENCL(hw);
46
47public:
48 binary_format_kernel_t() {
49
50 auto low_half = [](uint64_t q) -> uint32_t { return q & 0xFFFFFFFF; };
51 auto high_half = [](uint64_t q) -> uint32_t { return q >> 32; };
52
53 newArgument("src0", DataType::ud); // r5.4:ud
54 newArgument("src1", DataType::uq); // r5.3:uq
55 newArgument("src2", DataType::uw); // r6.0:uw
56 newArgument("src3", DataType::uw); // r6.2:uw
57 newArgument("src4", DataType::uq); // r6.1:uq
58 newArgument("src5", DataType::uq); // r6.2:uq
59 newArgument("src_ptr", ExternalArgumentType::GlobalPtr);
60 newArgument("ok", ExternalArgumentType::GlobalPtr);
61
62 setDefaultAutoSWSB();
63 requireSIMD((GRF::bytes(hw) == 64) ? 16 : 8);
64 requireLocalID(3); // r1-r3
65 requireLocalSize(); // r7.0-2:ud
66 finalizeInterface();
67
68 Label doWrite;
69
70 auto src0 = getArgument("src0");
71 auto src1 = getArgument("src1");
72 auto src2 = getArgument("src2");
73 auto src3 = getArgument("src3");
74 auto src4 = getArgument("src4");
75 auto src5 = getArgument("src5");
76 auto src_ptr = getArgument("src_ptr");
77 auto ok_surface = Surface(getArgumentSurface("ok"));
78
79 auto data = r30;
80 auto data2 = r31;
81 auto ok = data.ud(0);
82 auto header = r64;
83
84 prologue();
85 setDefaultNoMask();
86
87 // Default: test failure.
88 mov(1, ok, uint16_t(0));
89
90 // Validate scalar arguments
91 cmp(1 | eq | f0[0], null.ud(), src0, uint32_t(MAGIC0));
92 jmpi(1 | ~f0[0], doWrite);
93 cmp(1 | eq | f0[0], null.ud(), src1.ud(0), low_half(MAGIC1));
94 jmpi(1 | ~f0[0], doWrite);
95 cmp(1 | eq | f0[0], null.ud(), src1.ud(1), high_half(MAGIC1));
96 jmpi(1 | ~f0[0], doWrite);
97 cmp(1 | eq | f0[0], null.uw(), src2, uint16_t(MAGIC2));
98 jmpi(1 | ~f0[0], doWrite);
99 cmp(1 | eq | f0[0], null.uw(), src3, uint16_t(MAGIC3));
100 jmpi(1 | ~f0[0], doWrite);
101 cmp(1 | eq | f0[0], null.ud(), src4.ud(0), low_half(MAGIC4));
102 jmpi(1 | ~f0[0], doWrite);
103 cmp(1 | eq | f0[0], null.ud(), src4.ud(1), high_half(MAGIC4));
104 jmpi(1 | ~f0[0], doWrite);
105 cmp(1 | eq | f0[0], null.ud(), src5.ud(0), low_half(MAGIC5));
106 jmpi(1 | ~f0[0], doWrite);
107 cmp(1 | eq | f0[0], null.ud(), src5.ud(1), high_half(MAGIC5));
108 jmpi(1 | ~f0[0], doWrite);
109
110 // Validate A64 pointer argument.
111 mov<uint32_t>(2, header[0](1), src_ptr.ud(0)(1));
112 load(1 | SWSB(sb0, 1), data2, scattered_dword(), A64, header);
113 cmp(1 | eq | f0[0] | sb0.dst, null.ud(), data2.ud(0),
114 uint32_t(MAGICPTR));
115 jmpi(1 | ~f0[0], doWrite);
116
117 // Validate OCL local size arguments
118 cmp(1 | eq | f0[0], null.ud(), getLocalSize(0), uint32_t(MAGICSIZEX));
119 jmpi(1 | ~f0[0], doWrite);
120 cmp(1 | eq | f0[0], null.ud(), getLocalSize(1), uint32_t(MAGICSIZEY));
121 jmpi(1 | ~f0[0], doWrite);
122 cmp(1 | eq | f0[0], null.ud(), getLocalSize(2), uint32_t(MAGICSIZEZ));
123 jmpi(1 | ~f0[0], doWrite);
124
125 // Test passed.
126 mov(1, ok, uint16_t(1));
127
128 mark(doWrite);
129
130 // Write out results.
131 if (hw >= HW::XeHPC) {
132 // stateless
133 mov<uint32_t>(2, header, getArgument("ok").d(0)(1));
134 store.ugm(1 | SWSB(sb2, 1), D32(1), A64, header, data);
135 } else {
136 // bti surface
137 mov<uint32_t>(1, header, uint16_t(0));
138 store(1 | SWSB(sb2, 1), scattered_dword(), ok_surface, header,
139 data);
140 }
141
142 if (hw >= HW::XeHP) memfence(sb2, header);
143 mov<uint32_t>(8, r127, r0);
144 threadend(SWSB(sb2, 1), r127);
145 }
146
147 static compute::kernel_t make_kernel(compute::compute_engine_t *engine) {
148 compute::kernel_t kernel;
149
150 if (hw != HW::Unknown) {
151 binary_format_kernel_t<hw> binary_format_kernel;
152
153 auto status
154 = engine->create_kernel(&kernel, &binary_format_kernel, {});
155 if (status != status::success) return nullptr;
156 } else {
157 switch (engine->device_info()->gpu_arch()) {
158 case compute::gpu_arch_t::gen9:
159 kernel = binary_format_kernel_t<HW::Gen9>::make_kernel(
160 engine);
161 break;
162 case compute::gpu_arch_t::gen11:
163 kernel = binary_format_kernel_t<HW::Gen11>::make_kernel(
164 engine);
165 break;
166 case compute::gpu_arch_t::xe_lp:
167 kernel = binary_format_kernel_t<HW::XeLP>::make_kernel(
168 engine);
169 break;
170 case compute::gpu_arch_t::xe_hp:
171 kernel = binary_format_kernel_t<HW::XeHP>::make_kernel(
172 engine);
173 break;
174 case compute::gpu_arch_t::xe_hpg:
175 kernel = binary_format_kernel_t<HW::XeHPG>::make_kernel(
176 engine);
177 break;
178 case compute::gpu_arch_t::xe_hpc:
179 kernel = binary_format_kernel_t<HW::XeHPC>::make_kernel(
180 engine);
181 break;
182 case compute::gpu_arch_t::unknown: kernel = nullptr; break;
183 }
184 }
185 return kernel;
186 }
187};
188
189status_t gpu_supports_binary_format(bool *ok, engine_t *engine) {
190 *ok = false;
191
192 auto gpu_engine = utils::downcast<compute::compute_engine_t *>(engine);
193 if (!gpu_engine) return status::invalid_arguments;
194
195 stream_t *stream_generic;
196 auto status = gpu_engine->get_service_stream(stream_generic);
197 if (status != status::success) return status::runtime_error;
198
199 auto stream = utils::downcast<compute::compute_stream_t *>(stream_generic);
200 if (!stream) return status::invalid_arguments;
201
202 auto kernel = binary_format_kernel_t<HW::Unknown>::make_kernel(gpu_engine);
203 if (!kernel) return status::success;
204
205 compute::kernel_t realized_kernel;
206 CHECK(kernel.realize(&realized_kernel, engine, nullptr));
207
208 // Binary kernel check.
209 uint32_t magic0 = MAGIC0;
210 uint64_t magic1 = MAGIC1;
211 uint16_t magic2 = MAGIC2;
212 uint16_t magic3 = MAGIC3;
213 uint64_t magic4 = MAGIC4;
214 uint64_t magic5 = MAGIC5;
215 uint32_t magic_ptr = MAGICPTR;
216
217 size_t gws[3] = {MAGICSIZEX, MAGICSIZEY, MAGICSIZEZ};
218 size_t lws[3] = {MAGICSIZEX, MAGICSIZEY, MAGICSIZEZ};
219
220 memory_storage_t *storage = nullptr;
221 std::unique_ptr<memory_storage_t> magic_buf, result_buf;
222
223 status = engine->create_memory_storage(&storage, sizeof(int32_t));
224 if (status != status::success) return status::runtime_error;
225 magic_buf.reset(storage);
226
227 status = engine->create_memory_storage(&storage, sizeof(int32_t));
228 if (status != status::success) return status::runtime_error;
229 result_buf.reset(storage);
230
231 void *magic_host = nullptr;
232 magic_buf->map_data(&magic_host, nullptr, sizeof(int32_t));
233 if (!magic_host) return status::runtime_error;
234
235 *reinterpret_cast<uint32_t *>(magic_host) = magic_ptr;
236
237 magic_buf->unmap_data(magic_host, nullptr);
238
239 void *result_host = nullptr;
240 result_buf->map_data(&result_host, nullptr, sizeof(int32_t));
241 if (!result_host) return status::runtime_error;
242
243 *reinterpret_cast<uint32_t *>(result_host) = 0;
244
245 result_buf->unmap_data(result_host, nullptr);
246
247 compute::kernel_arg_list_t arg_list;
248 arg_list.set(0, magic0);
249 arg_list.set(1, magic1);
250 arg_list.set(2, magic2);
251 arg_list.set(3, magic3);
252 arg_list.set(4, magic4);
253 arg_list.set(5, magic5);
254 arg_list.set(6, *magic_buf.get());
255 arg_list.set(7, *result_buf.get());
256
257 auto nd_range = compute::nd_range_t(gws, lws);
258
259 status = stream->parallel_for(nd_range, realized_kernel, arg_list);
260
261 if (status != status::success) return status::runtime_error;
262
263 status = stream->wait();
264 if (status != status::success) return status::runtime_error;
265
266 result_host = nullptr;
267 result_buf->map_data(&result_host, nullptr, sizeof(int32_t));
268 if (!result_host) return status::runtime_error;
269
270 auto result = *reinterpret_cast<uint32_t *>(result_host);
271
272 result_buf->unmap_data(result_host, nullptr);
273
274 *ok = (result != 0);
275
276 return status::success;
277}
278
279} // namespace jit
280} // namespace gpu
281} // namespace impl
282} // namespace dnnl
283