1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include "gpu/jit/binary_format.hpp" |
18 | |
19 | #include "common/utils.hpp" |
20 | #include "gpu/compute/compute_engine.hpp" |
21 | #include "gpu/compute/compute_stream.hpp" |
22 | #include "gpu/jit/jit_generator.hpp" |
23 | |
// Expected values of the scalar kernel arguments src0..src5. The host passes
// these values when launching the test kernel, and the kernel compares each
// received argument against the same constant.
#define MAGIC0 0xBEEFCAFEu
#define MAGIC1 0x3141592653589793ull
#define MAGIC2 0xBEAD
#define MAGIC3 0xFACE
#define MAGIC4 0x0123456789ABCDEFull
#define MAGIC5 0xFEDCBA9876543210ull
// Value the host stores in the buffer passed as "src_ptr"; the kernel loads a
// dword through that pointer and compares it against this constant.
#define MAGICPTR 0xABADFEEDu

// Work sizes used for the launch (both global and local); the kernel checks
// the local size it receives against these values.
#define MAGICSIZEX 4
#define MAGICSIZEY 2
#define MAGICSIZEZ 1
35 | |
36 | namespace dnnl { |
37 | namespace impl { |
38 | namespace gpu { |
39 | namespace jit { |
40 | |
41 | using namespace ngen; |
42 | |
43 | template <HW hw> |
44 | class binary_format_kernel_t : public jit_generator<hw> { |
45 | NGEN_FORWARD_OPENCL(hw); |
46 | |
47 | public: |
48 | binary_format_kernel_t() { |
49 | |
50 | auto low_half = [](uint64_t q) -> uint32_t { return q & 0xFFFFFFFF; }; |
51 | auto high_half = [](uint64_t q) -> uint32_t { return q >> 32; }; |
52 | |
53 | newArgument("src0" , DataType::ud); // r5.4:ud |
54 | newArgument("src1" , DataType::uq); // r5.3:uq |
55 | newArgument("src2" , DataType::uw); // r6.0:uw |
56 | newArgument("src3" , DataType::uw); // r6.2:uw |
57 | newArgument("src4" , DataType::uq); // r6.1:uq |
58 | newArgument("src5" , DataType::uq); // r6.2:uq |
59 | newArgument("src_ptr" , ExternalArgumentType::GlobalPtr); |
60 | newArgument("ok" , ExternalArgumentType::GlobalPtr); |
61 | |
62 | setDefaultAutoSWSB(); |
63 | requireSIMD((GRF::bytes(hw) == 64) ? 16 : 8); |
64 | requireLocalID(3); // r1-r3 |
65 | requireLocalSize(); // r7.0-2:ud |
66 | finalizeInterface(); |
67 | |
68 | Label doWrite; |
69 | |
70 | auto src0 = getArgument("src0" ); |
71 | auto src1 = getArgument("src1" ); |
72 | auto src2 = getArgument("src2" ); |
73 | auto src3 = getArgument("src3" ); |
74 | auto src4 = getArgument("src4" ); |
75 | auto src5 = getArgument("src5" ); |
76 | auto src_ptr = getArgument("src_ptr" ); |
77 | auto ok_surface = Surface(getArgumentSurface("ok" )); |
78 | |
79 | auto data = r30; |
80 | auto data2 = r31; |
81 | auto ok = data.ud(0); |
82 | auto = r64; |
83 | |
84 | prologue(); |
85 | setDefaultNoMask(); |
86 | |
87 | // Default: test failure. |
88 | mov(1, ok, uint16_t(0)); |
89 | |
90 | // Validate scalar arguments |
91 | cmp(1 | eq | f0[0], null.ud(), src0, uint32_t(MAGIC0)); |
92 | jmpi(1 | ~f0[0], doWrite); |
93 | cmp(1 | eq | f0[0], null.ud(), src1.ud(0), low_half(MAGIC1)); |
94 | jmpi(1 | ~f0[0], doWrite); |
95 | cmp(1 | eq | f0[0], null.ud(), src1.ud(1), high_half(MAGIC1)); |
96 | jmpi(1 | ~f0[0], doWrite); |
97 | cmp(1 | eq | f0[0], null.uw(), src2, uint16_t(MAGIC2)); |
98 | jmpi(1 | ~f0[0], doWrite); |
99 | cmp(1 | eq | f0[0], null.uw(), src3, uint16_t(MAGIC3)); |
100 | jmpi(1 | ~f0[0], doWrite); |
101 | cmp(1 | eq | f0[0], null.ud(), src4.ud(0), low_half(MAGIC4)); |
102 | jmpi(1 | ~f0[0], doWrite); |
103 | cmp(1 | eq | f0[0], null.ud(), src4.ud(1), high_half(MAGIC4)); |
104 | jmpi(1 | ~f0[0], doWrite); |
105 | cmp(1 | eq | f0[0], null.ud(), src5.ud(0), low_half(MAGIC5)); |
106 | jmpi(1 | ~f0[0], doWrite); |
107 | cmp(1 | eq | f0[0], null.ud(), src5.ud(1), high_half(MAGIC5)); |
108 | jmpi(1 | ~f0[0], doWrite); |
109 | |
110 | // Validate A64 pointer argument. |
111 | mov<uint32_t>(2, header[0](1), src_ptr.ud(0)(1)); |
112 | load(1 | SWSB(sb0, 1), data2, scattered_dword(), A64, header); |
113 | cmp(1 | eq | f0[0] | sb0.dst, null.ud(), data2.ud(0), |
114 | uint32_t(MAGICPTR)); |
115 | jmpi(1 | ~f0[0], doWrite); |
116 | |
117 | // Validate OCL local size arguments |
118 | cmp(1 | eq | f0[0], null.ud(), getLocalSize(0), uint32_t(MAGICSIZEX)); |
119 | jmpi(1 | ~f0[0], doWrite); |
120 | cmp(1 | eq | f0[0], null.ud(), getLocalSize(1), uint32_t(MAGICSIZEY)); |
121 | jmpi(1 | ~f0[0], doWrite); |
122 | cmp(1 | eq | f0[0], null.ud(), getLocalSize(2), uint32_t(MAGICSIZEZ)); |
123 | jmpi(1 | ~f0[0], doWrite); |
124 | |
125 | // Test passed. |
126 | mov(1, ok, uint16_t(1)); |
127 | |
128 | mark(doWrite); |
129 | |
130 | // Write out results. |
131 | if (hw >= HW::XeHPC) { |
132 | // stateless |
133 | mov<uint32_t>(2, header, getArgument("ok" ).d(0)(1)); |
134 | store.ugm(1 | SWSB(sb2, 1), D32(1), A64, header, data); |
135 | } else { |
136 | // bti surface |
137 | mov<uint32_t>(1, header, uint16_t(0)); |
138 | store(1 | SWSB(sb2, 1), scattered_dword(), ok_surface, header, |
139 | data); |
140 | } |
141 | |
142 | if (hw >= HW::XeHP) memfence(sb2, header); |
143 | mov<uint32_t>(8, r127, r0); |
144 | threadend(SWSB(sb2, 1), r127); |
145 | } |
146 | |
147 | static compute::kernel_t make_kernel(compute::compute_engine_t *engine) { |
148 | compute::kernel_t kernel; |
149 | |
150 | if (hw != HW::Unknown) { |
151 | binary_format_kernel_t<hw> binary_format_kernel; |
152 | |
153 | auto status |
154 | = engine->create_kernel(&kernel, &binary_format_kernel, {}); |
155 | if (status != status::success) return nullptr; |
156 | } else { |
157 | switch (engine->device_info()->gpu_arch()) { |
158 | case compute::gpu_arch_t::gen9: |
159 | kernel = binary_format_kernel_t<HW::Gen9>::make_kernel( |
160 | engine); |
161 | break; |
162 | case compute::gpu_arch_t::gen11: |
163 | kernel = binary_format_kernel_t<HW::Gen11>::make_kernel( |
164 | engine); |
165 | break; |
166 | case compute::gpu_arch_t::xe_lp: |
167 | kernel = binary_format_kernel_t<HW::XeLP>::make_kernel( |
168 | engine); |
169 | break; |
170 | case compute::gpu_arch_t::xe_hp: |
171 | kernel = binary_format_kernel_t<HW::XeHP>::make_kernel( |
172 | engine); |
173 | break; |
174 | case compute::gpu_arch_t::xe_hpg: |
175 | kernel = binary_format_kernel_t<HW::XeHPG>::make_kernel( |
176 | engine); |
177 | break; |
178 | case compute::gpu_arch_t::xe_hpc: |
179 | kernel = binary_format_kernel_t<HW::XeHPC>::make_kernel( |
180 | engine); |
181 | break; |
182 | case compute::gpu_arch_t::unknown: kernel = nullptr; break; |
183 | } |
184 | } |
185 | return kernel; |
186 | } |
187 | }; |
188 | |
189 | status_t gpu_supports_binary_format(bool *ok, engine_t *engine) { |
190 | *ok = false; |
191 | |
192 | auto gpu_engine = utils::downcast<compute::compute_engine_t *>(engine); |
193 | if (!gpu_engine) return status::invalid_arguments; |
194 | |
195 | stream_t *stream_generic; |
196 | auto status = gpu_engine->get_service_stream(stream_generic); |
197 | if (status != status::success) return status::runtime_error; |
198 | |
199 | auto stream = utils::downcast<compute::compute_stream_t *>(stream_generic); |
200 | if (!stream) return status::invalid_arguments; |
201 | |
202 | auto kernel = binary_format_kernel_t<HW::Unknown>::make_kernel(gpu_engine); |
203 | if (!kernel) return status::success; |
204 | |
205 | compute::kernel_t realized_kernel; |
206 | CHECK(kernel.realize(&realized_kernel, engine, nullptr)); |
207 | |
208 | // Binary kernel check. |
209 | uint32_t magic0 = MAGIC0; |
210 | uint64_t magic1 = MAGIC1; |
211 | uint16_t magic2 = MAGIC2; |
212 | uint16_t magic3 = MAGIC3; |
213 | uint64_t magic4 = MAGIC4; |
214 | uint64_t magic5 = MAGIC5; |
215 | uint32_t magic_ptr = MAGICPTR; |
216 | |
217 | size_t gws[3] = {MAGICSIZEX, MAGICSIZEY, MAGICSIZEZ}; |
218 | size_t lws[3] = {MAGICSIZEX, MAGICSIZEY, MAGICSIZEZ}; |
219 | |
220 | memory_storage_t *storage = nullptr; |
221 | std::unique_ptr<memory_storage_t> magic_buf, result_buf; |
222 | |
223 | status = engine->create_memory_storage(&storage, sizeof(int32_t)); |
224 | if (status != status::success) return status::runtime_error; |
225 | magic_buf.reset(storage); |
226 | |
227 | status = engine->create_memory_storage(&storage, sizeof(int32_t)); |
228 | if (status != status::success) return status::runtime_error; |
229 | result_buf.reset(storage); |
230 | |
231 | void *magic_host = nullptr; |
232 | magic_buf->map_data(&magic_host, nullptr, sizeof(int32_t)); |
233 | if (!magic_host) return status::runtime_error; |
234 | |
235 | *reinterpret_cast<uint32_t *>(magic_host) = magic_ptr; |
236 | |
237 | magic_buf->unmap_data(magic_host, nullptr); |
238 | |
239 | void *result_host = nullptr; |
240 | result_buf->map_data(&result_host, nullptr, sizeof(int32_t)); |
241 | if (!result_host) return status::runtime_error; |
242 | |
243 | *reinterpret_cast<uint32_t *>(result_host) = 0; |
244 | |
245 | result_buf->unmap_data(result_host, nullptr); |
246 | |
247 | compute::kernel_arg_list_t arg_list; |
248 | arg_list.set(0, magic0); |
249 | arg_list.set(1, magic1); |
250 | arg_list.set(2, magic2); |
251 | arg_list.set(3, magic3); |
252 | arg_list.set(4, magic4); |
253 | arg_list.set(5, magic5); |
254 | arg_list.set(6, *magic_buf.get()); |
255 | arg_list.set(7, *result_buf.get()); |
256 | |
257 | auto nd_range = compute::nd_range_t(gws, lws); |
258 | |
259 | status = stream->parallel_for(nd_range, realized_kernel, arg_list); |
260 | |
261 | if (status != status::success) return status::runtime_error; |
262 | |
263 | status = stream->wait(); |
264 | if (status != status::success) return status::runtime_error; |
265 | |
266 | result_host = nullptr; |
267 | result_buf->map_data(&result_host, nullptr, sizeof(int32_t)); |
268 | if (!result_host) return status::runtime_error; |
269 | |
270 | auto result = *reinterpret_cast<uint32_t *>(result_host); |
271 | |
272 | result_buf->unmap_data(result_host, nullptr); |
273 | |
274 | *ok = (result != 0); |
275 | |
276 | return status::success; |
277 | } |
278 | |
279 | } // namespace jit |
280 | } // namespace gpu |
281 | } // namespace impl |
282 | } // namespace dnnl |
283 | |