/*******************************************************************************
* Copyright 2021-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/jit/conv/gen_convolution.hpp"

#include <iostream>
#include <utility>

#include "common/impl_registration.hpp"
#include "common/utils.hpp"
#include "common/verbose.hpp"
#include "gpu/jit/ir/kernel_info.hpp"
#include "gpu/jit/reorder/reorder_kernel.hpp"
#include "gpu/jit/utils/utils.hpp"
#include "gpu/ocl/ocl_utils.hpp"

#include "gpu/jit/conv/config.hpp"
#include "gpu/jit/conv/conv_kernel.hpp"
#include "gpu/jit/conv/zero_out.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {
struct conv_pd_data_t {
    conv_config_t pd_cfg;
    tensor_config_t tensor_cfg;
    std::vector<kernel_info_t> kernel_infos;
};

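// Host-side implementation shared by the forward, backward-data and
// backward-weights convolution primitives below: builds the generated
// (nGEN) kernels described by the primitive descriptor and submits them.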
class gen_convolution_t {
public:
    static const int max_kernels = 16;

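    // Primitive descriptor initialization: verifies that nGEN kernels are
    // usable on this engine, initializes the PD-time configuration and
    // registers the kernel_info_t entries and scratchpad bookings.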
    template <typename T>
    static status_t init_pd(T *pd, engine_t *engine) {
        try {
            using compute::compute_engine_t;
            auto *compute_engine = utils::downcast<compute_engine_t *>(engine);

            if (!compute_engine->mayiuse_ngen_kernels())
                return status::unimplemented;
            if (!pd->set_default_alg_kind(alg_kind::convolution_direct))
                return status::unimplemented;

            conv_problem_t prb;
            CHECK(prb.init(engine, pd));

            pd->data = std::make_shared<conv_pd_data_t>();
            CHECK(init_pd_time_cfg(
                    prb, pd->data->pd_cfg, engine, pd, &pd->attr_));

            // XXX: This is to be removed once out-of-registers cases can be
            // handled properly during primitive creation.
            auto tmp_cfg = pd->data->pd_cfg;
            CHECK(init_cfg(tmp_cfg, pd));

            pd->data->tensor_cfg = get_tensor_config(pd->data->pd_cfg);
            pd->data->kernel_infos.reserve(max_kernels);
            CHECK(init_kernel_infos(pd));

            return status::success;
        } catch (std::runtime_error &err) {
            // If verbose is enabled, print the primitive case before
            // reporting the error.
            if (get_verbose())
                printf("onednn_verbose,error,%s\n", pd->info(engine));
            std::cerr << err.what() << "\n";
            return status::runtime_error;
        }
    }

    gen_convolution_t() = default;

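    // Primitive initialization: finalizes the kernel configuration and
    // generates one GPU kernel per kernel_info_t entry (convolution plus
    // optional pre/post reorder and zero-out kernels).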
    template <typename T>
    status_t init(T *primitive, engine_t *engine) {
        try {
            auto &data = *primitive->pd()->data;
            auto &tensor_cfg = data.tensor_cfg;
            auto cfg = data.pd_cfg;
            CHECK(init_cfg(cfg, primitive->pd()));

            ir_info() << "Configuration:" << std::endl;
            ir_info() << cfg;

            init_nd_ranges(primitive, cfg);

            auto &kernel_infos = data.kernel_infos;
            for (int i = 0; i < int(kernel_infos.size()); i++) {
                auto &info = kernel_infos[i];
                switch (info.id()) {
                    case kernel_id_t::convolution:
                        try {
                            kernels_.push_back(make_kernel<conv_kernel_t>(
                                    primitive, engine, cfg, info));
                        } catch (const ngen::out_of_registers_exception &) {
                            if (cfg.regs() < 256
                                    && cfg.hw_cfg().large_grf_support()) {
                                ir_warning()
                                        << "Failed to generate kernel with "
                                           "default register mode, attempting "
                                           "again with large_grf_mode "
                                           "enabled\n";
                                kernels_.push_back(make_kernel<conv_kernel_t>(
                                        primitive, engine, cfg, info,
                                        grf_mode_t::large));
                            } else {
                                // Rethrow when the large GRF mode is
                                // unavailable or already in use.
                                throw;
                            }
                        }
                        break;
                    case kernel_id_t::pre_reorder: {
                        auto src_layout
                                = tensor_cfg.user_layout(info.arg_name(1));
                        auto dst_layout
                                = tensor_cfg.compute_layout(info.arg_name(1));
                        kernels_.push_back(make_kernel<reorder_kernel_t>(
                                primitive, engine, cfg.exec_cfg(),
                                "conv_reorder", info, src_layout, dst_layout,
                                cfg.is_dpas_or_dpasw_fma(),
                                grf_mode_t::matches));
                        break;
                    }
                    case kernel_id_t::post_reorder: {
                        auto src_layout
                                = tensor_cfg.compute_layout(info.arg_name(0));
                        auto dst_layout
                                = tensor_cfg.user_layout(info.arg_name(0));
                        kernels_.push_back(make_kernel<reorder_kernel_t>(
                                primitive, engine, cfg.exec_cfg(),
                                "conv_reorder", info, src_layout, dst_layout,
                                cfg.is_dpas_or_dpasw_fma(),
                                grf_mode_t::matches));
                        break;
                    }
                    case kernel_id_t::zero_out:
                        if (can_skip_zero_out(info, cfg)) {
                            kernels_.emplace_back();
                            continue;
                        }
                        kernels_.push_back(make_kernel<zero_out_kernel_t>(
                                primitive, engine, cfg.exec_cfg(), info,
                                cfg.is_dpas_or_dpasw_fma(),
                                grf_mode_t::matches));
                        break;
                    default: ir_error_not_expected();
                }
                if (!kernels_[i]) return status::runtime_error;
            }
        } catch (std::runtime_error &err) {
            // If verbose is enabled, print the primitive case before
            // reporting the error.
            if (get_verbose())
                printf("onednn_verbose,error,%s\n",
                        primitive->pd()->info(engine));
            std::cerr << err.what() << "\n";
            return status::runtime_error;
        }

        return status::success;
    }

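    // No kernel arguments are currently backed by GPU resources; this only
    // verifies that none of the registered arguments requests one.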
    template <typename T>
    status_t init_res_storage(
            const T *primitive, engine_t *engine, gpu_resource_t *r) const {
        auto &data = *primitive->pd()->data;
        auto &kernel_infos = data.kernel_infos;
        for (int i = 0; i < int(kernel_infos.size()); i++) {
            auto &kernel_info = kernel_infos[i];
            for (int j = 0; j < kernel_info.nargs(); j++) {
                if (!kernel_info.is_resource(j)) continue;
                ir_error_not_expected();
            }
        }
        return status::success;
    }

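    // Submits the kernels for execution in stage order. Entries of kernels_
    // that are empty (skipped zero-out kernels) are counted as submitted but
    // not dispatched.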
    template <typename T>
    status_t execute(const T *primitive, const exec_ctx_t &ctx) const {
        auto &data = *primitive->pd()->data;
        auto &kernel_infos = data.kernel_infos;

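        // Kernels are submitted stage by stage, in increasing stage ID
        // order; max_stage is presumably a generous upper bound on the
        // number of distinct stages.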
        int max_stage = 100;
        int nsubmitted = 0;
        int nkernels = int(kernel_infos.size());
        for (int stage = 0; stage < max_stage; stage++) {
            for (int i = 0; i < nkernels; i++) {
                auto &info = kernel_infos[i];
                if (info.stage_id() != stage) continue;

                if (kernels_[i]) {
                    std::vector<memory_storage_wrapper_t> storage_list;
                    info.init_memory_storage_list(storage_list, ctx, primitive);

                    compute::kernel_arg_list_t arg_list;
                    info.set_args(arg_list, storage_list);

                    CHECK(primitive->parallel_for(
                            ctx, nd_ranges_[i], kernels_[i], arg_list));
                }
                nsubmitted++;
                if (nsubmitted == nkernels) break;
            }
        }

        return status::success;
    }

private:
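    // Appends a kernel_info_t entry with the given ID. kernel_infos is
    // reserved with max_kernels slots up front (see init_pd()), so the
    // returned reference remains valid across subsequent calls.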
    template <typename T>
    static kernel_info_t &create_kernel_info(T *pd, kernel_id_t kernel_id) {
        auto &infos = pd->data->kernel_infos;
        ir_assert((int)infos.size() + 1 <= max_kernels);
        infos.emplace_back();
        auto &ret = infos.back();
        ret.set_id(kernel_id);
        return ret;
    }

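    // Creates kernel_info_t entries for the convolution kernel and its
    // helper kernels (pre/post reorders, zero-out), registering their
    // arguments and booking scratchpad space for intermediate buffers.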
    template <typename T>
    static status_t init_kernel_infos(T *pd) {
        auto &data = *pd->data;
        auto &cfg = data.pd_cfg;

        auto scratchpad = pd->scratchpad_registry().registrar();
        auto &conv_info = create_kernel_info(pd, kernel_id_t::convolution);

        // Initialize kernel arguments. Each tensor is passed to the
        // convolution kernel either directly in the user layout or through
        // a scratchpad buffer in the compute layout when a reorder is
        // needed.
        uint32_t scratchpad_key = 1;
        for (auto &t : data.tensor_cfg.tensors()) {
            int compute_arg_key = t.arg_key;
            int user_arg_key = t.arg_key;
            size_t elems = t.compute_layout.elems();
            size_t compute_size = t.compute_layout.size();
            auto compute_buf = make_buffer(t.name);
            auto user_buf = (t.needs_reorder ? make_buffer(t.name + "_user")
                                             : compute_buf);

            if (user_arg_key == -1) {
                ir_assert(!t.needs_reorder);
                ir_assert(!t.needs_zero_out);
                ir_error_not_expected();
                continue;
            }

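            // A tensor that needs a reorder gets a scratchpad buffer in the
            // compute layout: a pre-reorder kernel fills it for inputs, and
            // a post-reorder kernel copies it back for outputs.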
            if (t.needs_reorder) {
                compute_arg_key = int(scratchpad_key);
                scratchpad.book(scratchpad_key, compute_size, 1,
                        ocl::OCL_BUFFER_ALIGNMENT);
                conv_info.register_scratchpad_arg(compute_buf, compute_arg_key,
                        /*is_input=*/t.is_input && !t.is_output, compute_size);
                scratchpad_key++;

                if (t.is_input) {
                    auto &reorder_info
                            = create_kernel_info(pd, kernel_id_t::pre_reorder);
                    reorder_info.register_user_arg(user_buf, user_arg_key,
                            /*is_input=*/true);
                    reorder_info.register_scratchpad_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/false, compute_size);
                    auto elems_var = var_t::make(type_t::u32(), "elems");
                    reorder_info.register_internal_arg(
                            elems_var, uint32_t(elems));
                    reorder_info.set_nd_range(reorder_kernel_t<>::nd_range(
                            cfg.exec_cfg(), t.user_layout, t.compute_layout));
                }
                if (t.is_output) {
                    auto &reorder_info
                            = create_kernel_info(pd, kernel_id_t::post_reorder);
                    reorder_info.register_scratchpad_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/true, compute_size);
                    reorder_info.register_user_arg(user_buf, user_arg_key,
                            /*is_input=*/false);
                    auto elems_var = var_t::make(type_t::u32(), "elems");
                    reorder_info.register_internal_arg(
                            elems_var, uint32_t(elems));
                    reorder_info.set_nd_range(reorder_kernel_t<>::nd_range(
                            cfg.exec_cfg(), t.compute_layout, t.user_layout));
                }
            }
            if (t.needs_zero_out) {
                auto &zero_out_info
                        = create_kernel_info(pd, kernel_id_t::zero_out);
                if (t.needs_reorder) {
                    zero_out_info.register_scratchpad_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/false, compute_size);
                } else {
                    zero_out_info.register_user_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/false);
                }
                auto size_var = var_t::make(type_t::u32(), "size");
                zero_out_info.register_internal_arg(
                        size_var, uint32_t(compute_size));
                zero_out_info.set_nd_range(zero_out_kernel_t<>::nd_range(
                        cfg.simd(), int(compute_size)));
            }
            if (!t.needs_reorder)
                conv_info.register_user_arg(user_buf, user_arg_key,
                        /*is_input=*/t.is_input && !t.is_output);
        }

        return status::success;
    }

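    // Fills nd_ranges_ with the dispatch range of each kernel, indexed
    // consistently with kernel_infos.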
    template <typename T>
    void init_nd_ranges(T *primitive, const conv_config_t &cfg) {
        auto *pd = primitive->pd();
        auto &data = *pd->data;
        int nkernels = int(data.kernel_infos.size());
        nd_ranges_.resize(nkernels);
        for (int i = 0; i < nkernels; i++) {
            auto &info = data.kernel_infos[i];
            switch (info.id()) {
                case kernel_id_t::convolution:
                    // Convolution kernel info is initialized at PD creation
                    // time, when ND range/grid information is not known yet,
                    // so we need to query the config directly here.
                    nd_ranges_[i] = cfg.nd_range();
                    break;
                case kernel_id_t::pre_reorder:
                case kernel_id_t::post_reorder:
                case kernel_id_t::zero_out:
                    nd_ranges_[i] = info.nd_range();
                    break;
                default: ir_error_not_expected();
            }
        }
    }

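    // A zero-out kernel is skipped when the configuration reports that
    // zero-initialization of the weights or bias buffer is not needed.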
    static bool can_skip_zero_out(
            const kernel_info_t &info, const conv_config_t &cfg) {
        ir_assert(info.id() == kernel_id_t::zero_out);
        auto &buf_name = info.arg_var(0).as<var_t>().name;
        if (buf_name == "wei") return cfg.can_skip_wei_zero_out();
        if (buf_name == "bia") return cfg.can_skip_bia_zero_out();
        ir_error_not_expected();
        return false;
    }

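    // Kernels and their dispatch ranges, indexed consistently with
    // pd()->data->kernel_infos. An empty (default-constructed) kernel marks
    // a skipped kernel.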
    std::vector<compute::kernel_t> kernels_;
    std::vector<compute::nd_range_t> nd_ranges_;
};

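// Thin wrappers: the forward, backward-data and backward-weights primitives
// all delegate to a shared gen_convolution_t implementation.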
status_t gen_convolution_fwd_t::pd_t::init(engine_t *engine) {
    if (!is_fwd()) return status::unimplemented;
    CHECK(gen_convolution_t::init_pd(this, engine));
    return status::success;
}

status_t gen_convolution_fwd_t::init(engine_t *engine) {
    impl_.reset(new gen_convolution_t());
    return impl_->init(this, engine);
}

status_t gen_convolution_fwd_t::execute(const exec_ctx_t &ctx) const {
    return impl_->execute(this, ctx);
}

status_t gen_convolution_fwd_t::init_res_storage(
        engine_t *engine, gpu_resource_t *r) const {
    return impl_->init_res_storage(this, engine, r);
}

status_t gen_convolution_bwd_data_t::pd_t::init(engine_t *engine) {
    if (!is_bwd_d()) return status::unimplemented;
    CHECK(gen_convolution_t::init_pd(this, engine));
    return status::success;
}

status_t gen_convolution_bwd_data_t::init(engine_t *engine) {
    impl_.reset(new gen_convolution_t());
    return impl_->init(this, engine);
}

status_t gen_convolution_bwd_data_t::execute(const exec_ctx_t &ctx) const {
    return impl_->execute(this, ctx);
}

status_t gen_convolution_bwd_data_t::init_res_storage(
        engine_t *engine, gpu_resource_t *r) const {
    return impl_->init_res_storage(this, engine, r);
}

status_t gen_convolution_bwd_weights_t::pd_t::init(engine_t *engine) {
    if (!is_bwd_w()) return status::unimplemented;
    CHECK(gen_convolution_t::init_pd(this, engine));
    return status::success;
}

status_t gen_convolution_bwd_weights_t::init(engine_t *engine) {
    impl_.reset(new gen_convolution_t());
    return impl_->init(this, engine);
}

status_t gen_convolution_bwd_weights_t::execute(const exec_ctx_t &ctx) const {
    return impl_->execute(this, ctx);
}

status_t gen_convolution_bwd_weights_t::init_res_storage(
        engine_t *engine, gpu_resource_t *r) const {
    return impl_->init_res_storage(this, engine, r);
}

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl