/*******************************************************************************
* Copyright 2021-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include "gpu/jit/conv/gen_convolution.hpp"

#include <iostream>
#include <utility>

#include "common/impl_registration.hpp"
#include "common/utils.hpp"
#include "common/verbose.hpp"
#include "gpu/jit/ir/kernel_info.hpp"
#include "gpu/jit/reorder/reorder_kernel.hpp"
#include "gpu/jit/utils/utils.hpp"
#include "gpu/ocl/ocl_utils.hpp"

#include "gpu/jit/conv/config.hpp"
#include "gpu/jit/conv/conv_kernel.hpp"
#include "gpu/jit/conv/zero_out.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace jit {

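// Data shared between primitive descriptor creation and primitive creation:
// the PD-time kernel configuration, the tensor layout configuration, and the
// per-kernel argument descriptions.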
struct conv_pd_data_t {
    conv_config_t pd_cfg;
    tensor_config_t tensor_cfg;
    std::vector<kernel_info_t> kernel_infos;
};

class gen_convolution_t {
public:
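    // Upper bound on the number of kernels a single primitive may create.
    // kernel_infos is reserved to this size up front so that references
    // returned by create_kernel_info() remain valid as entries are added.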
    static const int max_kernels = 16;

    template <typename T>
    static status_t init_pd(T *pd, engine_t *engine) {
        try {
            using compute::compute_engine_t;
            auto *compute_engine = utils::downcast<compute_engine_t *>(engine);

            if (!compute_engine->mayiuse_ngen_kernels())
                return status::unimplemented;
            if (!pd->set_default_alg_kind(alg_kind::convolution_direct))
                return status::unimplemented;

            conv_problem_t prb;
            CHECK(prb.init(engine, pd));

            pd->data = std::make_shared<conv_pd_data_t>();
            CHECK(init_pd_time_cfg(
                    prb, pd->data->pd_cfg, engine, pd, &pd->attr_));

            // XXX: This is to be removed once we are able to properly handle
            // out-of-registers cases during primitive creation.
            auto tmp_cfg = pd->data->pd_cfg;
            CHECK(init_cfg(tmp_cfg, pd));

            pd->data->tensor_cfg = get_tensor_config(pd->data->pd_cfg);
            pd->data->kernel_infos.reserve(max_kernels);
            CHECK(init_kernel_infos(pd));

            return status::success;
        } catch (std::runtime_error &err) {
            // If verbose is enabled, print the primitive case before
            // returning an error status.
            if (get_verbose())
                printf("onednn_verbose,error,%s\n", pd->info(engine));
            std::cerr << err.what() << "\n";
            return status::runtime_error;
        }
    }

    gen_convolution_t() = default;

    template <typename T>
    status_t init(T *primitive, engine_t *engine) {
        try {
            auto &data = *primitive->pd()->data;
            auto &tensor_cfg = data.tensor_cfg;
            auto cfg = data.pd_cfg;
            CHECK(init_cfg(cfg, primitive->pd()));

            ir_info() << "Configuration:" << std::endl;
            ir_info() << cfg;

            init_nd_ranges(primitive, cfg);

            auto &kernel_infos = data.kernel_infos;
            for (int i = 0; i < int(kernel_infos.size()); i++) {
                auto &info = kernel_infos[i];
                switch (info.id()) {
                    case kernel_id_t::convolution:
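                        // Try the default GRF mode first; if kernel
                        // generation runs out of registers, retry with the
                        // large GRF mode when the hardware supports it.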
                        try {
                            kernels_.push_back(make_kernel<conv_kernel_t>(
                                    primitive, engine, cfg, info));
                        } catch (const ngen::out_of_registers_exception &) {
                            if (cfg.regs() < 256
                                    && cfg.hw_cfg().large_grf_support()) {
                                ir_warning()
                                        << "Failed to generate kernel with "
                                           "default register mode, attempting "
                                           "again with large_grf_mode "
                                           "enabled\n";
                                kernels_.push_back(make_kernel<conv_kernel_t>(
                                        primitive, engine, cfg, info,
                                        grf_mode_t::large));
                            } else {
                                throw;
                            }
                        }
                        break;
                    case kernel_id_t::pre_reorder: {
                        auto src_layout
                                = tensor_cfg.user_layout(info.arg_name(1));
                        auto dst_layout
                                = tensor_cfg.compute_layout(info.arg_name(1));
                        kernels_.push_back(make_kernel<reorder_kernel_t>(
                                primitive, engine, cfg.exec_cfg(),
                                "conv_reorder", info, src_layout, dst_layout,
                                cfg.is_dpas_or_dpasw_fma(),
                                grf_mode_t::matches));
                        break;
                    }
                    case kernel_id_t::post_reorder: {
                        auto src_layout
                                = tensor_cfg.compute_layout(info.arg_name(0));
                        auto dst_layout
                                = tensor_cfg.user_layout(info.arg_name(0));
                        kernels_.push_back(make_kernel<reorder_kernel_t>(
                                primitive, engine, cfg.exec_cfg(),
                                "conv_reorder", info, src_layout, dst_layout,
                                cfg.is_dpas_or_dpasw_fma(),
                                grf_mode_t::matches));
                        break;
                    }
                    case kernel_id_t::zero_out:
                        if (can_skip_zero_out(info, cfg)) {
                            kernels_.emplace_back();
                            continue;
                        }
                        kernels_.push_back(make_kernel<zero_out_kernel_t>(
                                primitive, engine, cfg.exec_cfg(), info,
                                cfg.is_dpas_or_dpasw_fma(),
                                grf_mode_t::matches));
                        break;
                    default: ir_error_not_expected();
                }
                if (!kernels_[i]) return status::runtime_error;
            }
        } catch (std::runtime_error &err) {
            // If verbose is enabled, print the primitive case before
            // returning an error status.
            if (get_verbose())
                printf("onednn_verbose,error,%s\n",
                        primitive->pd()->info(engine));
            std::cerr << err.what() << "\n";
            return status::runtime_error;
        }

        return status::success;
    }

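    // No resource-bound arguments are expected for this implementation; the
    // loop below only verifies that none were registered.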
    template <typename T>
    status_t init_res_storage(
            const T *primitive, engine_t *engine, gpu_resource_t *r) const {
        auto &data = *primitive->pd()->data;
        auto &kernel_infos = data.kernel_infos;
        for (int i = 0; i < int(kernel_infos.size()); i++) {
            auto &kernel_info = kernel_infos[i];
            for (int j = 0; j < kernel_info.nargs(); j++) {
                if (!kernel_info.is_resource(j)) continue;
                ir_error_not_expected();
            }
        }
        return status::success;
    }

    template <typename T>
    status_t execute(const T *primitive, const exec_ctx_t &ctx) const {
        auto &data = *primitive->pd()->data;
        auto &kernel_infos = data.kernel_infos;

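        // Kernels are enqueued stage by stage: a kernel whose stage ID is S
        // is submitted only after all kernels from earlier stages. An empty
        // kernel entry (e.g. a skipped zero-out) still counts as submitted.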
        int max_stage = 100;
        int nsubmitted = 0;
        int nkernels = int(kernel_infos.size());
        for (int stage = 0; stage < max_stage; stage++) {
            for (int i = 0; i < nkernels; i++) {
                auto &info = kernel_infos[i];
                if (info.stage_id() != stage) continue;

                if (kernels_[i]) {
                    std::vector<memory_storage_wrapper_t> storage_list;
                    info.init_memory_storage_list(storage_list, ctx, primitive);

                    compute::kernel_arg_list_t arg_list;
                    info.set_args(arg_list, storage_list);

                    CHECK(primitive->parallel_for(
                            ctx, nd_ranges_[i], kernels_[i], arg_list));
                }
                nsubmitted++;
                if (nsubmitted == nkernels) break;
            }
        }

        return status::success;
    }

private:
    template <typename T>
    static kernel_info_t &create_kernel_info(T *pd, kernel_id_t kernel_id) {
        auto &infos = pd->data->kernel_infos;
        ir_assert((int)infos.size() + 1 <= max_kernels);
        infos.emplace_back();
        auto &ret = infos.back();
        ret.set_id(kernel_id);
        return ret;
    }

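    // Creates kernel_info_t entries for the main convolution kernel and for
    // any auxiliary kernels (pre/post reorders, zero-out), registering their
    // arguments and booking scratchpad space where a tensor needs a separate
    // compute-layout buffer.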
    template <typename T>
    static status_t init_kernel_infos(T *pd) {
        auto &data = *pd->data;
        auto &cfg = data.pd_cfg;

        auto scratchpad = pd->scratchpad_registry().registrar();
        auto &conv_info = create_kernel_info(pd, kernel_id_t::convolution);

        // Initialize kernel arguments.
        uint32_t scratchpad_key = 1;
        for (auto &t : data.tensor_cfg.tensors()) {
            int compute_arg_key = t.arg_key;
            int user_arg_key = t.arg_key;
            size_t elems = t.compute_layout.elems();
            size_t compute_size = t.compute_layout.size();
            auto compute_buf = make_buffer(t.name);
            auto user_buf = (t.needs_reorder ? make_buffer(t.name + "_user")
                                             : compute_buf);

            if (user_arg_key == -1) {
                ir_assert(!t.needs_reorder);
                ir_assert(!t.needs_zero_out);
                ir_error_not_expected();
                continue;
            }

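            // A tensor whose user layout differs from the compute layout is
            // backed by a scratchpad buffer: a pre-reorder kernel fills it
            // for inputs and a post-reorder kernel copies it back to the
            // user buffer for outputs.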
            if (t.needs_reorder) {
                compute_arg_key = int(scratchpad_key);
                scratchpad.book(scratchpad_key, compute_size, 1,
                        ocl::OCL_BUFFER_ALIGNMENT);
                conv_info.register_scratchpad_arg(compute_buf, compute_arg_key,
                        /*is_input=*/t.is_input && !t.is_output, compute_size);
                scratchpad_key++;

                if (t.is_input) {
                    auto &reorder_info
                            = create_kernel_info(pd, kernel_id_t::pre_reorder);
                    reorder_info.register_user_arg(user_buf, user_arg_key,
                            /*is_input=*/true);
                    reorder_info.register_scratchpad_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/false, compute_size);
                    auto elems_var = var_t::make(type_t::u32(), "elems");
                    reorder_info.register_internal_arg(
                            elems_var, uint32_t(elems));
                    reorder_info.set_nd_range(reorder_kernel_t<>::nd_range(
                            cfg.exec_cfg(), t.user_layout, t.compute_layout));
                }
                if (t.is_output) {
                    auto &reorder_info
                            = create_kernel_info(pd, kernel_id_t::post_reorder);
                    reorder_info.register_scratchpad_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/true, compute_size);
                    reorder_info.register_user_arg(user_buf, user_arg_key,
                            /*is_input=*/false);
                    auto elems_var = var_t::make(type_t::u32(), "elems");
                    reorder_info.register_internal_arg(
                            elems_var, uint32_t(elems));
                    reorder_info.set_nd_range(reorder_kernel_t<>::nd_range(
                            cfg.exec_cfg(), t.compute_layout, t.user_layout));
                }
            }
            if (t.needs_zero_out) {
                auto &zero_out_info
                        = create_kernel_info(pd, kernel_id_t::zero_out);
                if (t.needs_reorder) {
                    zero_out_info.register_scratchpad_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/false, compute_size);
                } else {
                    zero_out_info.register_user_arg(compute_buf,
                            compute_arg_key,
                            /*is_input=*/false);
                }
                auto size_var = var_t::make(type_t::u32(), "size");
                zero_out_info.register_internal_arg(
                        size_var, uint32_t(compute_size));
                zero_out_info.set_nd_range(zero_out_kernel_t<>::nd_range(
                        cfg.simd(), int(compute_size)));
            }
            if (!t.needs_reorder)
                conv_info.register_user_arg(user_buf, user_arg_key,
                        /*is_input=*/t.is_input && !t.is_output);
        }

        return status::success;
    }

    template <typename T>
    void init_nd_ranges(T *primitive, const conv_config_t &cfg) {
        auto *pd = primitive->pd();
        auto &data = *pd->data;
        int nkernels = int(data.kernel_infos.size());
        nd_ranges_.resize(nkernels);
        for (int i = 0; i < nkernels; i++) {
            auto &info = data.kernel_infos[i];
            switch (info.id()) {
                case kernel_id_t::convolution:
                    // Convolution kernel info is initialized at PD creation
                    // time, when the ND range/grid information is not known
                    // yet, so we need to query the config directly here.
                    nd_ranges_[i] = cfg.nd_range();
                    break;
                case kernel_id_t::pre_reorder:
                case kernel_id_t::post_reorder:
                case kernel_id_t::zero_out:
                    nd_ranges_[i] = info.nd_range();
                    break;
                default: ir_error_not_expected();
            }
        }
    }

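    // Zero-out may be skipped when the configuration guarantees that the
    // weights or bias buffer does not need to be cleared before the main
    // kernel runs.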
    static bool can_skip_zero_out(
            const kernel_info_t &info, const conv_config_t &cfg) {
        ir_assert(info.id() == kernel_id_t::zero_out);
        auto &buf_name = info.arg_var(0).as<var_t>().name;
        if (buf_name == "wei") return cfg.can_skip_wei_zero_out();
        if (buf_name == "bia") return cfg.can_skip_bia_zero_out();
        ir_error_not_expected();
        return false;
    }

    std::vector<compute::kernel_t> kernels_;
    std::vector<compute::nd_range_t> nd_ranges_;
};

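// The fwd/bwd_data/bwd_weights primitives below are thin wrappers that
// forward to the shared gen_convolution_t implementation.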
status_t gen_convolution_fwd_t::pd_t::init(engine_t *engine) {
    if (!is_fwd()) return status::unimplemented;
    CHECK(gen_convolution_t::init_pd(this, engine));
    return status::success;
}

status_t gen_convolution_fwd_t::init(engine_t *engine) {
    impl_.reset(new gen_convolution_t());
    return impl_->init(this, engine);
}

status_t gen_convolution_fwd_t::execute(const exec_ctx_t &ctx) const {
    return impl_->execute(this, ctx);
}

status_t gen_convolution_fwd_t::init_res_storage(
        engine_t *engine, gpu_resource_t *r) const {
    return impl_->init_res_storage(this, engine, r);
}

status_t gen_convolution_bwd_data_t::pd_t::init(engine_t *engine) {
    if (!is_bwd_d()) return status::unimplemented;
    CHECK(gen_convolution_t::init_pd(this, engine));
    return status::success;
}

status_t gen_convolution_bwd_data_t::init_res_storage(
        engine_t *engine, gpu_resource_t *r) const {
    return impl_->init_res_storage(this, engine, r);
}

status_t gen_convolution_bwd_weights_t::pd_t::init(engine_t *engine) {
    if (!is_bwd_w()) return status::unimplemented;
    CHECK(gen_convolution_t::init_pd(this, engine));
    return status::success;
}

status_t gen_convolution_bwd_data_t::init(engine_t *engine) {
    impl_.reset(new gen_convolution_t());
    return impl_->init(this, engine);
}

status_t gen_convolution_bwd_data_t::execute(const exec_ctx_t &ctx) const {
    return impl_->execute(this, ctx);
}

status_t gen_convolution_bwd_weights_t::init(engine_t *engine) {
    impl_.reset(new gen_convolution_t());
    return impl_->init(this, engine);
}

status_t gen_convolution_bwd_weights_t::init_res_storage(
        engine_t *engine, gpu_resource_t *r) const {
    return impl_->init_res_storage(this, engine, r);
}

status_t gen_convolution_bwd_weights_t::execute(const exec_ctx_t &ctx) const {
    return impl_->execute(this, ctx);
}

} // namespace jit
} // namespace gpu
} // namespace impl
} // namespace dnnl