1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#include <algorithm>
18#include "gpu/ocl/ref_reorder.hpp"
19
20#include "common/utils.hpp"
21#include "gpu/ocl/ocl_stream.hpp"
22#include "gpu/ocl/ocl_utils.hpp"
23namespace dnnl {
24namespace impl {
25namespace gpu {
26namespace ocl {
27
28using namespace dnnl::impl::memory_tracking::names;
29
30status_t ref_reorder_t::pd_t::init_conf(engine_t *engine) {
31 using namespace format_tag;
32
33 const memory_desc_wrapper src_mdw(src_md());
34 const memory_desc_wrapper dst_mdw(dst_md());
35
36 conf.src_md_info = memory_desc_info_t::create(src_mdw);
37 conf.dst_md_info = memory_desc_info_t::create(dst_mdw);
38
39 status_t status = status::success;
40
41 const auto &padded_dims = dst_mdw.padded_dims();
42 conf.src_quant = {attr(), src_mdw, DNNL_ARG_SRC};
43 conf.dst_quant = {attr(), dst_mdw, DNNL_ARG_DST};
44 conf.sum_quant = {attr()};
45 conf.has_padding = !src_mdw.is_dense() || !dst_mdw.is_dense();
46 conf.ndims = src_mdw.ndims();
47 conf.nelems = utils::array_product(padded_dims, conf.ndims);
48
49 conf.sub_group_size = 1;
50
51 if (conf.nelems == 0) return status::success;
52
53 auto *compute_engine = utils::downcast<compute::compute_engine_t *>(engine);
54
55 dim_t blocks[MAX_NDIMS] = {1, 1, 1, 1, 1, 1};
56
57 conf.dispatch = compute_engine->create_dispatch(dst_mdw.md_);
58
59 blocks[2] = blocks[3] = blocks[4] = blocks[5] = 0;
60
61 for (int i = 0; i < MAX_NDIMS; ++i) {
62 auto dim_str = utils::format("D%d", i);
63 if (i < dst_mdw.ndims()) {
64 int dim = padded_dims[i];
65 // if needed to align vectorized dim with vector size, pad that dim again
66 conf.dispatch.define_dim(dim_str, i, dim, blocks[i]);
67 } else {
68 conf.dispatch.define_dim(dim_str, 1);
69 }
70 }
71
72 conf.dispatch.generate();
73 return status;
74}
75
76status_t ref_reorder_t::pd_t::init_kernel_ctx(
77 compute::kernel_ctx_t &kernel_ctx) const {
78 using namespace format_tag;
79
80 const memory_desc_wrapper src_mdw(src_md());
81 const memory_desc_wrapper dst_mdw(dst_md());
82
83 if (conf.nelems == 0) return status::success;
84
85 kernel_ctx.define_int("NDIMS", conf.ndims);
86 kernel_ctx.add_option("-cl-std=CL2.0");
87
88 conf.src_quant.define_macros(kernel_ctx, "SRC");
89 conf.dst_quant.define_macros(kernel_ctx, "DST");
90 conf.sum_quant.define_macros(kernel_ctx, "SUM");
91
92 def_dispatch(kernel_ctx, conf.dispatch);
93
94 kernel_ctx.define_int("REF_REORDER", 1);
95
96 kernel_ctx.define_int("PAD_FILL_ZERO", conf.has_padding);
97
98 def_memory_desc_info(kernel_ctx, conf.src_md_info, "SRC");
99 def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST");
100
101 kernel_ctx.print_options();
102 return status::success;
103}
104
105void ref_reorder_t::pd_t::init_scratchpad() {
106 if (conf.src_quant.with_scale()) {
107 auto scratchpad = scratchpad_registry().registrar();
108 scratchpad.book(memory_tracking::names::key_reorder_src_scales,
109 conf.src_quant.num_scales(), sizeof(float),
110 OCL_BUFFER_ALIGNMENT);
111 }
112 if (conf.dst_quant.with_scale()) {
113 auto scratchpad = scratchpad_registry().registrar();
114 scratchpad.book(memory_tracking::names::key_reorder_dst_scales,
115 conf.dst_quant.num_scales(), sizeof(float),
116 OCL_BUFFER_ALIGNMENT);
117 }
118}
119
120status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const {
121
122 status_t status = status::success;
123
124 auto &src = CTX_IN_STORAGE(DNNL_ARG_FROM);
125 auto &dst = CTX_OUT_STORAGE(DNNL_ARG_TO);
126 CHECK(status);
127
128 const auto &conf = pd()->conf;
129 if (conf.nelems == 0) return status::success;
130
131 compute::kernel_arg_list_t arg_list;
132 arg_list.set(0, src);
133 arg_list.set(1, dst);
134
135 arg_list.set(2, conf.src_quant.scales(ctx));
136 arg_list.set(3, conf.src_quant.zero_points(ctx));
137 arg_list.set(4, conf.dst_quant.scales(ctx));
138 arg_list.set(5, conf.dst_quant.zero_points(ctx));
139
140 arg_list.set(6, conf.sum_quant.scales());
141 arg_list.set(7, conf.sum_quant.zero_points());
142
143 auto nd_range = conf.dispatch.nd_range();
144
145 status = parallel_for(ctx, nd_range, kernel_, arg_list);
146
147 return status;
148}
149
150} // namespace ocl
151} // namespace gpu
152} // namespace impl
153} // namespace dnnl
154