1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <algorithm> |
18 | #include "gpu/ocl/ref_reorder.hpp" |
19 | |
20 | #include "common/utils.hpp" |
21 | #include "gpu/ocl/ocl_stream.hpp" |
22 | #include "gpu/ocl/ocl_utils.hpp" |
23 | namespace dnnl { |
24 | namespace impl { |
25 | namespace gpu { |
26 | namespace ocl { |
27 | |
28 | using namespace dnnl::impl::memory_tracking::names; |
29 | |
30 | status_t ref_reorder_t::pd_t::init_conf(engine_t *engine) { |
31 | using namespace format_tag; |
32 | |
33 | const memory_desc_wrapper src_mdw(src_md()); |
34 | const memory_desc_wrapper dst_mdw(dst_md()); |
35 | |
36 | conf.src_md_info = memory_desc_info_t::create(src_mdw); |
37 | conf.dst_md_info = memory_desc_info_t::create(dst_mdw); |
38 | |
39 | status_t status = status::success; |
40 | |
41 | const auto &padded_dims = dst_mdw.padded_dims(); |
42 | conf.src_quant = {attr(), src_mdw, DNNL_ARG_SRC}; |
43 | conf.dst_quant = {attr(), dst_mdw, DNNL_ARG_DST}; |
44 | conf.sum_quant = {attr()}; |
45 | conf.has_padding = !src_mdw.is_dense() || !dst_mdw.is_dense(); |
46 | conf.ndims = src_mdw.ndims(); |
47 | conf.nelems = utils::array_product(padded_dims, conf.ndims); |
48 | |
49 | conf.sub_group_size = 1; |
50 | |
51 | if (conf.nelems == 0) return status::success; |
52 | |
53 | auto *compute_engine = utils::downcast<compute::compute_engine_t *>(engine); |
54 | |
55 | dim_t blocks[MAX_NDIMS] = {1, 1, 1, 1, 1, 1}; |
56 | |
57 | conf.dispatch = compute_engine->create_dispatch(dst_mdw.md_); |
58 | |
59 | blocks[2] = blocks[3] = blocks[4] = blocks[5] = 0; |
60 | |
61 | for (int i = 0; i < MAX_NDIMS; ++i) { |
62 | auto dim_str = utils::format("D%d" , i); |
63 | if (i < dst_mdw.ndims()) { |
64 | int dim = padded_dims[i]; |
65 | // if needed to align vectorized dim with vector size, pad that dim again |
66 | conf.dispatch.define_dim(dim_str, i, dim, blocks[i]); |
67 | } else { |
68 | conf.dispatch.define_dim(dim_str, 1); |
69 | } |
70 | } |
71 | |
72 | conf.dispatch.generate(); |
73 | return status; |
74 | } |
75 | |
76 | status_t ref_reorder_t::pd_t::init_kernel_ctx( |
77 | compute::kernel_ctx_t &kernel_ctx) const { |
78 | using namespace format_tag; |
79 | |
80 | const memory_desc_wrapper src_mdw(src_md()); |
81 | const memory_desc_wrapper dst_mdw(dst_md()); |
82 | |
83 | if (conf.nelems == 0) return status::success; |
84 | |
85 | kernel_ctx.define_int("NDIMS" , conf.ndims); |
86 | kernel_ctx.add_option("-cl-std=CL2.0" ); |
87 | |
88 | conf.src_quant.define_macros(kernel_ctx, "SRC" ); |
89 | conf.dst_quant.define_macros(kernel_ctx, "DST" ); |
90 | conf.sum_quant.define_macros(kernel_ctx, "SUM" ); |
91 | |
92 | def_dispatch(kernel_ctx, conf.dispatch); |
93 | |
94 | kernel_ctx.define_int("REF_REORDER" , 1); |
95 | |
96 | kernel_ctx.define_int("PAD_FILL_ZERO" , conf.has_padding); |
97 | |
98 | def_memory_desc_info(kernel_ctx, conf.src_md_info, "SRC" ); |
99 | def_memory_desc_info(kernel_ctx, conf.dst_md_info, "DST" ); |
100 | |
101 | kernel_ctx.print_options(); |
102 | return status::success; |
103 | } |
104 | |
105 | void ref_reorder_t::pd_t::init_scratchpad() { |
106 | if (conf.src_quant.with_scale()) { |
107 | auto scratchpad = scratchpad_registry().registrar(); |
108 | scratchpad.book(memory_tracking::names::key_reorder_src_scales, |
109 | conf.src_quant.num_scales(), sizeof(float), |
110 | OCL_BUFFER_ALIGNMENT); |
111 | } |
112 | if (conf.dst_quant.with_scale()) { |
113 | auto scratchpad = scratchpad_registry().registrar(); |
114 | scratchpad.book(memory_tracking::names::key_reorder_dst_scales, |
115 | conf.dst_quant.num_scales(), sizeof(float), |
116 | OCL_BUFFER_ALIGNMENT); |
117 | } |
118 | } |
119 | |
120 | status_t ref_reorder_t::execute(const exec_ctx_t &ctx) const { |
121 | |
122 | status_t status = status::success; |
123 | |
124 | auto &src = CTX_IN_STORAGE(DNNL_ARG_FROM); |
125 | auto &dst = CTX_OUT_STORAGE(DNNL_ARG_TO); |
126 | CHECK(status); |
127 | |
128 | const auto &conf = pd()->conf; |
129 | if (conf.nelems == 0) return status::success; |
130 | |
131 | compute::kernel_arg_list_t arg_list; |
132 | arg_list.set(0, src); |
133 | arg_list.set(1, dst); |
134 | |
135 | arg_list.set(2, conf.src_quant.scales(ctx)); |
136 | arg_list.set(3, conf.src_quant.zero_points(ctx)); |
137 | arg_list.set(4, conf.dst_quant.scales(ctx)); |
138 | arg_list.set(5, conf.dst_quant.zero_points(ctx)); |
139 | |
140 | arg_list.set(6, conf.sum_quant.scales()); |
141 | arg_list.set(7, conf.sum_quant.zero_points()); |
142 | |
143 | auto nd_range = conf.dispatch.nd_range(); |
144 | |
145 | status = parallel_for(ctx, nd_range, kernel_, arg_list); |
146 | |
147 | return status; |
148 | } |
149 | |
150 | } // namespace ocl |
151 | } // namespace gpu |
152 | } // namespace impl |
153 | } // namespace dnnl |
154 | |