1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_OCL_REF_SUM_HPP |
18 | #define GPU_OCL_REF_SUM_HPP |
19 | |
20 | #include "common/primitive.hpp" |
21 | #include "common/reorder.hpp" |
22 | #include "common/reorder_pd.hpp" |
23 | #include "common/stream.hpp" |
24 | #include "gpu/gpu_primitive.hpp" |
25 | #include "gpu/gpu_resource.hpp" |
26 | #include "gpu/gpu_sum_pd.hpp" |
27 | #include "gpu/ocl/ocl_utils.hpp" |
28 | |
29 | namespace dnnl { |
30 | namespace impl { |
31 | namespace gpu { |
32 | namespace ocl { |
33 | |
34 | struct ref_sum_t : public gpu_primitive_t { |
35 | using gpu_primitive_t::gpu_primitive_t; |
36 | struct pd_t : public gpu_sum_pd_t { |
37 | using gpu_sum_pd_t::gpu_sum_pd_t; |
38 | |
39 | pd_t(const pd_t &rhs) = default; |
40 | ~pd_t() = default; |
41 | |
42 | DECLARE_SUM_PD_T("ref:any" , ref_sum_t); |
43 | |
44 | status_t init(engine_t *engine) { |
45 | bool ok = gpu_sum_pd_t::init(engine) == status::success; |
46 | if (!ok) return status::unimplemented; |
47 | |
48 | if (has_zero_dim_memory()) return status::success; |
49 | reorder_pds_.resize(n_ + need_output_reorder()); |
50 | for (int i = 0; i < n_; ++i) { |
51 | primitive_attr_t r_attr; |
52 | r_attr.scales_.set(DNNL_ARG_SRC, 0); |
53 | if (i != 0) r_attr.post_ops_.append_sum(1.0); |
54 | |
55 | CHECK(reorder_primitive_desc_create(reorder_pds_[i], engine, |
56 | src_md(i), dst_acc_md(), &r_attr)); |
57 | } |
58 | |
59 | if (need_output_reorder()) { |
60 | CHECK(reorder_primitive_desc_create( |
61 | reorder_pds_[n_], engine, dst_acc_md(), dst_md())); |
62 | } |
63 | |
64 | scale_md_.ndims = 1; |
65 | scale_md_.dims[0] = 1; |
66 | scale_md_.data_type = data_type::f32; |
67 | CHECK(memory_desc_init_by_tag(scale_md_, format_tag::x)); |
68 | |
69 | init_scratchpad(); |
70 | return status::success; |
71 | } |
72 | |
73 | std::vector<std::shared_ptr<primitive_desc_t>> reorder_pds_; |
74 | memory_desc_t scale_md_; |
75 | |
76 | private: |
77 | void init_scratchpad() { |
78 | using namespace memory_tracking::names; |
79 | auto scratchpad = scratchpad_registry().registrar(); |
80 | if (need_output_reorder()) { |
81 | const memory_desc_wrapper dst_acc_d(dst_acc_md()); |
82 | scratchpad.book(key_sum_reduction, dst_acc_d.size(), 1, |
83 | OCL_BUFFER_ALIGNMENT); |
84 | } |
85 | |
86 | for (size_t i = 0; i < reorder_pds_.size(); i++) { |
87 | scratchpad.book(key_nested_multiple + (int)i, |
88 | reorder_pds_[i]->scratchpad_registry()); |
89 | } |
90 | } |
91 | }; |
92 | |
93 | status_t init(engine_t *engine) override { |
94 | const size_t n = pd()->reorder_pds_.size(); |
95 | reorders_.resize(n); |
96 | for (size_t i = 0; i < n; ++i) { |
97 | CHECK(create_nested_primitive( |
98 | reorders_[i], pd()->reorder_pds_[i], engine)); |
99 | } |
100 | return status::success; |
101 | } |
102 | |
103 | status_t init_res_storage( |
104 | engine_t *engine, gpu_resource_t *r) const override { |
105 | const dim_t count = pd()->n_inputs(); |
106 | const float *s_data = pd()->scales(); |
107 | for (dim_t i = 0; i < count; i++) { |
108 | // copy scales on gpu |
109 | const size_t size = sizeof(float); |
110 | std::unique_ptr<memory_storage_t> scales; |
111 | memory_storage_t *scale = nullptr; |
112 | auto s = engine->create_memory_storage(&scale, size); |
113 | if (s != status::success) return s; |
114 | float *mapped_mem_storage = nullptr; |
115 | s = scale->map_data((void **)&mapped_mem_storage, nullptr, size); |
116 | if (s != status::success) return s; |
117 | if (!mapped_mem_storage) return status::out_of_memory; |
118 | mapped_mem_storage[0] = s_data[i]; |
119 | s = scale->unmap_data((void *)mapped_mem_storage, nullptr); |
120 | if (s != status::success) return s; |
121 | scales.reset(scale); |
122 | r->add_memory_storage(i, std::move(scales)); |
123 | } |
124 | return status::success; |
125 | } |
126 | status_t execute(const exec_ctx_t &ctx) const override { |
127 | using namespace memory_tracking::names; |
128 | |
129 | if (pd()->has_zero_dim_memory()) return status::success; |
130 | |
131 | const auto n = pd()->n_inputs(); |
132 | exec_args_t r_args; |
133 | |
134 | std::unique_ptr<memory_t> p_temp_dst_acc; |
135 | if (pd()->need_output_reorder()) { |
136 | auto scratchpad = ctx.get_scratchpad_grantor().get_memory_storage( |
137 | key_sum_reduction); |
138 | CHECK(safe_ptr_assign(p_temp_dst_acc, |
139 | new memory_t(ctx.stream()->engine(), pd()->dst_acc_md(), |
140 | std::move(scratchpad)))); |
141 | } |
142 | |
143 | auto dst = ctx.args().at(DNNL_ARG_DST); |
144 | memory_arg_t dst_acc = {p_temp_dst_acc.get(), false}; |
145 | |
146 | for (int i = 0; i < n; ++i) { |
147 | memory_t scales_mem( |
148 | ctx.stream()->engine(), &pd()->scale_md_, nullptr); |
149 | scales_mem.set_data_handle(CTX_GPU_RES_STORAGE(i).data_handle()); |
150 | r_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i); |
151 | r_args[DNNL_ARG_DST] = pd()->need_output_reorder() ? dst_acc : dst; |
152 | r_args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = {&scales_mem, true}; |
153 | exec_ctx_t r_ctx(ctx, std::move(r_args)); |
154 | |
155 | nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]); |
156 | r_ctx.set_scratchpad_grantor(ns.grantor()); |
157 | CHECK(reorders_[i]->execute(r_ctx)); |
158 | #ifndef DNNL_SYCL_CUDA |
159 | ctx.stream()->wait(); |
160 | #endif |
161 | } |
162 | |
163 | if (pd()->need_output_reorder()) { |
164 | dst_acc = {p_temp_dst_acc.get(), true}; |
165 | r_args[DNNL_ARG_SRC] = dst_acc; |
166 | r_args[DNNL_ARG_DST] = dst; |
167 | exec_ctx_t r_ctx(ctx, std::move(r_args)); |
168 | |
169 | nested_scratchpad_t ns(ctx, key_nested_multiple + n, reorders_[n]); |
170 | r_ctx.set_scratchpad_grantor(ns.grantor()); |
171 | CHECK(reorders_[n]->execute(r_ctx)); |
172 | } |
173 | |
174 | return status::success; |
175 | } |
176 | |
177 | private: |
178 | const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } |
179 | std::vector<std::shared_ptr<primitive_t>> reorders_; |
180 | }; |
181 | |
182 | } // namespace ocl |
183 | } // namespace gpu |
184 | } // namespace impl |
185 | } // namespace dnnl |
186 | |
187 | #endif |
188 | |