/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_OCL_REF_SUM_HPP
#define GPU_OCL_REF_SUM_HPP

#include "common/primitive.hpp"
#include "common/reorder.hpp"
#include "common/reorder_pd.hpp"
#include "common/stream.hpp"
#include "gpu/gpu_primitive.hpp"
#include "gpu/gpu_resource.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/ocl/ocl_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
struct ref_sum_t : public gpu_primitive_t {
    using gpu_primitive_t::gpu_primitive_t;
    struct pd_t : public gpu_sum_pd_t {
        using gpu_sum_pd_t::gpu_sum_pd_t;

        pd_t(const pd_t &rhs) = default;
        ~pd_t() = default;

        DECLARE_SUM_PD_T("ref:any", ref_sum_t);

        status_t init(engine_t *engine) {
            bool ok = gpu_sum_pd_t::init(engine) == status::success;
            if (!ok) return status::unimplemented;

            if (has_zero_dim_memory()) return status::success;
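
            // One reorder per input, plus an optional final reorder from the
            // accumulation buffer to the user's destination. Inputs after the
            // first accumulate via a sum post-op with scale 1.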
            reorder_pds_.resize(n_ + need_output_reorder());
            for (int i = 0; i < n_; ++i) {
                primitive_attr_t r_attr;
                r_attr.scales_.set(DNNL_ARG_SRC, 0);
                if (i != 0) r_attr.post_ops_.append_sum(1.0);

                CHECK(reorder_primitive_desc_create(reorder_pds_[i], engine,
                        src_md(i), dst_acc_md(), &r_attr));
            }

            if (need_output_reorder()) {
                CHECK(reorder_primitive_desc_create(
                        reorder_pds_[n_], engine, dst_acc_md(), dst_md()));
            }

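            // Memory descriptor for the single f32 scale value that is
            // passed to each reorder as a runtime DNNL_ARG_ATTR_SCALES
            // argument during execution.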
            scale_md_.ndims = 1;
            scale_md_.dims[0] = 1;
            scale_md_.data_type = data_type::f32;
            CHECK(memory_desc_init_by_tag(scale_md_, format_tag::x));

            init_scratchpad();
            return status::success;
        }

        std::vector<std::shared_ptr<primitive_desc_t>> reorder_pds_;
        memory_desc_t scale_md_;

    private:
        void init_scratchpad() {
            using namespace memory_tracking::names;
            auto scratchpad = scratchpad_registry().registrar();
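            // Book space for the intermediate accumulation buffer (only when
            // a final output reorder is needed) and for the nested reorders'
            // own scratchpad requirements.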
            if (need_output_reorder()) {
                const memory_desc_wrapper dst_acc_d(dst_acc_md());
                scratchpad.book(key_sum_reduction, dst_acc_d.size(), 1,
                        OCL_BUFFER_ALIGNMENT);
            }

            for (size_t i = 0; i < reorder_pds_.size(); i++) {
                scratchpad.book(key_nested_multiple + (int)i,
                        reorder_pds_[i]->scratchpad_registry());
            }
        }
    };

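    // Instantiate the nested reorder primitives from the descriptors
    // prepared in pd_t::init().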
    status_t init(engine_t *engine) override {
        const size_t n = pd()->reorder_pds_.size();
        reorders_.resize(n);
        for (size_t i = 0; i < n; ++i) {
            CHECK(create_nested_primitive(
                    reorders_[i], pd()->reorder_pds_[i], engine));
        }
        return status::success;
    }

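    // Copy the host-side per-input scales into engine-resident memory
    // storage so that each reorder can consume its scale as a runtime
    // argument on the GPU.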
    status_t init_res_storage(
            engine_t *engine, gpu_resource_t *r) const override {
        const dim_t count = pd()->n_inputs();
        const float *s_data = pd()->scales();
        for (dim_t i = 0; i < count; i++) {
            // Copy the i-th scale into a freshly created GPU memory storage.
            const size_t size = sizeof(float);
            std::unique_ptr<memory_storage_t> scales;
            memory_storage_t *scale = nullptr;
            auto s = engine->create_memory_storage(&scale, size);
            if (s != status::success) return s;
            float *mapped_mem_storage = nullptr;
            s = scale->map_data((void **)&mapped_mem_storage, nullptr, size);
            if (s != status::success) return s;
            if (!mapped_mem_storage) return status::out_of_memory;
            mapped_mem_storage[0] = s_data[i];
            s = scale->unmap_data((void *)mapped_mem_storage, nullptr);
            if (s != status::success) return s;
            scales.reset(scale);
            r->add_memory_storage(i, std::move(scales));
        }
        return status::success;
    }

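    // Execute the chain of reorders: each input is reordered into either the
    // final destination or the intermediate accumulation buffer, and a last
    // reorder (if needed) converts the accumulated result to the destination
    // type and layout.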
    status_t execute(const exec_ctx_t &ctx) const override {
        using namespace memory_tracking::names;

        if (pd()->has_zero_dim_memory()) return status::success;

        const auto n = pd()->n_inputs();
        exec_args_t r_args;

        std::unique_ptr<memory_t> p_temp_dst_acc;
        if (pd()->need_output_reorder()) {
            auto scratchpad = ctx.get_scratchpad_grantor().get_memory_storage(
                    key_sum_reduction);
            CHECK(safe_ptr_assign(p_temp_dst_acc,
                    new memory_t(ctx.stream()->engine(), pd()->dst_acc_md(),
                            std::move(scratchpad))));
        }

        auto dst = ctx.args().at(DNNL_ARG_DST);
        memory_arg_t dst_acc = {p_temp_dst_acc.get(), false};

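        // Run one reorder per input. The per-input scale stored in the GPU
        // resource is passed as a runtime DNNL_ARG_ATTR_SCALES argument.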
        for (int i = 0; i < n; ++i) {
            memory_t scales_mem(
                    ctx.stream()->engine(), &pd()->scale_md_, nullptr);
            scales_mem.set_data_handle(CTX_GPU_RES_STORAGE(i).data_handle());
            r_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i);
            r_args[DNNL_ARG_DST] = pd()->need_output_reorder() ? dst_acc : dst;
            r_args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = {&scales_mem, true};
            exec_ctx_t r_ctx(ctx, std::move(r_args));

            nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]);
            r_ctx.set_scratchpad_grantor(ns.grantor());
            CHECK(reorders_[i]->execute(r_ctx));
#ifndef DNNL_SYCL_CUDA
            ctx.stream()->wait();
#endif
        }

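        // Final reorder from the accumulation buffer to the user's
        // destination memory descriptor.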
        if (pd()->need_output_reorder()) {
            dst_acc = {p_temp_dst_acc.get(), true};
            r_args[DNNL_ARG_SRC] = dst_acc;
            r_args[DNNL_ARG_DST] = dst;
            exec_ctx_t r_ctx(ctx, std::move(r_args));

            nested_scratchpad_t ns(ctx, key_nested_multiple + n, reorders_[n]);
            r_ctx.set_scratchpad_grantor(ns.grantor());
            CHECK(reorders_[n]->execute(r_ctx));
        }

        return status::success;
    }

private:
    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
    std::vector<std::shared_ptr<primitive_t>> reorders_;
};

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif