/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_OCL_REF_SUM_HPP
#define GPU_OCL_REF_SUM_HPP

#include "common/primitive.hpp"
#include "common/reorder.hpp"
#include "common/reorder_pd.hpp"
#include "common/stream.hpp"
#include "gpu/gpu_primitive.hpp"
#include "gpu/gpu_resource.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/ocl/ocl_utils.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
struct ref_sum_t : public gpu_primitive_t {
    using gpu_primitive_t::gpu_primitive_t;
    struct pd_t : public gpu_sum_pd_t {
        using gpu_sum_pd_t::gpu_sum_pd_t;

        pd_t(const pd_t &rhs) = default;
        ~pd_t() = default;

        DECLARE_SUM_PD_T("ref:any", ref_sum_t);

        status_t init(engine_t *engine) {
            bool ok = gpu_sum_pd_t::init(engine) == status::success;
            if (!ok) return status::unimplemented;

            if (has_zero_dim_memory()) return status::success;
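
            // One reorder per input, plus an optional final reorder from the
            // accumulation buffer to the user's destination. Inputs after the
            // first accumulate via a sum post-op with scale 1.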
            reorder_pds_.resize(n_ + need_output_reorder());
            for (int i = 0; i < n_; ++i) {
                primitive_attr_t r_attr;
                r_attr.scales_.set(DNNL_ARG_SRC, 0);
                if (i != 0) r_attr.post_ops_.append_sum(1.0);

                CHECK(reorder_primitive_desc_create(reorder_pds_[i], engine,
                        src_md(i), dst_acc_md(), &r_attr));
            }

            if (need_output_reorder()) {
                CHECK(reorder_primitive_desc_create(
                        reorder_pds_[n_], engine, dst_acc_md(), dst_md()));
            }

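            // Memory descriptor for the single f32 scale value that is
            // passed to each reorder as a runtime DNNL_ARG_ATTR_SCALES
            // argument during execution.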
            scale_md_.ndims = 1;
            scale_md_.dims[0] = 1;
            scale_md_.data_type = data_type::f32;
            CHECK(memory_desc_init_by_tag(scale_md_, format_tag::x));

            init_scratchpad();
            return status::success;
        }

        std::vector<std::shared_ptr<primitive_desc_t>> reorder_pds_;
        memory_desc_t scale_md_;

    private:
        void init_scratchpad() {
            using namespace memory_tracking::names;
            auto scratchpad = scratchpad_registry().registrar();
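            // Book space for the intermediate accumulation buffer (only when
            // a final output reorder is needed) and for the nested reorders'
            // own scratchpad requirements.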
            if (need_output_reorder()) {
                const memory_desc_wrapper dst_acc_d(dst_acc_md());
                scratchpad.book(key_sum_reduction, dst_acc_d.size(), 1,
                        OCL_BUFFER_ALIGNMENT);
            }

            for (size_t i = 0; i < reorder_pds_.size(); i++) {
                scratchpad.book(key_nested_multiple + (int)i,
                        reorder_pds_[i]->scratchpad_registry());
            }
        }
    };

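    // Instantiate the nested reorder primitives from the descriptors
    // prepared in pd_t::init().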
    status_t init(engine_t *engine) override {
        const size_t n = pd()->reorder_pds_.size();
        reorders_.resize(n);
        for (size_t i = 0; i < n; ++i) {
            CHECK(create_nested_primitive(
                    reorders_[i], pd()->reorder_pds_[i], engine));
        }
        return status::success;
    }

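    // Copy the host-side per-input scales into engine-resident memory
    // storage so that each reorder can consume its scale as a runtime
    // argument on the GPU.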
    status_t init_res_storage(
            engine_t *engine, gpu_resource_t *r) const override {
        const dim_t count = pd()->n_inputs();
        const float *s_data = pd()->scales();
        for (dim_t i = 0; i < count; i++) {
            // Copy the i-th scale into a freshly created GPU memory storage.
            const size_t size = sizeof(float);
            std::unique_ptr<memory_storage_t> scales;
            memory_storage_t *scale = nullptr;
            auto s = engine->create_memory_storage(&scale, size);
            if (s != status::success) return s;
            float *mapped_mem_storage = nullptr;
            s = scale->map_data((void **)&mapped_mem_storage, nullptr, size);
            if (s != status::success) return s;
            if (!mapped_mem_storage) return status::out_of_memory;
            mapped_mem_storage[0] = s_data[i];
            s = scale->unmap_data((void *)mapped_mem_storage, nullptr);
            if (s != status::success) return s;
            scales.reset(scale);
            r->add_memory_storage(i, std::move(scales));
        }
        return status::success;
    }

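    // Execute the chain of reorders: each input is reordered into either the
    // final destination or the intermediate accumulation buffer, and a last
    // reorder (if needed) converts the accumulated result to the destination
    // type and layout.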
    status_t execute(const exec_ctx_t &ctx) const override {
        using namespace memory_tracking::names;

        if (pd()->has_zero_dim_memory()) return status::success;

        const auto n = pd()->n_inputs();
        exec_args_t r_args;

        std::unique_ptr<memory_t> p_temp_dst_acc;
        if (pd()->need_output_reorder()) {
            auto scratchpad = ctx.get_scratchpad_grantor().get_memory_storage(
                    key_sum_reduction);
            CHECK(safe_ptr_assign(p_temp_dst_acc,
                    new memory_t(ctx.stream()->engine(), pd()->dst_acc_md(),
                            std::move(scratchpad))));
        }

        auto dst = ctx.args().at(DNNL_ARG_DST);
        memory_arg_t dst_acc = {p_temp_dst_acc.get(), false};

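        // Run one reorder per input. The per-input scale stored in the GPU
        // resource is passed as a runtime DNNL_ARG_ATTR_SCALES argument.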
        for (int i = 0; i < n; ++i) {
            memory_t scales_mem(
                    ctx.stream()->engine(), &pd()->scale_md_, nullptr);
            scales_mem.set_data_handle(CTX_GPU_RES_STORAGE(i).data_handle());
            r_args[DNNL_ARG_SRC] = ctx.args().at(DNNL_ARG_MULTIPLE_SRC + i);
            r_args[DNNL_ARG_DST] = pd()->need_output_reorder() ? dst_acc : dst;
            r_args[DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC] = {&scales_mem, true};
            exec_ctx_t r_ctx(ctx, std::move(r_args));

            nested_scratchpad_t ns(ctx, key_nested_multiple + i, reorders_[i]);
            r_ctx.set_scratchpad_grantor(ns.grantor());
            CHECK(reorders_[i]->execute(r_ctx));
#ifndef DNNL_SYCL_CUDA
            ctx.stream()->wait();
#endif
        }

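        // Final reorder from the accumulation buffer to the user's
        // destination memory descriptor.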
        if (pd()->need_output_reorder()) {
            dst_acc = {p_temp_dst_acc.get(), true};
            r_args[DNNL_ARG_SRC] = dst_acc;
            r_args[DNNL_ARG_DST] = dst;
            exec_ctx_t r_ctx(ctx, std::move(r_args));

            nested_scratchpad_t ns(ctx, key_nested_multiple + n, reorders_[n]);
            r_ctx.set_scratchpad_grantor(ns.grantor());
            CHECK(reorders_[n]->execute(r_ctx));
        }

        return status::success;
    }

private:
    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
    std::vector<std::shared_ptr<primitive_t>> reorders_;
};

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif