// gen9_sum.hpp source code [oneDNN/src/gpu/ocl/gen9_sum.hpp]

1	/*******************************************************************************
2	* Copyright 2020-2022 Intel Corporation
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*******************************************************************************/
16
17	#ifndef GPU_OCL_GEN9_SUM_HPP
18	#define GPU_OCL_GEN9_SUM_HPP
19
20	#include <assert.h>
21
22	#include "common/c_types_map.hpp"
23	#include "common/primitive.hpp"
24	#include "gpu/compute/compute.hpp"
25	#include "gpu/gpu_primitive.hpp"
26	#include "gpu/gpu_resource.hpp"
27	#include "gpu/gpu_sum_pd.hpp"
28	#include "gpu/ocl/ocl_stream.hpp"
29	#include "gpu/ocl/ocl_utils.hpp"
30	#include "gpu/primitive_conf.hpp"
31
32	namespace dnnl {
33	namespace impl {
34	namespace gpu {
35	namespace ocl {
36
37	struct gen9_sum_t : public gpu_primitive_t {
38	using gpu_primitive_t::gpu_primitive_t;
39	struct pd_t : public gpu_sum_pd_t {
40	using gpu_sum_pd_t::gpu_sum_pd_t;
41
42	DECLARE_SUM_PD_T("ocl:gen9:any", gen9_sum_t);
43
44	status_t init(engine_t *engine) {
45	const int n = n_inputs();
46
47	if (n > max_num_arrs) return status::unimplemented;
48
49	const memory_desc_wrapper o_d(dst_md());
50
51	// for IO bytes less than 1MB fall back into many_inputs_sum kernel for better performance.
52	size_t io_bytes = (n + `1`) * o_d.data_type_size() * o_d.nelems(true);
53	if (io_bytes < `1024` * `1024`) return status::unimplemented;
54
55	bool ok = gpu_sum_pd_t::init(engine) == status::success
56	&& !memory_desc_ndims_ok(dst_md());
57
58	if (!ok) return status::unimplemented;
59
60	for (int i = `0`; i < n; ++i) {
61	const memory_desc_wrapper i_d(src_md(i));
62	if (i_d != o_d) return status::unimplemented;
63	}
64
65	return status::success;
66	}
67	};
68
69	status_t init(engine_t *engine) override {
70	compute::kernel_ctx_t kernel_ctx;
71
72	const memory_desc_wrapper data_d(pd()->dst_md());
73	const memory_desc_wrapper data_s(pd()->src_md());
74
75	kernel_ctx.set_data_type(data_s.data_type());
76	size_t io_bytes = (pd()->n_inputs() + `1`) * data_d.data_type_size()
77	* data_d.nelems(true);
78	// Heuristics: for IO bytes smaller than 10MB reduce vector size for better perf.
79	if (io_bytes < `10` * `1024` * `1024`) { vector_size /= `2`; }
80	kernel_ctx.define_int("VECT_DT_N", vector_size);
81	kernel_ctx.define_int("N_INPUTS", pd()->n_inputs());
82	kernel_ctx.define_int("N_ELEMS", data_d.nelems(true));
83
84	def_memory_desc_info(
85	kernel_ctx, memory_desc_info_t::create(data_d), "SRC");
86	def_memory_desc_info(
87	kernel_ctx, memory_desc_info_t::create(data_s), "DST");
88
89	create_kernel(engine, &kernel_, "gen9_sum", kernel_ctx);
90
91	if (!kernel_) return status::runtime_error;
92	return status::success;
93	}
94
95	status_t init_res_storage(
96	engine_t engine, gpu_resource_t r) const override {
97	const dim_t count = pd()->n_inputs();
98	const float *s_data = pd()->scales();
99
100	const size_t size = count * sizeof(float);
101	std::unique_ptr<memory_storage_t> scales;
102	memory_storage_t scale = nullptr*;
103	auto s = engine->create_memory_storage(&scale, size);
104	if (s != status::success) return s;
105	float mapped_mem_storage = nullptr*;
106	s = scale->map_data((void )&mapped_mem_storage, nullptr**, size);
107	if (s != status::success) return s;
108	utils::array_copy(mapped_mem_storage, s_data, count);
109	s = scale->unmap_data((void )mapped_mem_storage, nullptr*);
110	if (s != status::success) return s;
111	scales.reset(scale);
112	r->add_memory_storage(SCALES_, std::move(scales));
113	return status::success;
114	}
115
116	status_t execute(const exec_ctx_t &ctx) const override;
117
118	private:
119	enum { max_num_arrs = `16` };
120	int vector_size = `8`;
121	enum { SCALES_ = `0` };
122	const pd_t pd() const* { return (const pd_t *)primitive_t::pd().get(); }
123	compute::kernel_t kernel_;
124	};
125
126	} // namespace ocl
127	} // namespace gpu
128	} // namespace impl
129	} // namespace dnnl
130
131	#endif
132
133	// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s
134

// Browse the source code of oneDNN/src/gpu/ocl/gen9_sum.hpp