/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#ifndef GPU_OCL_GEN9_SUM_HPP
#define GPU_OCL_GEN9_SUM_HPP

#include <assert.h>

#include "common/c_types_map.hpp"
#include "common/primitive.hpp"
#include "gpu/compute/compute.hpp"
#include "gpu/gpu_primitive.hpp"
#include "gpu/gpu_resource.hpp"
#include "gpu/gpu_sum_pd.hpp"
#include "gpu/ocl/ocl_stream.hpp"
#include "gpu/ocl/ocl_utils.hpp"
#include "gpu/primitive_conf.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {

struct gen9_sum_t : public gpu_primitive_t {
    using gpu_primitive_t::gpu_primitive_t;
    struct pd_t : public gpu_sum_pd_t {
        using gpu_sum_pd_t::gpu_sum_pd_t;

        DECLARE_SUM_PD_T("ocl:gen9:any", gen9_sum_t);

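        // Dispatch criteria: at most max_num_arrs inputs, at least 1MB of
        // total IO, and every source descriptor identical to the destination
        // descriptor.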
        status_t init(engine_t *engine) {
            const int n = n_inputs();

            if (n > max_num_arrs) return status::unimplemented;

            const memory_desc_wrapper o_d(dst_md());

            // For total IO below 1MB, fall back to the many_inputs_sum kernel
            // for better performance.
            size_t io_bytes = (n + 1) * o_d.data_type_size() * o_d.nelems(true);
            if (io_bytes < 1024 * 1024) return status::unimplemented;
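            // For example, two f32 inputs of 100x100 elements give
            // io_bytes = (2 + 1) * 4 * 10000 = 120000 (< 1MB), so this
            // implementation defers to many_inputs_sum.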

            bool ok = gpu_sum_pd_t::init(engine) == status::success
                    && !memory_desc_ndims_ok(dst_md());

            if (!ok) return status::unimplemented;

            for (int i = 0; i < n; ++i) {
                const memory_desc_wrapper i_d(src_md(i));
                if (i_d != o_d) return status::unimplemented;
            }

            return status::success;
        }
    };

    status_t init(engine_t *engine) override {
        compute::kernel_ctx_t kernel_ctx;

        const memory_desc_wrapper data_d(pd()->dst_md());
        const memory_desc_wrapper data_s(pd()->src_md());

        kernel_ctx.set_data_type(data_s.data_type());
        size_t io_bytes = (pd()->n_inputs() + 1) * data_d.data_type_size()
                * data_d.nelems(true);
        // Heuristic: for total IO below 10MB, halve the vector size for
        // better performance.
        if (io_bytes < 10 * 1024 * 1024) { vector_size /= 2; }
        kernel_ctx.define_int("VECT_DT_N", vector_size);
        kernel_ctx.define_int("N_INPUTS", pd()->n_inputs());
        kernel_ctx.define_int("N_ELEMS", data_d.nelems(true));
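        // For example, an f32 problem with 2 inputs and 4194304 elements has
        // io_bytes = 3 * 4 * 4194304 (~48MB), so the kernel is compiled with
        // VECT_DT_N=8, N_INPUTS=2 and N_ELEMS=4194304; below the 10MB
        // threshold VECT_DT_N would be 4 instead.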

        def_memory_desc_info(
                kernel_ctx, memory_desc_info_t::create(data_s), "SRC");
        def_memory_desc_info(
                kernel_ctx, memory_desc_info_t::create(data_d), "DST");

        create_kernel(engine, &kernel_, "gen9_sum", kernel_ctx);

        if (!kernel_) return status::runtime_error;
        return status::success;
    }

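    // Uploads the per-input scales to the device: a float buffer of
    // n_inputs() elements is created on the engine, mapped to host memory,
    // filled from pd()->scales(), unmapped, and registered with the resource
    // under SCALES_ so that execute() can look it up.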
    status_t init_res_storage(
            engine_t *engine, gpu_resource_t *r) const override {
        const dim_t count = pd()->n_inputs();
        const float *s_data = pd()->scales();

        const size_t size = count * sizeof(float);
        std::unique_ptr<memory_storage_t> scales;
        memory_storage_t *scale = nullptr;
        auto s = engine->create_memory_storage(&scale, size);
        if (s != status::success) return s;
        float *mapped_mem_storage = nullptr;
        s = scale->map_data((void **)&mapped_mem_storage, nullptr, size);
        if (s != status::success) return s;
        utils::array_copy(mapped_mem_storage, s_data, count);
        s = scale->unmap_data((void *)mapped_mem_storage, nullptr);
        if (s != status::success) return s;
        scales.reset(scale);
        r->add_memory_storage(SCALES_, std::move(scales));
        return status::success;
    }

    status_t execute(const exec_ctx_t &ctx) const override;

private:
    enum { max_num_arrs = 16 }; // maximum number of inputs handled
    // Vectorization factor (VECT_DT_N); halved in init() for small problems.
    int vector_size = 8;
    enum { SCALES_ = 0 }; // resource storage key for the scales buffer
    const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
    compute::kernel_t kernel_;
};
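
// Illustrative only: a minimal sketch of how this implementation may be
// reached through the public API on a supported Intel GPU, assuming two
// identical f32 sources large enough to pass the 1MB IO threshold above.
// Shapes and scale values are arbitrary examples.
//
//   using namespace dnnl;
//   engine eng(engine::kind::gpu, 0);
//   memory::desc md({2, 64, 224, 224}, memory::data_type::f32,
//           memory::format_tag::nchw);
//   std::vector<memory::desc> src_mds = {md, md};
//   std::vector<float> scales = {1.0f, 0.5f};
//   auto sum_pd = sum::primitive_desc(md, scales, src_mds, eng);
//   auto sum_prim = sum(sum_pd);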

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl

#endif

// vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s