1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_OCL_GEN9_SUM_HPP |
18 | #define GPU_OCL_GEN9_SUM_HPP |
19 | |
20 | #include <assert.h> |
21 | |
22 | #include "common/c_types_map.hpp" |
23 | #include "common/primitive.hpp" |
24 | #include "gpu/compute/compute.hpp" |
25 | #include "gpu/gpu_primitive.hpp" |
26 | #include "gpu/gpu_resource.hpp" |
27 | #include "gpu/gpu_sum_pd.hpp" |
28 | #include "gpu/ocl/ocl_stream.hpp" |
29 | #include "gpu/ocl/ocl_utils.hpp" |
30 | #include "gpu/primitive_conf.hpp" |
31 | |
32 | namespace dnnl { |
33 | namespace impl { |
34 | namespace gpu { |
35 | namespace ocl { |
36 | |
37 | struct gen9_sum_t : public gpu_primitive_t { |
38 | using gpu_primitive_t::gpu_primitive_t; |
39 | struct pd_t : public gpu_sum_pd_t { |
40 | using gpu_sum_pd_t::gpu_sum_pd_t; |
41 | |
42 | DECLARE_SUM_PD_T("ocl:gen9:any" , gen9_sum_t); |
43 | |
44 | status_t init(engine_t *engine) { |
45 | const int n = n_inputs(); |
46 | |
47 | if (n > max_num_arrs) return status::unimplemented; |
48 | |
49 | const memory_desc_wrapper o_d(dst_md()); |
50 | |
51 | // for IO bytes less than 1MB fall back into many_inputs_sum kernel for better performance. |
52 | size_t io_bytes = (n + 1) * o_d.data_type_size() * o_d.nelems(true); |
53 | if (io_bytes < 1024 * 1024) return status::unimplemented; |
54 | |
55 | bool ok = gpu_sum_pd_t::init(engine) == status::success |
56 | && !memory_desc_ndims_ok(dst_md()); |
57 | |
58 | if (!ok) return status::unimplemented; |
59 | |
60 | for (int i = 0; i < n; ++i) { |
61 | const memory_desc_wrapper i_d(src_md(i)); |
62 | if (i_d != o_d) return status::unimplemented; |
63 | } |
64 | |
65 | return status::success; |
66 | } |
67 | }; |
68 | |
69 | status_t init(engine_t *engine) override { |
70 | compute::kernel_ctx_t kernel_ctx; |
71 | |
72 | const memory_desc_wrapper data_d(pd()->dst_md()); |
73 | const memory_desc_wrapper data_s(pd()->src_md()); |
74 | |
75 | kernel_ctx.set_data_type(data_s.data_type()); |
76 | size_t io_bytes = (pd()->n_inputs() + 1) * data_d.data_type_size() |
77 | * data_d.nelems(true); |
78 | // Heuristics: for IO bytes smaller than 10MB reduce vector size for better perf. |
79 | if (io_bytes < 10 * 1024 * 1024) { vector_size /= 2; } |
80 | kernel_ctx.define_int("VECT_DT_N" , vector_size); |
81 | kernel_ctx.define_int("N_INPUTS" , pd()->n_inputs()); |
82 | kernel_ctx.define_int("N_ELEMS" , data_d.nelems(true)); |
83 | |
84 | def_memory_desc_info( |
85 | kernel_ctx, memory_desc_info_t::create(data_d), "SRC" ); |
86 | def_memory_desc_info( |
87 | kernel_ctx, memory_desc_info_t::create(data_s), "DST" ); |
88 | |
89 | create_kernel(engine, &kernel_, "gen9_sum" , kernel_ctx); |
90 | |
91 | if (!kernel_) return status::runtime_error; |
92 | return status::success; |
93 | } |
94 | |
95 | status_t init_res_storage( |
96 | engine_t *engine, gpu_resource_t *r) const override { |
97 | const dim_t count = pd()->n_inputs(); |
98 | const float *s_data = pd()->scales(); |
99 | |
100 | const size_t size = count * sizeof(float); |
101 | std::unique_ptr<memory_storage_t> scales; |
102 | memory_storage_t *scale = nullptr; |
103 | auto s = engine->create_memory_storage(&scale, size); |
104 | if (s != status::success) return s; |
105 | float *mapped_mem_storage = nullptr; |
106 | s = scale->map_data((void **)&mapped_mem_storage, nullptr, size); |
107 | if (s != status::success) return s; |
108 | utils::array_copy(mapped_mem_storage, s_data, count); |
109 | s = scale->unmap_data((void *)mapped_mem_storage, nullptr); |
110 | if (s != status::success) return s; |
111 | scales.reset(scale); |
112 | r->add_memory_storage(SCALES_, std::move(scales)); |
113 | return status::success; |
114 | } |
115 | |
116 | status_t execute(const exec_ctx_t &ctx) const override; |
117 | |
118 | private: |
119 | enum { max_num_arrs = 16 }; |
120 | int vector_size = 8; |
121 | enum { SCALES_ = 0 }; |
122 | const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } |
123 | compute::kernel_t kernel_; |
124 | }; |
125 | |
126 | } // namespace ocl |
127 | } // namespace gpu |
128 | } // namespace impl |
129 | } // namespace dnnl |
130 | |
131 | #endif |
132 | |
133 | // vim: et ts=4 sw=4 cindent cino+=l0,\:4,N-s |
134 | |