1 | /******************************************************************************* |
2 | * Copyright 2021-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_GEN9_REDUCTION_HPP |
18 | #define GPU_GEN9_REDUCTION_HPP |
19 | |
20 | #include "common/c_types_map.hpp" |
21 | #include "common/primitive.hpp" |
22 | #include "common/type_helpers.hpp" |
23 | #include "common/utils.hpp" |
24 | #include "gpu/compute/compute.hpp" |
25 | #include "gpu/gpu_primitive.hpp" |
26 | #include "gpu/gpu_reduction_pd.hpp" |
27 | #include "gpu/gpu_resource.hpp" |
28 | #include "gpu/primitive_conf.hpp" |
29 | |
30 | namespace dnnl { |
31 | namespace impl { |
32 | namespace gpu { |
33 | namespace ocl { |
34 | |
35 | // Requirements for this implementation: |
36 | // - only N and C can be blocked (NCx or nCx) |
37 | // - C is blocked by 16 or 32 |
38 | // - src/dst blocking structures have to match |
39 | // - either all or none of the HWD dims are being reduced |
40 | // - padded C is a multiple of 16 |
41 | |
// This implementation combines any HWD dimensions into a single index,
// leaving N, C, and HWD dimensions (plus blocked portions of C and
// possibly N). Each of these dimensions is broken into chunks, and a
// work item is assigned one of each of these chunks in the initial step.
46 | |
47 | // The final phase finishes the reduction by reducing all of the chunks |
48 | // belonging to reduction dimensions. |
49 | |
50 | struct gen9_reduction_t : public gpu_primitive_t { |
51 | using gpu_primitive_t::gpu_primitive_t; |
52 | struct pd_t : public gpu_reduction_pd_t { |
53 | using gpu_reduction_pd_t::gpu_reduction_pd_t; |
54 | |
55 | DECLARE_COMMON_PD_T("ocl:gen9" , gen9_reduction_t); |
56 | |
57 | status_t init(engine_t *engine) { |
58 | using smask_t = primitive_attr_t::skip_mask_t; |
59 | bool ok = set_default_params() == status::success |
60 | && attr_.has_default_values( |
61 | smask_t::post_ops | smask_t::gpu_attr) |
62 | && !memory_desc_ndims_ok(src_md(), dst_md()) |
63 | && post_ops_with_binary_ok(attr(), dst_md()->data_type, 5) |
64 | && attr_.set_default_formats(dst_md(0)) == status::success; |
65 | if (!ok) return status::unimplemented; |
66 | |
67 | CHECK(init_conf(engine)); |
68 | init_scratchpad(); |
69 | |
70 | return status::success; |
71 | } |
72 | |
73 | status_t init_conf(engine_t *engine); |
74 | status_t init_kernel_ctx(compute::kernel_ctx_t &kernel_ctx) const; |
75 | void init_scratchpad(); |
76 | |
77 | reduction_conf_t conf; |
78 | }; |
79 | |
80 | status_t init(engine_t *engine) override { |
81 | compute::kernel_ctx_t kernel_ctx(pd()->attr()); |
82 | |
83 | status_t status = pd()->init_kernel_ctx(kernel_ctx); |
84 | CHECK(status); |
85 | |
86 | status = create_kernel( |
87 | engine, &initial_kernel, "gen9_initial_reduce" , kernel_ctx); |
88 | CHECK(status); |
89 | if (!pd()->conf.skip_final_phase) { |
90 | status = create_kernel( |
91 | engine, &final_kernel, "gen9_final_reduce" , kernel_ctx); |
92 | CHECK(status); |
93 | } |
94 | |
95 | return status::success; |
96 | } |
97 | |
98 | virtual status_t execute(const exec_ctx_t &ctx) const override { |
99 | return execute_gen9(ctx); |
100 | } |
101 | |
102 | private: |
103 | status_t execute_gen9(const exec_ctx_t &ctx) const; |
104 | const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); } |
105 | |
106 | compute::kernel_t initial_kernel; |
107 | compute::kernel_t final_kernel; |
108 | }; |
109 | |
110 | } // namespace ocl |
111 | } // namespace gpu |
112 | } // namespace impl |
113 | } // namespace dnnl |
114 | |
115 | #endif |
116 | |