1/*******************************************************************************
2* Copyright 2021-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef GPU_GEN9_REDUCTION_HPP
18#define GPU_GEN9_REDUCTION_HPP
19
20#include "common/c_types_map.hpp"
21#include "common/primitive.hpp"
22#include "common/type_helpers.hpp"
23#include "common/utils.hpp"
24#include "gpu/compute/compute.hpp"
25#include "gpu/gpu_primitive.hpp"
26#include "gpu/gpu_reduction_pd.hpp"
27#include "gpu/gpu_resource.hpp"
28#include "gpu/primitive_conf.hpp"
29
30namespace dnnl {
31namespace impl {
32namespace gpu {
33namespace ocl {
34
35// Requirements for this implementation:
36// - only N and C can be blocked (NCx or nCx)
37// - C is blocked by 16 or 32
38// - src/dst blocking structures have to match
39// - either all or none of the HWD dims are being reduced
40// - padded C is a multiple of 16
41
// This implementation combines all HWD dimensions into a single index,
// leaving N, C, and the combined HWD dimension (plus blocked portions of
// C and possibly N). Each of these dimensions is broken into chunks, and
// a work item is assigned one of each of these chunks in the initial step.
46
47// The final phase finishes the reduction by reducing all of the chunks
48// belonging to reduction dimensions.
49
50struct gen9_reduction_t : public gpu_primitive_t {
51 using gpu_primitive_t::gpu_primitive_t;
52 struct pd_t : public gpu_reduction_pd_t {
53 using gpu_reduction_pd_t::gpu_reduction_pd_t;
54
55 DECLARE_COMMON_PD_T("ocl:gen9", gen9_reduction_t);
56
57 status_t init(engine_t *engine) {
58 using smask_t = primitive_attr_t::skip_mask_t;
59 bool ok = set_default_params() == status::success
60 && attr_.has_default_values(
61 smask_t::post_ops | smask_t::gpu_attr)
62 && !memory_desc_ndims_ok(src_md(), dst_md())
63 && post_ops_with_binary_ok(attr(), dst_md()->data_type, 5)
64 && attr_.set_default_formats(dst_md(0)) == status::success;
65 if (!ok) return status::unimplemented;
66
67 CHECK(init_conf(engine));
68 init_scratchpad();
69
70 return status::success;
71 }
72
73 status_t init_conf(engine_t *engine);
74 status_t init_kernel_ctx(compute::kernel_ctx_t &kernel_ctx) const;
75 void init_scratchpad();
76
77 reduction_conf_t conf;
78 };
79
80 status_t init(engine_t *engine) override {
81 compute::kernel_ctx_t kernel_ctx(pd()->attr());
82
83 status_t status = pd()->init_kernel_ctx(kernel_ctx);
84 CHECK(status);
85
86 status = create_kernel(
87 engine, &initial_kernel, "gen9_initial_reduce", kernel_ctx);
88 CHECK(status);
89 if (!pd()->conf.skip_final_phase) {
90 status = create_kernel(
91 engine, &final_kernel, "gen9_final_reduce", kernel_ctx);
92 CHECK(status);
93 }
94
95 return status::success;
96 }
97
98 virtual status_t execute(const exec_ctx_t &ctx) const override {
99 return execute_gen9(ctx);
100 }
101
102private:
103 status_t execute_gen9(const exec_ctx_t &ctx) const;
104 const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
105
106 compute::kernel_t initial_kernel;
107 compute::kernel_t final_kernel;
108};
109
110} // namespace ocl
111} // namespace gpu
112} // namespace impl
113} // namespace dnnl
114
115#endif
116