1/*******************************************************************************
2* Copyright 2020-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef GPU_GPU_PRIMITIVE_HPP
18#define GPU_GPU_PRIMITIVE_HPP
19
20#include <cassert>
21
22#include "common/cache_blob.hpp"
23#include "common/primitive.hpp"
24#include "common/utils.hpp"
25#include "gpu/compute/compute.hpp"
26#include "gpu/gemm/gpu_gemm_exec_types.hpp"
27#include "gpu/gpu_resource.hpp"
28
// Convenience accessor: returns (by reference) the memory storage registered
// under `arg` in the cached GPU resource of the current primitive. Must be
// used inside a gpu_primitive_t member (relies on `this` and the cached
// resource mapper filled by init_cached_resource()).
#define CTX_GPU_RES_STORAGE(arg) \
    (*(cached_mapper() \
                    ->template get<gpu_resource_t>(this) \
                    ->get_memory_storage(arg)))
33
34namespace dnnl {
35namespace impl {
36namespace gpu {
37
38struct gpu_primitive_t : public primitive_t {
39 using primitive_t::primitive_t;
40
41 struct compute_block_t {
42 enum class kind_t { kernel, primitive };
43
44 compute_block_t(const compute::kernel_t &kernel)
45 : kind_(kind_t::kernel), kernel_(kernel), primitive_(nullptr) {}
46 compute_block_t(const primitive_t *primitive)
47 : kind_(kind_t::primitive), primitive_(primitive) {}
48
49 bool is_kernel() const { return kind_ == kind_t::kernel; }
50 bool is_primitive() const { return kind_ == kind_t::primitive; }
51 explicit operator bool() const { return kernel_ || primitive_; }
52
53 const primitive_t *primitive() const { return primitive_; }
54 compute::kernel_t kernel() const { return kernel_; }
55 kind_t kind() const { return kind_; }
56
57 private:
58 kind_t kind_;
59 compute::kernel_t kernel_;
60 const primitive_t *primitive_;
61 };
62
63 const resource_mapper_t *cached_mapper() const { return &cached_mapper_; }
64
65 status_t init_cached_resource(engine_t *engine) const override {
66 CHECK(fill_mapper(engine, cached_mapper_));
67 // When caching kernels, each primitve from the hierarchy has its
68 // own mapper and is responsible for filling it.
69 for (const auto &cb : compute_blocks()) {
70 if (!cb) continue;
71
72 switch (cb.kind()) {
73 case compute_block_t::kind_t::primitive:
74 CHECK(cb.primitive()->init_cached_resource(engine));
75 break;
76 case compute_block_t::kind_t::kernel:
77 // Clear kernels with binary state to decrease memory
78 // consumption.
79 cb.kernel().clear();
80 break;
81 default: assert(!"unexpected"); return status::runtime_error;
82 }
83 }
84 return status::success;
85 }
86
87 status_t get_cache_blob_size(size_t *size) const override {
88 if (!size) return status::invalid_arguments;
89 // Query binary size for each created kernel.
90 for (const auto &cb : compute_blocks()) {
91 if (!cb) continue;
92
93 switch (cb.kind()) {
94 case compute_block_t::kind_t::kernel: {
95 size_t sz = 0;
96 CHECK(cb.kernel().binary_size(&sz));
97 // We need additional sizeof(size_t) bytes to store the size
98 // of the binary when packing.
99 (*size) += sz + sizeof(size_t);
100 break;
101 }
102 case compute_block_t::kind_t::primitive:
103 CHECK(cb.primitive()->get_cache_blob_size(size));
104 break;
105 default: assert(!"unexpected"); return status::runtime_error;
106 }
107 }
108 return status::success;
109 }
110
111 status_t get_cache_blob(
112 engine_t *engine, cache_blob_t &blob) const override {
113 for (const auto &cb : compute_blocks()) {
114 if (!cb) continue;
115
116 switch (cb.kind()) {
117 case compute_block_t::kind_t::kernel: {
118 compute::binary_t binary;
119 const resource_mapper_t *rm = cached_mapper();
120 ;
121 const auto *resource = rm->get<gpu_resource_t>(this);
122 const auto &realized_kernel
123 = resource->get_kernel(cb.kernel().id());
124 // Get binaries for all kernels within current primitive.
125 // TODO: Copy binary directly to `blob` when binary cache
126 // mode is removed.
127 CHECK(realized_kernel.binary(engine, binary));
128 CHECK(blob.add_binary(binary.data(), binary.size()));
129 break;
130 }
131 case compute_block_t::kind_t::primitive:
132 CHECK(cb.primitive()->get_cache_blob(engine, blob));
133 break;
134 default: assert(!"unexpected"); return status::runtime_error;
135 }
136 }
137 return status::success;
138 }
139
140 status_t create_kernel(engine_t *engine, compute::kernel_t *kernel,
141 jit::jit_generator_base *jitter) {
142 auto *compute_engine
143 = utils::downcast<compute::compute_engine_t *>(engine);
144 CHECK(compute_engine->create_kernel(kernel, jitter, cache_blob()));
145 register_kernels({*kernel});
146 return status::success;
147 }
148
149 status_t create_kernels(engine_t *engine,
150 std::vector<compute::kernel_t> *kernels,
151 const std::vector<const char *> &kernel_names,
152 const compute::kernel_ctx_t &kernel_ctx) {
153 auto *compute_engine
154 = utils::downcast<compute::compute_engine_t *>(engine);
155 CHECK(compute_engine->create_kernels(
156 kernels, kernel_names, kernel_ctx, cache_blob()));
157 register_kernels(*kernels);
158 return status::success;
159 }
160
161 status_t create_kernel(engine_t *engine, compute::kernel_t *kernel,
162 const char *kernel_name, const compute::kernel_ctx_t &kernel_ctx) {
163
164 std::vector<compute::kernel_t> kernels(1);
165 auto status
166 = create_kernels(engine, &kernels, {kernel_name}, kernel_ctx);
167 if (status == status::success) *kernel = kernels[0];
168 return status;
169 }
170
171 status_t create_nested_primitive(std::shared_ptr<primitive_t> &primitive,
172 const std::shared_ptr<primitive_desc_t> &pd, engine_t *engine) {
173 CHECK(pd->create_primitive(primitive, engine, cache_blob()));
174 register_primitive(primitive.get());
175 return status::success;
176 }
177
178protected:
179 void register_primitive(const primitive_t *primitive) {
180 registered_compute_blocks_.emplace_back(primitive);
181 }
182
183 void register_kernels(const std::vector<compute::kernel_t> &kernels) {
184 for (const auto &k : kernels) {
185 registered_compute_blocks_.emplace_back(k);
186 }
187 }
188
189 virtual status_t init_res_storage(
190 engine_t *engine, gpu_resource_t *r) const {
191 return status::success;
192 }
193
194 // TODO: use inheritance for exec_ctx_t to get rid of such places...
195 status_t parallel_for(const gemm_exec_ctx_t &ctx,
196 const compute::nd_range_t &range, const compute::kernel_t &kernel,
197 const compute::kernel_arg_list_t &arg_list) const {
198 const resource_mapper_t *rm = nullptr;
199 rm = cached_mapper();
200 return parallel_for(rm, ctx.stream(), range, kernel, arg_list);
201 }
202
203 status_t parallel_for(const exec_ctx_t &ctx,
204 const compute::nd_range_t &range, const compute::kernel_t &kernel,
205 const compute::kernel_arg_list_t &arg_list) const {
206 const resource_mapper_t *rm = nullptr;
207 rm = cached_mapper();
208 return parallel_for(rm, ctx.stream(), range, kernel, arg_list);
209 }
210
211private:
212 const std::vector<compute_block_t> &compute_blocks() const {
213 return registered_compute_blocks_;
214 }
215
216 status_t fill_mapper(engine_t *engine, resource_mapper_t &mapper) const {
217 if (mapper.has_resource(this)) return status::success;
218 auto r = utils::make_unique<gpu_resource_t>();
219 if (!r) return status::out_of_memory;
220 compute::program_list_t programs(engine);
221 for (const auto &cb : compute_blocks()) {
222 if (!cb || !cb.is_kernel()) continue;
223 compute::kernel_t realized_kernel;
224 CHECK(cb.kernel().realize(&realized_kernel, engine, &programs));
225 r->add_kernel(cb.kernel().id(), realized_kernel);
226 }
227 CHECK(init_res_storage(engine, r.get()));
228 mapper.add(this, std::move(r));
229 return status::success;
230 }
231
232 status_t parallel_for(const resource_mapper_t *resource_mapper,
233 stream_t *stream, const compute::nd_range_t &range,
234 const compute::kernel_t &kernel,
235 const compute::kernel_arg_list_t &arg_list) const {
236
237 compute::compute_stream_t *compute_stream
238 = utils::downcast<compute::compute_stream_t *>(stream);
239 const auto *resource = resource_mapper->get<gpu_resource_t>(this);
240 const auto &realized_kernel = resource->get_kernel(kernel.id());
241
242 CHECK(compute_stream->parallel_for(range, realized_kernel, arg_list));
243 return status::success;
244 }
245
246 // Make these mutable to allow modifying them from `init_cached_resource`.
247 mutable resource_mapper_t cached_mapper_;
248 mutable std::vector<compute_block_t> registered_compute_blocks_;
249};
250
251} // namespace gpu
252} // namespace impl
253} // namespace dnnl
254
255#endif
256