1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef GPU_GPU_PRIMITIVE_HPP |
18 | #define GPU_GPU_PRIMITIVE_HPP |
19 | |
20 | #include <cassert> |
21 | |
22 | #include "common/cache_blob.hpp" |
23 | #include "common/primitive.hpp" |
24 | #include "common/utils.hpp" |
25 | #include "gpu/compute/compute.hpp" |
26 | #include "gpu/gemm/gpu_gemm_exec_types.hpp" |
27 | #include "gpu/gpu_resource.hpp" |
28 | |
// Convenience accessor for execution-time code: dereferences the memory
// storage registered under key `arg` in this primitive's cached
// gpu_resource_t (storages are added via init_res_storage()).
#define CTX_GPU_RES_STORAGE(arg) \
    (*(cached_mapper() \
                    ->template get<gpu_resource_t>(this) \
                    ->get_memory_storage(arg)))
33 | |
34 | namespace dnnl { |
35 | namespace impl { |
36 | namespace gpu { |
37 | |
38 | struct gpu_primitive_t : public primitive_t { |
39 | using primitive_t::primitive_t; |
40 | |
41 | struct compute_block_t { |
42 | enum class kind_t { kernel, primitive }; |
43 | |
44 | compute_block_t(const compute::kernel_t &kernel) |
45 | : kind_(kind_t::kernel), kernel_(kernel), primitive_(nullptr) {} |
46 | compute_block_t(const primitive_t *primitive) |
47 | : kind_(kind_t::primitive), primitive_(primitive) {} |
48 | |
49 | bool is_kernel() const { return kind_ == kind_t::kernel; } |
50 | bool is_primitive() const { return kind_ == kind_t::primitive; } |
51 | explicit operator bool() const { return kernel_ || primitive_; } |
52 | |
53 | const primitive_t *primitive() const { return primitive_; } |
54 | compute::kernel_t kernel() const { return kernel_; } |
55 | kind_t kind() const { return kind_; } |
56 | |
57 | private: |
58 | kind_t kind_; |
59 | compute::kernel_t kernel_; |
60 | const primitive_t *primitive_; |
61 | }; |
62 | |
63 | const resource_mapper_t *cached_mapper() const { return &cached_mapper_; } |
64 | |
65 | status_t init_cached_resource(engine_t *engine) const override { |
66 | CHECK(fill_mapper(engine, cached_mapper_)); |
67 | // When caching kernels, each primitve from the hierarchy has its |
68 | // own mapper and is responsible for filling it. |
69 | for (const auto &cb : compute_blocks()) { |
70 | if (!cb) continue; |
71 | |
72 | switch (cb.kind()) { |
73 | case compute_block_t::kind_t::primitive: |
74 | CHECK(cb.primitive()->init_cached_resource(engine)); |
75 | break; |
76 | case compute_block_t::kind_t::kernel: |
77 | // Clear kernels with binary state to decrease memory |
78 | // consumption. |
79 | cb.kernel().clear(); |
80 | break; |
81 | default: assert(!"unexpected" ); return status::runtime_error; |
82 | } |
83 | } |
84 | return status::success; |
85 | } |
86 | |
87 | status_t get_cache_blob_size(size_t *size) const override { |
88 | if (!size) return status::invalid_arguments; |
89 | // Query binary size for each created kernel. |
90 | for (const auto &cb : compute_blocks()) { |
91 | if (!cb) continue; |
92 | |
93 | switch (cb.kind()) { |
94 | case compute_block_t::kind_t::kernel: { |
95 | size_t sz = 0; |
96 | CHECK(cb.kernel().binary_size(&sz)); |
97 | // We need additional sizeof(size_t) bytes to store the size |
98 | // of the binary when packing. |
99 | (*size) += sz + sizeof(size_t); |
100 | break; |
101 | } |
102 | case compute_block_t::kind_t::primitive: |
103 | CHECK(cb.primitive()->get_cache_blob_size(size)); |
104 | break; |
105 | default: assert(!"unexpected" ); return status::runtime_error; |
106 | } |
107 | } |
108 | return status::success; |
109 | } |
110 | |
111 | status_t get_cache_blob( |
112 | engine_t *engine, cache_blob_t &blob) const override { |
113 | for (const auto &cb : compute_blocks()) { |
114 | if (!cb) continue; |
115 | |
116 | switch (cb.kind()) { |
117 | case compute_block_t::kind_t::kernel: { |
118 | compute::binary_t binary; |
119 | const resource_mapper_t *rm = cached_mapper(); |
120 | ; |
121 | const auto *resource = rm->get<gpu_resource_t>(this); |
122 | const auto &realized_kernel |
123 | = resource->get_kernel(cb.kernel().id()); |
124 | // Get binaries for all kernels within current primitive. |
125 | // TODO: Copy binary directly to `blob` when binary cache |
126 | // mode is removed. |
127 | CHECK(realized_kernel.binary(engine, binary)); |
128 | CHECK(blob.add_binary(binary.data(), binary.size())); |
129 | break; |
130 | } |
131 | case compute_block_t::kind_t::primitive: |
132 | CHECK(cb.primitive()->get_cache_blob(engine, blob)); |
133 | break; |
134 | default: assert(!"unexpected" ); return status::runtime_error; |
135 | } |
136 | } |
137 | return status::success; |
138 | } |
139 | |
140 | status_t create_kernel(engine_t *engine, compute::kernel_t *kernel, |
141 | jit::jit_generator_base *jitter) { |
142 | auto *compute_engine |
143 | = utils::downcast<compute::compute_engine_t *>(engine); |
144 | CHECK(compute_engine->create_kernel(kernel, jitter, cache_blob())); |
145 | register_kernels({*kernel}); |
146 | return status::success; |
147 | } |
148 | |
149 | status_t create_kernels(engine_t *engine, |
150 | std::vector<compute::kernel_t> *kernels, |
151 | const std::vector<const char *> &kernel_names, |
152 | const compute::kernel_ctx_t &kernel_ctx) { |
153 | auto *compute_engine |
154 | = utils::downcast<compute::compute_engine_t *>(engine); |
155 | CHECK(compute_engine->create_kernels( |
156 | kernels, kernel_names, kernel_ctx, cache_blob())); |
157 | register_kernels(*kernels); |
158 | return status::success; |
159 | } |
160 | |
161 | status_t create_kernel(engine_t *engine, compute::kernel_t *kernel, |
162 | const char *kernel_name, const compute::kernel_ctx_t &kernel_ctx) { |
163 | |
164 | std::vector<compute::kernel_t> kernels(1); |
165 | auto status |
166 | = create_kernels(engine, &kernels, {kernel_name}, kernel_ctx); |
167 | if (status == status::success) *kernel = kernels[0]; |
168 | return status; |
169 | } |
170 | |
171 | status_t create_nested_primitive(std::shared_ptr<primitive_t> &primitive, |
172 | const std::shared_ptr<primitive_desc_t> &pd, engine_t *engine) { |
173 | CHECK(pd->create_primitive(primitive, engine, cache_blob())); |
174 | register_primitive(primitive.get()); |
175 | return status::success; |
176 | } |
177 | |
178 | protected: |
179 | void register_primitive(const primitive_t *primitive) { |
180 | registered_compute_blocks_.emplace_back(primitive); |
181 | } |
182 | |
183 | void register_kernels(const std::vector<compute::kernel_t> &kernels) { |
184 | for (const auto &k : kernels) { |
185 | registered_compute_blocks_.emplace_back(k); |
186 | } |
187 | } |
188 | |
189 | virtual status_t init_res_storage( |
190 | engine_t *engine, gpu_resource_t *r) const { |
191 | return status::success; |
192 | } |
193 | |
194 | // TODO: use inheritance for exec_ctx_t to get rid of such places... |
195 | status_t parallel_for(const gemm_exec_ctx_t &ctx, |
196 | const compute::nd_range_t &range, const compute::kernel_t &kernel, |
197 | const compute::kernel_arg_list_t &arg_list) const { |
198 | const resource_mapper_t *rm = nullptr; |
199 | rm = cached_mapper(); |
200 | return parallel_for(rm, ctx.stream(), range, kernel, arg_list); |
201 | } |
202 | |
203 | status_t parallel_for(const exec_ctx_t &ctx, |
204 | const compute::nd_range_t &range, const compute::kernel_t &kernel, |
205 | const compute::kernel_arg_list_t &arg_list) const { |
206 | const resource_mapper_t *rm = nullptr; |
207 | rm = cached_mapper(); |
208 | return parallel_for(rm, ctx.stream(), range, kernel, arg_list); |
209 | } |
210 | |
211 | private: |
212 | const std::vector<compute_block_t> &compute_blocks() const { |
213 | return registered_compute_blocks_; |
214 | } |
215 | |
216 | status_t fill_mapper(engine_t *engine, resource_mapper_t &mapper) const { |
217 | if (mapper.has_resource(this)) return status::success; |
218 | auto r = utils::make_unique<gpu_resource_t>(); |
219 | if (!r) return status::out_of_memory; |
220 | compute::program_list_t programs(engine); |
221 | for (const auto &cb : compute_blocks()) { |
222 | if (!cb || !cb.is_kernel()) continue; |
223 | compute::kernel_t realized_kernel; |
224 | CHECK(cb.kernel().realize(&realized_kernel, engine, &programs)); |
225 | r->add_kernel(cb.kernel().id(), realized_kernel); |
226 | } |
227 | CHECK(init_res_storage(engine, r.get())); |
228 | mapper.add(this, std::move(r)); |
229 | return status::success; |
230 | } |
231 | |
232 | status_t parallel_for(const resource_mapper_t *resource_mapper, |
233 | stream_t *stream, const compute::nd_range_t &range, |
234 | const compute::kernel_t &kernel, |
235 | const compute::kernel_arg_list_t &arg_list) const { |
236 | |
237 | compute::compute_stream_t *compute_stream |
238 | = utils::downcast<compute::compute_stream_t *>(stream); |
239 | const auto *resource = resource_mapper->get<gpu_resource_t>(this); |
240 | const auto &realized_kernel = resource->get_kernel(kernel.id()); |
241 | |
242 | CHECK(compute_stream->parallel_for(range, realized_kernel, arg_list)); |
243 | return status::success; |
244 | } |
245 | |
246 | // Make these mutable to allow modifying them from `init_cached_resource`. |
247 | mutable resource_mapper_t cached_mapper_; |
248 | mutable std::vector<compute_block_t> registered_compute_blocks_; |
249 | }; |
250 | |
251 | } // namespace gpu |
252 | } // namespace impl |
253 | } // namespace dnnl |
254 | |
255 | #endif |
256 | |