1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <assert.h> |
18 | #include <string> |
19 | #include <CL/cl.h> |
20 | |
21 | #include "gpu/ocl/ocl_gpu_kernel.hpp" |
22 | |
23 | #include "common/rw_mutex.hpp" |
24 | #include "common/utils.hpp" |
25 | #include "gpu/compute/program_list.hpp" |
26 | #include "gpu/ocl/ocl_memory_storage.hpp" |
27 | #include "gpu/ocl/ocl_stream.hpp" |
28 | #include "gpu/ocl/ocl_usm_utils.hpp" |
29 | #include "gpu/ocl/ocl_utils.hpp" |
30 | #include "gpu/ocl/profile.hpp" |
31 | #include "gpu/profile.hpp" |
32 | |
33 | namespace dnnl { |
34 | namespace impl { |
35 | namespace gpu { |
36 | namespace ocl { |
37 | |
38 | // Kernel wrapper storing a per-thread copy of cl_kernel. |
39 | class kernel_wrapper_t { |
40 | public: |
41 | kernel_wrapper_t(cl_kernel kernel = nullptr) : kernel_(kernel) {} |
42 | |
43 | operator cl_kernel() const { return kernel_; } |
44 | |
45 | status_t set_arg(int arg_index, size_t arg_size, const void *arg_value) { |
46 | cl_int err = clSetKernelArg(kernel_, arg_index, arg_size, arg_value); |
47 | return convert_to_dnnl(err); |
48 | } |
49 | |
50 | status_t set_svm_arg(int arg_index, const void *arg_value) { |
51 | #ifdef CL_VERSION_2_0 |
52 | cl_int err = clSetKernelArgSVMPointer(kernel_, arg_index, arg_value); |
53 | return convert_to_dnnl(err); |
54 | #else |
55 | // SVM is not supported. |
56 | UNUSED(arg_index); |
57 | UNUSED(arg_value); |
58 | return status::runtime_error; |
59 | #endif |
60 | } |
61 | |
62 | status_t set_usm_arg( |
63 | engine_t *engine, int arg_index, const void *arg_value) { |
64 | return usm::set_kernel_arg_usm(engine, kernel_, arg_index, arg_value); |
65 | } |
66 | |
67 | private: |
68 | cl_kernel kernel_; |
69 | }; |
70 | |
71 | class ocl_gpu_kernel_cache_t { |
72 | public: |
73 | ocl_gpu_kernel_cache_t(cl_kernel main_kernel) : main_kernel_(main_kernel) {} |
74 | |
75 | ~ocl_gpu_kernel_cache_t() { |
76 | for (auto &kv : kernels_) { |
77 | OCL_CHECK_V(clReleaseKernel(kv.second)); |
78 | } |
79 | } |
80 | |
81 | status_t get(kernel_wrapper_t **kernel) { |
82 | auto id = std::this_thread::get_id(); |
83 | { |
84 | utils::lock_read_t lock_read(mutex_); |
85 | auto it = kernels_.find(id); |
86 | if (it != kernels_.end()) { |
87 | *kernel = &it->second; |
88 | return status::success; |
89 | } |
90 | } |
91 | |
92 | // No copy for this thread, clone the original kernel and save the |
93 | // copy. |
94 | cl_kernel cloned_kernel; |
95 | CHECK(clone_kernel(main_kernel_, &cloned_kernel)); |
96 | |
97 | utils::lock_write_t lock_write(mutex_); |
98 | auto ret = kernels_.emplace(id, cloned_kernel); |
99 | *kernel = &ret.first->second; |
100 | return status::success; |
101 | } |
102 | |
103 | private: |
104 | cl_kernel main_kernel_; |
105 | std::unordered_map<std::thread::id, kernel_wrapper_t> kernels_; |
106 | utils::rw_mutex_t mutex_; |
107 | }; |
108 | |
// Wraps an existing cl_kernel. Takes its own reference on the kernel
// (released in the destructor) and sets up the per-thread clone cache.
ocl_gpu_kernel_t::ocl_gpu_kernel_t(cl_kernel ocl_kernel,
        const std::vector<gpu::compute::scalar_type_t> &arg_types)
    : state_(state_t::kernel), ocl_kernel_(ocl_kernel), arg_types_(arg_types) {
    // Retain before publishing the kernel to the cache.
    OCL_CHECK_V(clRetainKernel(ocl_kernel_));
    // The cache hands out per-thread clones so that argument setting from
    // different threads does not race on a single cl_kernel.
    cache_ = std::make_shared<ocl_gpu_kernel_cache_t>(ocl_kernel_);
}
115 | |
ocl_gpu_kernel_t::~ocl_gpu_kernel_t() {
    // Drop the reference taken in the constructor; the cache releases its
    // per-thread clones separately in its own destructor.
    if (ocl_kernel_) OCL_CHECK_V(clReleaseKernel(ocl_kernel_));
}
119 | |
// Binds all kernel arguments and enqueues the kernel on the stream's OpenCL
// command queue over the given ND-range. Returns a dnnl status; fails if
// argument types mismatch or any OpenCL call errors out.
status_t ocl_gpu_kernel_t::parallel_for(stream_t &stream,
        const compute::nd_range_t &range,
        const compute::kernel_arg_list_t &arg_list) {
    // Must be a realized kernel, not a serialized binary.
    assert(state_ == state_t::kernel);

    auto *ocl_stream = utils::downcast<ocl_stream_t *>(&stream);
    cl_command_queue queue = ocl_stream->queue();

    // Fetch (or lazily clone) this thread's private cl_kernel copy so that
    // concurrent clSetKernelArg calls do not race.
    kernel_wrapper_t *kernel = nullptr;
    CHECK(cache_->get(&kernel));
    CHECK(gpu::compute::check_scalar_arguments(arg_list, arg_types_));
    for (int i = 0; i < arg_list.nargs(); ++i) {
        auto &arg = arg_list.get(i);
        if (arg.is_global()) {
            auto *mem_storage
                    = static_cast<const memory_storage_t *>(arg.value());
            if (!mem_storage->is_null()) {
                auto *ocl_mem_storage
                        = utils::downcast<const ocl_memory_storage_base_t *>(
                                mem_storage);

                // Validate that the OpenCL contexts match for execution
                // context and memory.
                auto stream_ocl_ctx
                        = utils::downcast<ocl_gpu_engine_t *>(stream.engine())
                                  ->context();
                auto memory_storage_ocl_ctx
                        = utils::downcast<ocl_gpu_engine_t *>(
                                ocl_mem_storage->engine())
                                  ->context();
                if (stream_ocl_ctx != memory_storage_ocl_ctx) {
                    MAYBE_REPORT_ERROR(
                            "mismatched OpenCL context for primitive/memory");
                    return status::invalid_arguments;
                }

                // Bind according to how the memory was allocated:
                // cl_mem buffer vs. USM pointer.
                switch (ocl_mem_storage->memory_kind()) {
                    case memory_kind::buffer: {
                        auto *m = utils::downcast<
                                const ocl_buffer_memory_storage_t *>(
                                ocl_mem_storage);
                        auto ocl_mem = m->mem_object();
                        CHECK(kernel->set_arg(i, sizeof(cl_mem), &ocl_mem));
                        break;
                    }
                    case memory_kind::usm: {
                        auto *m = utils::downcast<
                                const ocl_usm_memory_storage_t *>(
                                ocl_mem_storage);
                        auto *usm_ptr = m->usm_ptr();
                        CHECK(kernel->set_usm_arg(stream.engine(), i, usm_ptr));
                        break;
                    }
                    default: assert(!"not expected");
                }
            } else {
                // Null memory storage: bind a null pointer of whichever kind
                // the engine supports (USM if available, else a null cl_mem).
                if (usm::is_usm_supported(stream.engine())) {
                    CHECK(kernel->set_usm_arg(stream.engine(), i, nullptr));
                } else {
                    cl_mem null_mem = nullptr;
                    CHECK(kernel->set_arg(i, sizeof(cl_mem), &null_mem));
                }
            }
        } else if (arg.is_local()) {
            // Local (SLM) argument: size only, value pointer is opaque.
            CHECK(kernel->set_arg(i, arg.size(), arg.value()));
        } else if (arg.is_svm_pointer()) {
            CHECK(kernel->set_svm_arg(i, arg.value()));
        } else {
            // Scalar argument.
            CHECK(kernel->set_arg(i, arg.size(), arg.value()));
        }
    }

    cl_uint ndims = static_cast<cl_uint>(range.ndims());
    // Nothing to submit for an empty range; arguments were still validated
    // and bound above.
    if (range.is_zero()) { return status::success; }

    cl_event event;
    if (ocl_stream->flags() & stream_flags::out_of_order) {
        // Out-of-order queue: wait on the stream's current dependency events
        // and publish this kernel's completion event as the new dependency.
        const auto &event_wrappers = ocl_stream->get_deps();
        std::vector<cl_event> events(
                event_wrappers.begin(), event_wrappers.end());

        cl_uint num_events = events.size();
        const cl_event *events_data = num_events ? events.data() : nullptr;
        cl_int err = clEnqueueNDRangeKernel(queue, *kernel, ndims, nullptr,
                range.global_range(), range.local_range(), num_events,
                events_data, &event);
        OCL_CHECK(err);
        ocl_stream->set_deps({ocl_wrapper_t<cl_event>(event, true)});
    } else {
        // In-order queue: ordering is implicit, so only request a completion
        // event when profiling needs one.
        cl_int err = clEnqueueNDRangeKernel(queue, *kernel, ndims, nullptr,
                range.global_range(), range.local_range(), 0, nullptr,
                is_profiling_enabled() ? &event : nullptr);
        OCL_CHECK(err);
    }

    // 'event' is valid here on both paths when profiling is enabled.
    if (is_profiling_enabled()) register_profile_event(event, ocl_stream);
    return status::success;
}
218 | |
// Materializes an executable kernel from the stored device binary for the
// given engine. If 'programs' is provided it is used as a cl_program cache:
// an existing program for this binary is reused, and a newly built program
// is registered with the list (which then owns it).
status_t ocl_gpu_kernel_t::realize(compute::kernel_t *kernel,
        const engine_t *engine, compute::program_list_t *programs) const {
    // Only kernels in the serialized-binary state can be realized.
    assert(state_ == state_t::binary);
    // Empty binary: nothing to build, leave '*kernel' untouched.
    if (!binary_) return status::success;

    cl_int err;
    if (programs) {
        // Fast path: reuse an already-built program for this binary.
        auto *p = programs->get<cl_program>(binary_.get());
        if (p) {
            auto k = make_ocl_wrapper(clCreateKernel(p, name(), &err));
            OCL_CHECK(err);
            (*kernel) = compute::kernel_t(new ocl_gpu_kernel_t(k, arg_types_));
            return status::success;
        }
    }

    auto *compute_engine = utils::downcast<const ocl_gpu_engine_t *>(engine);
    cl_device_id dev = compute_engine->device();
    cl_context ctx = compute_engine->context();
    const unsigned char *binary_buffer = binary_->data();
    size_t binary_size = binary_->size();
    assert(binary_size > 0);

    // Build a cl_program from the stored device binary, then create the
    // kernel from it. The wrapper releases the program at scope exit unless
    // ownership is handed to the program list below.
    auto program = make_ocl_wrapper(clCreateProgramWithBinary(
            ctx, 1, &dev, &binary_size, &binary_buffer, nullptr, &err));
    OCL_CHECK(err);
    err = clBuildProgram(program, 1, &dev, nullptr, nullptr, nullptr);
    OCL_CHECK(err);

    auto ocl_kernel = make_ocl_wrapper(clCreateKernel(program, name(), &err));
    OCL_CHECK(err);
    (*kernel) = compute::kernel_t(new ocl_gpu_kernel_t(ocl_kernel, arg_types_));

    if (programs) {
        // Transfer ownership of the program to the list; release() stops the
        // wrapper from dropping the reference the list now holds.
        programs->add(binary_.get(), program.get());
        program.release();
    }

    return status::success;
}
259 | |
260 | status_t ocl_gpu_kernel_t::binary( |
261 | engine_t *engine, compute::binary_t &binary) const { |
262 | const auto *ocl_engine = utils::downcast<const ocl_gpu_engine_t *>(engine); |
263 | std::shared_ptr<compute::binary_t> shared_binary; |
264 | CHECK(get_ocl_program_binary( |
265 | ocl_kernel_, ocl_engine->device(), shared_binary)); |
266 | binary = std::move(*shared_binary); |
267 | return status::success; |
268 | } |
269 | |
270 | } // namespace ocl |
271 | } // namespace gpu |
272 | } // namespace impl |
273 | } // namespace dnnl |
274 | |