1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#include <assert.h>
18#include <string>
19#include <CL/cl.h>
20
21#include "gpu/ocl/ocl_gpu_kernel.hpp"
22
23#include "common/rw_mutex.hpp"
24#include "common/utils.hpp"
25#include "gpu/compute/program_list.hpp"
26#include "gpu/ocl/ocl_memory_storage.hpp"
27#include "gpu/ocl/ocl_stream.hpp"
28#include "gpu/ocl/ocl_usm_utils.hpp"
29#include "gpu/ocl/ocl_utils.hpp"
30#include "gpu/ocl/profile.hpp"
31#include "gpu/profile.hpp"
32
33namespace dnnl {
34namespace impl {
35namespace gpu {
36namespace ocl {
37
38// Kernel wrapper storing a per-thread copy of cl_kernel.
39class kernel_wrapper_t {
40public:
41 kernel_wrapper_t(cl_kernel kernel = nullptr) : kernel_(kernel) {}
42
43 operator cl_kernel() const { return kernel_; }
44
45 status_t set_arg(int arg_index, size_t arg_size, const void *arg_value) {
46 cl_int err = clSetKernelArg(kernel_, arg_index, arg_size, arg_value);
47 return convert_to_dnnl(err);
48 }
49
50 status_t set_svm_arg(int arg_index, const void *arg_value) {
51#ifdef CL_VERSION_2_0
52 cl_int err = clSetKernelArgSVMPointer(kernel_, arg_index, arg_value);
53 return convert_to_dnnl(err);
54#else
55 // SVM is not supported.
56 UNUSED(arg_index);
57 UNUSED(arg_value);
58 return status::runtime_error;
59#endif
60 }
61
62 status_t set_usm_arg(
63 engine_t *engine, int arg_index, const void *arg_value) {
64 return usm::set_kernel_arg_usm(engine, kernel_, arg_index, arg_value);
65 }
66
67private:
68 cl_kernel kernel_;
69};
70
// Hands out a private clone of the main cl_kernel to each calling thread.
// Cloning avoids races on clSetKernelArg state when several threads execute
// the same kernel concurrently.
class ocl_gpu_kernel_cache_t {
public:
    // Stores the handle to clone from. The cache does NOT take a reference
    // on main_kernel_; its lifetime is managed by the owning
    // ocl_gpu_kernel_t.
    ocl_gpu_kernel_cache_t(cl_kernel main_kernel) : main_kernel_(main_kernel) {}

    ~ocl_gpu_kernel_cache_t() {
        // Release only the per-thread clones; main_kernel_ is released by
        // its owner, not here.
        for (auto &kv : kernels_) {
            OCL_CHECK_V(clReleaseKernel(kv.second));
        }
    }

    // Returns (via *kernel) a pointer to this thread's kernel copy, cloning
    // it on first use. Pointers into kernels_ stay valid across later
    // insertions because std::unordered_map never invalidates references to
    // existing elements.
    status_t get(kernel_wrapper_t **kernel) {
        auto id = std::this_thread::get_id();
        {
            // Fast path: shared lock for the common lookup case.
            utils::lock_read_t lock_read(mutex_);
            auto it = kernels_.find(id);
            if (it != kernels_.end()) {
                *kernel = &it->second;
                return status::success;
            }
        }

        // No copy for this thread, clone the original kernel and save the
        // copy. No re-check is needed after re-acquiring the lock: the map
        // key is this thread's own id, so no other thread can have inserted
        // it in the meantime.
        cl_kernel cloned_kernel;
        CHECK(clone_kernel(main_kernel_, &cloned_kernel));

        utils::lock_write_t lock_write(mutex_);
        auto ret = kernels_.emplace(id, cloned_kernel);
        *kernel = &ret.first->second;
        return status::success;
    }

private:
    cl_kernel main_kernel_;
    std::unordered_map<std::thread::id, kernel_wrapper_t> kernels_;
    utils::rw_mutex_t mutex_;
};
108
// Wraps a live cl_kernel in kernel state. Shares ownership of the handle
// (retains it here, releases it in the destructor).
ocl_gpu_kernel_t::ocl_gpu_kernel_t(cl_kernel ocl_kernel,
        const std::vector<gpu::compute::scalar_type_t> &arg_types)
    : state_(state_t::kernel), ocl_kernel_(ocl_kernel), arg_types_(arg_types) {
    // Take our own reference on the caller's handle.
    OCL_CHECK_V(clRetainKernel(ocl_kernel_));
    // The cache hands out per-thread clones of this kernel so concurrent
    // argument binding does not race (see ocl_gpu_kernel_cache_t).
    cache_ = std::make_shared<ocl_gpu_kernel_cache_t>(ocl_kernel_);
}
115
ocl_gpu_kernel_t::~ocl_gpu_kernel_t() {
    // Balances the clRetainKernel() in the kernel-state constructor. The
    // guard handles instances where ocl_kernel_ is null (presumably kernels
    // constructed in binary state elsewhere — constructor not visible here).
    if (ocl_kernel_) OCL_CHECK_V(clReleaseKernel(ocl_kernel_));
}
119
// Binds all kernel arguments and enqueues the kernel on the stream's OpenCL
// queue over the given ND-range. Returns success without enqueueing when the
// range is empty. Requires the object to be in kernel (not binary) state.
status_t ocl_gpu_kernel_t::parallel_for(stream_t &stream,
        const compute::nd_range_t &range,
        const compute::kernel_arg_list_t &arg_list) {
    assert(state_ == state_t::kernel);

    auto *ocl_stream = utils::downcast<ocl_stream_t *>(&stream);
    cl_command_queue queue = ocl_stream->queue();

    // Arguments are set on this thread's private clone of the kernel, so
    // concurrent parallel_for calls do not race on clSetKernelArg state.
    kernel_wrapper_t *kernel = nullptr;
    CHECK(cache_->get(&kernel));
    CHECK(gpu::compute::check_scalar_arguments(arg_list, arg_types_));
    for (int i = 0; i < arg_list.nargs(); ++i) {
        auto &arg = arg_list.get(i);
        if (arg.is_global()) {
            auto *mem_storage
                    = static_cast<const memory_storage_t *>(arg.value());
            if (!mem_storage->is_null()) {
                auto *ocl_mem_storage
                        = utils::downcast<const ocl_memory_storage_base_t *>(
                                mem_storage);

                // Validate that the OpenCL contexts match for execution
                // context and memory.
                auto stream_ocl_ctx
                        = utils::downcast<ocl_gpu_engine_t *>(stream.engine())
                                  ->context();
                auto memory_storage_ocl_ctx
                        = utils::downcast<ocl_gpu_engine_t *>(
                                ocl_mem_storage->engine())
                                  ->context();
                if (stream_ocl_ctx != memory_storage_ocl_ctx) {
                    MAYBE_REPORT_ERROR(
                            "mismatched OpenCL context for primitive/memory");
                    return status::invalid_arguments;
                }

                // Dispatch on how the storage was allocated: cl_mem buffers
                // go through clSetKernelArg, USM pointers through the USM
                // extension entry point.
                switch (ocl_mem_storage->memory_kind()) {
                    case memory_kind::buffer: {
                        auto *m = utils::downcast<
                                const ocl_buffer_memory_storage_t *>(
                                ocl_mem_storage);
                        auto ocl_mem = m->mem_object();
                        CHECK(kernel->set_arg(i, sizeof(cl_mem), &ocl_mem));
                        break;
                    }
                    case memory_kind::usm: {
                        auto *m = utils::downcast<
                                const ocl_usm_memory_storage_t *>(
                                ocl_mem_storage);
                        auto *usm_ptr = m->usm_ptr();
                        CHECK(kernel->set_usm_arg(stream.engine(), i, usm_ptr));
                        break;
                    }
                    default: assert(!"not expected");
                }
            } else {
                // Null storage: still must bind something at index i. Prefer
                // a null USM pointer when supported, else a null cl_mem.
                if (usm::is_usm_supported(stream.engine())) {
                    CHECK(kernel->set_usm_arg(stream.engine(), i, nullptr));
                } else {
                    cl_mem null_mem = nullptr;
                    CHECK(kernel->set_arg(i, sizeof(cl_mem), &null_mem));
                }
            }
        } else if (arg.is_local()) {
            // Local (SLM) argument: size only, value pointer is the payload.
            CHECK(kernel->set_arg(i, arg.size(), arg.value()));
        } else if (arg.is_svm_pointer()) {
            CHECK(kernel->set_svm_arg(i, arg.value()));
        } else {
            // Scalar argument passed by value.
            CHECK(kernel->set_arg(i, arg.size(), arg.value()));
        }
    }

    cl_uint ndims = static_cast<cl_uint>(range.ndims());
    // Nothing to execute for an empty range.
    if (range.is_zero()) { return status::success; }

    // `event` is written only on the paths that need it: always for
    // out-of-order streams, and only under profiling for in-order streams.
    cl_event event;
    if (ocl_stream->flags() & stream_flags::out_of_order) {
        // Out-of-order queue: explicitly wait on the stream's recorded
        // dependencies and record the new event as the stream's sole dep.
        const auto &event_wrappers = ocl_stream->get_deps();
        std::vector<cl_event> events(
                event_wrappers.begin(), event_wrappers.end());

        cl_uint num_events = events.size();
        const cl_event *events_data = num_events ? events.data() : nullptr;
        cl_int err = clEnqueueNDRangeKernel(queue, *kernel, ndims, nullptr,
                range.global_range(), range.local_range(), num_events,
                events_data, &event);
        OCL_CHECK(err);
        // NOTE(review): the wrapper is constructed with retain=true, so the
        // deps list holds its own reference to the event — confirm against
        // ocl_wrapper_t's constructor semantics.
        ocl_stream->set_deps({ocl_wrapper_t<cl_event>(event, true)});
    } else {
        cl_int err = clEnqueueNDRangeKernel(queue, *kernel, ndims, nullptr,
                range.global_range(), range.local_range(), 0, nullptr,
                is_profiling_enabled() ? &event : nullptr);
        OCL_CHECK(err);
    }

    if (is_profiling_enabled()) register_profile_event(event, ocl_stream);
    return status::success;
}
218
// Turns a binary-state kernel into an executable compute::kernel_t for the
// given engine. If a program cache (`programs`) is supplied, a previously
// built cl_program for this binary is reused; otherwise the binary is built
// from scratch and, when a cache exists, ownership of the new program is
// handed to it. Returns success without touching *kernel if binary_ is empty.
status_t ocl_gpu_kernel_t::realize(compute::kernel_t *kernel,
        const engine_t *engine, compute::program_list_t *programs) const {
    assert(state_ == state_t::binary);
    if (!binary_) return status::success;

    cl_int err;
    if (programs) {
        // Cache hit: create a kernel from the already-built program. The
        // local wrapper `k` releases its reference on scope exit; the new
        // ocl_gpu_kernel_t retains its own in its constructor.
        auto *p = programs->get<cl_program>(binary_.get());
        if (p) {
            auto k = make_ocl_wrapper(clCreateKernel(p, name(), &err));
            OCL_CHECK(err);
            (*kernel) = compute::kernel_t(new ocl_gpu_kernel_t(k, arg_types_));
            return status::success;
        }
    }

    // Cache miss (or no cache): build the program from the stored binary.
    auto *compute_engine = utils::downcast<const ocl_gpu_engine_t *>(engine);
    cl_device_id dev = compute_engine->device();
    cl_context ctx = compute_engine->context();
    const unsigned char *binary_buffer = binary_->data();
    size_t binary_size = binary_->size();
    assert(binary_size > 0);

    auto program = make_ocl_wrapper(clCreateProgramWithBinary(
            ctx, 1, &dev, &binary_size, &binary_buffer, nullptr, &err));
    OCL_CHECK(err);
    err = clBuildProgram(program, 1, &dev, nullptr, nullptr, nullptr);
    OCL_CHECK(err);

    auto ocl_kernel = make_ocl_wrapper(clCreateKernel(program, name(), &err));
    OCL_CHECK(err);
    (*kernel) = compute::kernel_t(new ocl_gpu_kernel_t(ocl_kernel, arg_types_));

    if (programs) {
        // The cache takes ownership of the program; release() stops the
        // local wrapper from double-freeing it on scope exit.
        programs->add(binary_.get(), program.get());
        program.release();
    }

    return status::success;
}
259
260status_t ocl_gpu_kernel_t::binary(
261 engine_t *engine, compute::binary_t &binary) const {
262 const auto *ocl_engine = utils::downcast<const ocl_gpu_engine_t *>(engine);
263 std::shared_ptr<compute::binary_t> shared_binary;
264 CHECK(get_ocl_program_binary(
265 ocl_kernel_, ocl_engine->device(), shared_binary));
266 binary = std::move(*shared_binary);
267 return status::success;
268}
269
270} // namespace ocl
271} // namespace gpu
272} // namespace impl
273} // namespace dnnl
274