/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <cstring>

#include <CL/cl.h>

#include "gpu/ocl/ocl_stream.hpp"

#include "common/verbose.hpp"
#include "gpu/ocl/ocl_memory_storage.hpp"
#include "gpu/ocl/ocl_utils.hpp"
#include "gpu/ocl/profile.hpp"
#include "gpu/profile.hpp"

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
status_t ocl_stream_t::init() {
    if (is_profiling_enabled()) {
        mdapi_helper_ = utils::make_unique<mdapi_helper_t>();
    }
    // Reset queue_ and restore it only on successful exit; otherwise the
    // queue may be released without having been retained.
    cl_command_queue queue = queue_;
    queue_ = nullptr;

    assert(engine()->kind() == engine_kind::gpu);

    ocl_gpu_engine_t *ocl_engine
            = utils::downcast<ocl_gpu_engine_t *>(engine());

    // Create a queue if one was not provided
    if (!queue) {
        cl_int err;
        queue = create_queue(ocl_engine->context(), ocl_engine->device(), &err);
        OCL_CHECK(err);
    } else {
        // Check that the queue is compatible with the engine
        cl_context ocl_ctx;
        OCL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT,
                sizeof(cl_context), &ocl_ctx, nullptr));

        cl_device_id ocl_dev;
        OCL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE,
                sizeof(cl_device_id), &ocl_dev, nullptr));

        if (ocl_engine->device() != ocl_dev || ocl_engine->context() != ocl_ctx)
            return status::invalid_arguments;

        OCL_CHECK(clRetainCommandQueue(queue));
    }
    queue_ = queue;

    if (gpu::is_profiling_enabled()) {
        cl_command_queue_properties props;
        OCL_CHECK(clGetCommandQueueInfo(
                queue_, CL_QUEUE_PROPERTIES, sizeof(props), &props, nullptr));
        bool is_out_of_order
                = (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
        if (is_out_of_order) {
            if (get_verbose()) {
                printf("onednn_verbose,gpu,error,OpenCL kernel profiling is "
                       "not supported with out-of-order queues\n");
                fflush(nullptr);
            }
            return status::invalid_arguments;
        }
    }

    return status::success;
}

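// Creates an OpenCL command queue on the given context and device. When
// profiling is enabled, the MDAPI-instrumented queue is preferred; if that
// is not available, a regular queue is created with the profiling or
// out-of-order property derived from the stream flags.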
cl_command_queue ocl_stream_t::create_queue(
        cl_context ctx, cl_device_id dev, cl_int *err) const {
    if (is_profiling_enabled() && mdapi_helper_) {
        auto ret = mdapi_helper_->create_queue(ctx, dev, err);
        if (ret) return ret;
    }

    const bool is_out_of_order = (flags() & stream_flags::out_of_order);
    if (is_out_of_order) assert(!is_profiling_enabled());
#ifdef CL_VERSION_2_0
    cl_queue_properties profiling_props[]
            = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    cl_queue_properties out_of_order_props[]
            = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0};
    return clCreateCommandQueueWithProperties(ctx, dev,
            is_profiling_enabled()
                    ? profiling_props
                    : is_out_of_order ? out_of_order_props : nullptr,
            err);
#else
    return clCreateCommandQueue(ctx, dev,
            is_profiling_enabled()
                    ? CL_QUEUE_PROFILING_ENABLE
                    : is_out_of_order ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
                                      : 0,
            err);
#endif
}

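// Invoked right before a primitive is executed on this stream; registers
// the upcoming submission with the profiler when profiling is enabled.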
void ocl_stream_t::before_exec_hook() {
    if (is_profiling_enabled()) notify_before_exec();
}

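// Invoked right after a primitive is executed on this stream; clears the
// recorded event dependencies.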
void ocl_stream_t::after_exec_hook() {
    set_deps({});
}

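// Copies `size` bytes from `src` to `dst`. Three cases are handled:
// host (native CPU runtime) to GPU via USM memcpy or a blocking
// clEnqueueWriteBuffer, GPU to host via USM memcpy or a blocking
// clEnqueueReadBuffer, and a generic fallback that maps both storages and
// copies with std::memcpy. A minimal usage sketch (assuming `stream` is an
// ocl_stream_t and `src_storage`/`dst_storage` are memory_storage_t objects
// created on compatible engines; names are illustrative only):
//
//     CHECK(stream->copy(src_storage, dst_storage, nbytes));
//     CHECK(stream->wait());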
status_t ocl_stream_t::copy(
        const memory_storage_t &src, const memory_storage_t &dst, size_t size) {

    if (size == 0) return status::success;

    if (src.engine()->kind() == engine_kind::cpu
            && is_native_runtime(src.engine()->runtime_kind())) {
        assert(dst.engine()->kind() == engine_kind::gpu);

        void *src_ptr = nullptr;
        src.get_data_handle(&src_ptr);

        const auto *ocl_dst
                = utils::downcast<const ocl_memory_storage_base_t *>(&dst);
        bool usm_dst = ocl_dst->memory_kind() == memory_kind::usm;

        if (usm_dst) {
            const auto *ocl_usm_dst
                    = utils::downcast<const ocl_usm_memory_storage_t *>(
                            ocl_dst);
            CHECK(usm::memcpy(this, ocl_usm_dst->usm_ptr(), src_ptr, size));
        } else {
            const auto *ocl_buffer_dst
                    = utils::downcast<const ocl_buffer_memory_storage_t *>(
                            ocl_dst);

            cl_mem ocl_mem = ocl_buffer_dst->mem_object();
            cl_int err = clEnqueueWriteBuffer(queue(), ocl_mem, CL_TRUE, 0,
                    size, src_ptr, 0, nullptr, nullptr);
            OCL_CHECK(err);
        }
    } else if (dst.engine()->kind() == engine_kind::cpu
            && is_native_runtime(dst.engine()->runtime_kind())) {
        assert(src.engine()->kind() == engine_kind::gpu);

        void *dst_ptr = nullptr;
        dst.get_data_handle(&dst_ptr);

        const auto *ocl_src
                = utils::downcast<const ocl_memory_storage_base_t *>(&src);
        bool usm_src = ocl_src->memory_kind() == memory_kind::usm;

        if (usm_src) {
            const auto *ocl_usm_src
                    = utils::downcast<const ocl_usm_memory_storage_t *>(
                            ocl_src);
            CHECK(usm::memcpy(this, dst_ptr, ocl_usm_src->usm_ptr(), size));
        } else {
            const auto *ocl_buffer_src
                    = utils::downcast<const ocl_buffer_memory_storage_t *>(
                            ocl_src);

            cl_mem ocl_mem = ocl_buffer_src->mem_object();
            cl_int err = clEnqueueReadBuffer(queue(), ocl_mem, CL_TRUE, 0, size,
                    dst_ptr, 0, nullptr, nullptr);
            OCL_CHECK(err);
        }
    } else {
        wait();

        // Use map/unmap
        void *src_mapped_ptr;
        void *dst_mapped_ptr;

        CHECK(src.map_data(&src_mapped_ptr, this, size));
        CHECK(dst.map_data(&dst_mapped_ptr, this, size));

        std::memcpy(static_cast<void *>(dst_mapped_ptr),
                static_cast<const void *>(src_mapped_ptr), size);

        CHECK(src.unmap_data(src_mapped_ptr, this));
        CHECK(dst.unmap_data(dst_mapped_ptr, this));
    }
    return status::success;
}

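// Fills `size` bytes of `dst` with the byte `pattern`: USM storages go
// through usm::fill, OpenCL buffer storages through clEnqueueFillBuffer at
// the storage offset.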
status_t ocl_stream_t::fill(
        const memory_storage_t &dst, uint8_t pattern, size_t size) {
    using namespace dnnl::impl::utils;

    const auto *ocl_dst = downcast<const ocl_memory_storage_base_t *>(&dst);

    if (ocl_dst->memory_kind() == memory_kind::usm) {
        const auto *ocl_usm_dst
                = downcast<const ocl_usm_memory_storage_t *>(ocl_dst);
        CHECK(usm::fill(
                this, ocl_usm_dst->usm_ptr(), &pattern, sizeof(pattern), size));
    } else {
        const auto *ocl_buffer_dst
                = downcast<const ocl_buffer_memory_storage_t *>(ocl_dst);
        cl_int err = clEnqueueFillBuffer(queue(), ocl_buffer_dst->mem_object(),
                &pattern, sizeof(uint8_t), dst.offset(), size, 0, nullptr,
                nullptr);
        OCL_CHECK(err);
    }
    return status::success;
}

} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl