1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <cstring> |
18 | |
19 | #include <CL/cl.h> |
20 | |
21 | #include "gpu/ocl/ocl_stream.hpp" |
22 | |
23 | #include "common/verbose.hpp" |
24 | #include "gpu/ocl/ocl_memory_storage.hpp" |
25 | #include "gpu/ocl/ocl_utils.hpp" |
26 | #include "gpu/ocl/profile.hpp" |
27 | #include "gpu/profile.hpp" |
28 | |
29 | namespace dnnl { |
30 | namespace impl { |
31 | namespace gpu { |
32 | namespace ocl { |
33 | |
// Initializes the stream: creates or adopts an OpenCL command queue and
// validates it against the engine. Returns status::invalid_arguments when a
// user-supplied queue belongs to a different device/context or when kernel
// profiling is requested on an out-of-order queue.
status_t ocl_stream_t::init() {
    // MDAPI helper must exist before create_queue() is called below, since
    // profiling queues are created through it.
    if (is_profiling_enabled()) {
        mdapi_helper_ = utils::make_unique<mdapi_helper_t>();
    }
    // Restore queue on successful exit, otherwise queue may be released
    // without retain
    cl_command_queue queue = queue_;
    queue_ = nullptr;

    assert(engine()->kind() == engine_kind::gpu);

    ocl_gpu_engine_t *ocl_engine
            = utils::downcast<ocl_gpu_engine_t *>(engine());

    // Create queue if it is not set
    if (!queue) {
        cl_int err;
        queue = create_queue(ocl_engine->context(), ocl_engine->device(), &err);
        OCL_CHECK(err);
    } else {
        // Check that queue is compatible with the engine
        cl_context ocl_ctx;
        OCL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_CONTEXT,
                sizeof(cl_context), &ocl_ctx, nullptr));

        cl_device_id ocl_dev;
        OCL_CHECK(clGetCommandQueueInfo(queue, CL_QUEUE_DEVICE,
                sizeof(cl_device_id), &ocl_dev, nullptr));

        if (ocl_engine->device() != ocl_dev || ocl_engine->context() != ocl_ctx)
            return status::invalid_arguments;

        // The stream takes its own reference on a user-provided queue; it is
        // released by the stream's destructor (not visible in this file).
        OCL_CHECK(clRetainCommandQueue(queue));
    }
    queue_ = queue;

    if (gpu::is_profiling_enabled()) {
        // Profiling is rejected on out-of-order queues; query the actual
        // queue properties rather than trusting the stream flags, since the
        // queue may have been supplied by the user.
        cl_command_queue_properties props;
        OCL_CHECK(clGetCommandQueueInfo(
                queue_, CL_QUEUE_PROPERTIES, sizeof(props), &props, nullptr));
        bool is_out_of_order
                = (props & CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE) != 0;
        if (is_out_of_order) {
            if (get_verbose()) {
                printf("onednn_verbose,gpu,error,OpenCL kernel profiling is "
                       "not "
                       "supported with out-of-order queues\n" );
                fflush(nullptr);
            }
            return status::invalid_arguments;
        }
    }

    return status::success;
}
89 | |
// Creates a new OpenCL command queue on (ctx, dev) and stores the OpenCL
// status in *err. When profiling is enabled and the MDAPI helper is
// available, the queue is created through the helper; otherwise a plain
// queue is created with profiling or out-of-order properties derived from
// the stream flags. Profiling takes precedence over out-of-order (the two
// are mutually exclusive, see the assert below).
cl_command_queue ocl_stream_t::create_queue(
        cl_context ctx, cl_device_id dev, cl_int *err) const {
    if (is_profiling_enabled() && mdapi_helper_) {
        auto ret = mdapi_helper_->create_queue(ctx, dev, err);
        // Fall through to a regular queue if the helper could not create one.
        if (ret) return ret;
    }

    const bool is_out_of_order = (flags() & stream_flags::out_of_order);
    if (is_out_of_order) assert(!is_profiling_enabled());
#ifdef CL_VERSION_2_0
    // OpenCL 2.0+: properties are passed as a zero-terminated list.
    cl_queue_properties profiling_props[]
            = {CL_QUEUE_PROPERTIES, CL_QUEUE_PROFILING_ENABLE, 0};
    cl_queue_properties out_of_order_props[]
            = {CL_QUEUE_PROPERTIES, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, 0};
    return clCreateCommandQueueWithProperties(ctx, dev,
            is_profiling_enabled()
                    ? profiling_props
                    : is_out_of_order ? out_of_order_props : nullptr,
            err);
#else
    // Pre-2.0: properties are a plain bitfield (deprecated API).
    return clCreateCommandQueue(ctx, dev,
            is_profiling_enabled()
                    ? CL_QUEUE_PROFILING_ENABLE
                    : is_out_of_order ? CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
                                      : 0,
            err);
#endif
}
118 | |
119 | void ocl_stream_t::before_exec_hook() { |
120 | if (is_profiling_enabled()) notify_before_exec(); |
121 | } |
122 | |
// Called right after primitive execution; clears the stream's recorded
// event dependencies so they do not leak into the next submission.
void ocl_stream_t::after_exec_hook() {
    set_deps({});
}
126 | |
127 | status_t ocl_stream_t::copy( |
128 | const memory_storage_t &src, const memory_storage_t &dst, size_t size) { |
129 | |
130 | if (size == 0) return status::success; |
131 | |
132 | if (src.engine()->kind() == engine_kind::cpu |
133 | && is_native_runtime(src.engine()->runtime_kind())) { |
134 | assert(dst.engine()->kind() == engine_kind::gpu); |
135 | |
136 | void *src_ptr = nullptr; |
137 | src.get_data_handle(&src_ptr); |
138 | |
139 | const auto *ocl_dst |
140 | = utils::downcast<const ocl_memory_storage_base_t *>(&dst); |
141 | bool usm_dst = ocl_dst->memory_kind() == memory_kind::usm; |
142 | |
143 | if (usm_dst) { |
144 | const auto *ocl_usm_dst |
145 | = utils::downcast<const ocl_usm_memory_storage_t *>( |
146 | ocl_dst); |
147 | CHECK(usm::memcpy(this, ocl_usm_dst->usm_ptr(), src_ptr, size)); |
148 | } else { |
149 | const auto *ocl_buffer_dst |
150 | = utils::downcast<const ocl_buffer_memory_storage_t *>( |
151 | ocl_dst); |
152 | |
153 | cl_mem ocl_mem = ocl_buffer_dst->mem_object(); |
154 | cl_int err = clEnqueueWriteBuffer(queue(), ocl_mem, CL_TRUE, 0, |
155 | size, src_ptr, 0, nullptr, nullptr); |
156 | OCL_CHECK(err); |
157 | } |
158 | } else if (dst.engine()->kind() == engine_kind::cpu |
159 | && is_native_runtime(dst.engine()->runtime_kind())) { |
160 | assert(src.engine()->kind() == engine_kind::gpu); |
161 | |
162 | void *dst_ptr = nullptr; |
163 | dst.get_data_handle(&dst_ptr); |
164 | |
165 | const auto *ocl_src |
166 | = utils::downcast<const ocl_memory_storage_base_t *>(&src); |
167 | bool usm_src = ocl_src->memory_kind() == memory_kind::usm; |
168 | |
169 | if (usm_src) { |
170 | const auto *ocl_usm_src |
171 | = utils::downcast<const ocl_usm_memory_storage_t *>( |
172 | ocl_src); |
173 | CHECK(usm::memcpy(this, dst_ptr, ocl_usm_src->usm_ptr(), size)); |
174 | } else { |
175 | const auto *ocl_buffer_src |
176 | = utils::downcast<const ocl_buffer_memory_storage_t *>( |
177 | ocl_src); |
178 | |
179 | cl_mem ocl_mem = ocl_buffer_src->mem_object(); |
180 | cl_int err = clEnqueueReadBuffer(queue(), ocl_mem, CL_TRUE, 0, size, |
181 | dst_ptr, 0, nullptr, nullptr); |
182 | OCL_CHECK(err); |
183 | } |
184 | } else { |
185 | wait(); |
186 | |
187 | // Use map/unmap |
188 | void *src_mapped_ptr; |
189 | void *dst_mapped_ptr; |
190 | |
191 | CHECK(src.map_data(&src_mapped_ptr, this, size)); |
192 | CHECK(dst.map_data(&dst_mapped_ptr, this, size)); |
193 | |
194 | std::memcpy(static_cast<void *>(dst_mapped_ptr), |
195 | static_cast<const void *>(src_mapped_ptr), size); |
196 | |
197 | CHECK(src.unmap_data(src_mapped_ptr, this)); |
198 | CHECK(dst.unmap_data(dst_mapped_ptr, this)); |
199 | } |
200 | return status::success; |
201 | } |
202 | |
203 | status_t ocl_stream_t::fill( |
204 | const memory_storage_t &dst, uint8_t pattern, size_t size) { |
205 | using namespace dnnl::impl::utils; |
206 | |
207 | const auto *ocl_dst = downcast<const ocl_memory_storage_base_t *>(&dst); |
208 | |
209 | if (ocl_dst->memory_kind() == memory_kind::usm) { |
210 | const auto *ocl_usm_dst |
211 | = downcast<const ocl_usm_memory_storage_t *>(ocl_dst); |
212 | CHECK(usm::fill( |
213 | this, ocl_usm_dst->usm_ptr(), &pattern, sizeof(pattern), size)); |
214 | } else { |
215 | const auto *ocl_buffer_dst |
216 | = downcast<const ocl_buffer_memory_storage_t *>(ocl_dst); |
217 | cl_int err = clEnqueueFillBuffer(queue(), ocl_buffer_dst->mem_object(), |
218 | &pattern, sizeof(uint8_t), dst.offset(), size, 0, nullptr, |
219 | nullptr); |
220 | OCL_CHECK(err); |
221 | } |
222 | return status::success; |
223 | } |
224 | |
225 | } // namespace ocl |
226 | } // namespace gpu |
227 | } // namespace impl |
228 | } // namespace dnnl |
229 | |