1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <algorithm> |
18 | #include <atomic> |
19 | #include <cctype> |
20 | #include <memory> |
21 | #include <numeric> |
22 | #include <string> |
23 | |
24 | #include "oneapi/dnnl/dnnl.h" |
25 | |
26 | #ifdef DNNL_WITH_SYCL |
27 | #include "oneapi/dnnl/dnnl_sycl.hpp" |
28 | #endif |
29 | |
30 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
31 | #include "oneapi/dnnl/dnnl_ocl.hpp" |
32 | #include "src/gpu/ocl/ocl_usm_utils.hpp" |
33 | #endif |
34 | |
35 | #include "tests/test_thread.hpp" |
36 | |
37 | #include "dnn_types.hpp" |
38 | #include "dnnl_common.hpp" |
39 | #include "dnnl_memory.hpp" |
40 | #include "utils/dnnl_query.hpp" |
41 | #include "utils/parallel.hpp" |
42 | |
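// Internal library entry points (not part of the public API) used by
// benchdnn; declared manually here since they are not exposed in the public
// headers.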
43 | extern "C" dnnl_status_t dnnl_memory_desc_create_with_string_tag( |
44 | dnnl_memory_desc_t *, int, const dnnl_dims_t, dnnl_data_type_t, |
45 | const char *); |
46 | |
47 | extern "C" dnnl_status_t dnnl_memory_desc_set_data_type( |
48 | dnnl_memory_desc_t memory_desc, dnnl_data_type_t data_type); |
49 | |
50 | dnn_mem_t::dnn_mem_t(const_dnnl_memory_desc_t md, dnnl_engine_t engine, |
51 | const handle_info_t &handle_info) { |
52 | if (query_md_ndims(md) > 0) { |
53 | auto status = dnnl_memory_desc_clone(&md_, md); |
54 | (void)status; |
55 | assert(status == dnnl_success); |
56 | active_ = (initialize(engine, handle_info) == OK); |
57 | } |
58 | } |
59 | |
60 | dnn_mem_t::dnn_mem_t(const_dnnl_memory_desc_t md, dnnl_data_type_t dt, |
61 | const std::string &tag, dnnl_engine_t engine) { |
62 | const int ndims = query_md_ndims(md); |
63 | if (ndims > 0) { |
64 | auto md_wrapper = dnn_mem_t::init_md(ndims, query_md_dims(md), dt, tag); |
65 | md_ = md_wrapper.release(); |
66 | active_ = (initialize(engine) == OK); |
67 | } |
68 | } |
69 | |
70 | dnn_mem_t::dnn_mem_t(int ndims, const dnnl_dims_t dims, dnnl_data_type_t dt, |
71 | const std::string &tag, dnnl_engine_t engine) { |
72 | if (ndims > 0) { |
73 | auto md_wrapper = dnn_mem_t::init_md(ndims, dims, dt, tag); |
74 | md_ = md_wrapper.release(); |
75 | active_ = (initialize(engine) == OK); |
76 | } |
77 | } |
78 | |
79 | dnn_mem_t::dnn_mem_t(int ndims, const dnnl_dims_t dims, dnnl_data_type_t dt, |
80 | const dnnl_dims_t strides, dnnl_engine_t engine) { |
81 | if (ndims > 0) { |
82 | auto status = dnnl_memory_desc_create_with_strides( |
83 | &md_, ndims, dims, dt, strides); |
84 | (void)status; |
85 | assert(status == dnnl_success); |
86 | active_ = (initialize(engine) == OK); |
87 | } |
88 | } |
89 | |
90 | dnn_mem_t::dnn_mem_t(const dnn_mem_t &rhs, dnnl_data_type_t dt, |
91 | const std::string &tag, dnnl_engine_t engine) |
92 | : dnn_mem_t(rhs.md_, dt, tag, engine) { |
93 | if (active_) reorder(rhs); |
94 | } |
95 | |
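// Reorders `src` into `dst` using a library reorder primitive, applying the
// optional primitive attributes `attr`.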
96 | int execute_reorder(const dnn_mem_t &src, dnn_mem_t &dst, |
97 | const_dnnl_primitive_attr_t attr) { |
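    // Non-owning aliases of src and dst; the no-op deleters prevent the
    // shared_ptr from destroying objects owned by the caller.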
98 | std::shared_ptr<const dnn_mem_t> r_src(&src, [](const dnn_mem_t *) {}); |
99 | std::shared_ptr<dnn_mem_t> r_dst(&dst, [](dnn_mem_t *) {}); |
100 | |
101 | dnnl_primitive_desc_t r_pd_ {}; |
102 | dnnl_primitive_t prim_ {}; |
103 | |
    // Optimization to reduce testing time for GPU.
    //
    // For CPU <-> GPU reorders, the library creates GPU-side kernels.
    // Benchdnn heavily relies on reorders, and this greatly increases
    // execution time because of the big overhead of building OpenCL kernels.
    //
    // First, try to create a CPU reorder for the requested GPU reorder. If
    // that succeeds, create CPU memory objects wrapping the mapped pointers
    // of the source and destination and execute a CPU reorder. If a CPU
    // reorder can't be created, just execute a regular GPU reorder.
    //
    // This optimization is skipped when testing the reorder, sum and concat
    // primitives because they are used specifically to test GPU reorders.
117 | #if ((DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) \ |
118 | || (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL)) \ |
119 | && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE |
120 | bool is_reorder_related_driver = (driver_name == "reorder" |
            || driver_name == "sum" || driver_name == "concat");
122 | const auto &cpu_engine = get_cpu_engine(); |
123 | if (!is_reorder_related_driver |
124 | && (src.engine_kind() == dnnl_gpu |
125 | || dst.engine_kind() == dnnl_gpu)) { |
126 | |
127 | dnnl_status_t status = dnnl_reorder_primitive_desc_create( |
128 | &r_pd_, src.md_, cpu_engine, dst.md_, cpu_engine, attr); |
129 | if (status == dnnl_success) { |
130 | // Create CPU memory objects wrapping mapped pointers of source and |
131 | // destination |
132 | r_src = std::make_shared<dnn_mem_t>(dnn_mem_t::create_from_host_ptr( |
133 | src.md_, cpu_engine, (void *)src)); |
134 | r_dst = std::make_shared<dnn_mem_t>(dnn_mem_t::create_from_host_ptr( |
135 | dst.md_, cpu_engine, (void *)dst)); |
136 | } |
137 | } |
138 | #endif |
139 | |
140 | if (!r_pd_) { |
141 | DNN_SAFE(dnnl_reorder_primitive_desc_create(&r_pd_, src.md_, |
142 | src.engine(), dst.md_, dst.engine(), attr), |
143 | WARN); |
144 | } |
145 | auto r_pd = make_benchdnn_dnnl_wrapper(r_pd_); |
146 | const auto &scratchpad_md = query_md(r_pd, DNNL_ARG_SCRATCHPAD); |
147 | dnn_mem_t scratchpad(scratchpad_md, src.engine()); |
148 | |
149 | DNN_SAFE(dnnl_primitive_create(&prim_, r_pd), CRIT); |
150 | auto prim = make_benchdnn_dnnl_wrapper(prim_); |
151 | |
152 | args_t args; |
153 | args.set(DNNL_ARG_FROM, *r_src); |
154 | args.set(DNNL_ARG_TO, *r_dst); |
155 | args.set(DNNL_ARG_SCRATCHPAD, scratchpad); |
156 | |
157 | return execute_and_wait(prim, args); |
158 | } |

int dnn_mem_t::reorder(const dnn_mem_t &rhs, const_dnnl_primitive_attr_t attr) {
160 | if (this == &rhs) return OK; |
161 | return execute_reorder(rhs, *this, attr); |
162 | } |
163 | |
164 | size_t dnn_mem_t::size() const { |
165 | return dnnl_memory_desc_get_size(md_); |
166 | } |
167 | |
168 | size_t dnn_mem_t::sizeof_dt() const { |
169 | return dnnl_data_type_size(dt()); |
170 | } |
171 | |
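// Returns the element at index idx of the underlying buffer, converted to
// float.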
172 | float dnn_mem_t::get_elem(int64_t idx) const { |
173 | void *data = (void *)*this; |
174 | float elem = 0.0; |
175 | switch (dt()) { |
176 | case dnnl_s8: elem = static_cast<int8_t *>(data)[idx]; break; |
177 | case dnnl_u8: elem = static_cast<uint8_t *>(data)[idx]; break; |
178 | case dnnl_s32: elem = static_cast<int32_t *>(data)[idx]; break; |
179 | case dnnl_f32: elem = static_cast<float *>(data)[idx]; break; |
180 | case dnnl_f64: elem = static_cast<double *>(data)[idx]; break; |
181 | case dnnl_f16: elem = static_cast<float16_t *>(data)[idx]; break; |
182 | case dnnl_bf16: elem = static_cast<bfloat16_t *>(data)[idx]; break; |
183 | default: assert(!"bad data type" ); |
184 | } |
185 | return elem; |
186 | } |
187 | |
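// Stores `value` into index idx of the underlying buffer, converting it to
// the memory data type.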
188 | void dnn_mem_t::set_elem(int64_t idx, float value) const { |
189 | void *data = (void *)*this; |
190 | switch (dt()) { |
191 | case dnnl_s8: ((int8_t *)data)[idx] = value; break; |
192 | case dnnl_u8: ((uint8_t *)data)[idx] = value; break; |
193 | case dnnl_s32: ((int32_t *)data)[idx] = value; break; |
194 | case dnnl_f32: ((float *)data)[idx] = value; break; |
195 | case dnnl_f64: ((double *)data)[idx] = value; break; |
196 | case dnnl_f16: ((float16_t *)data)[idx] = value; break; |
197 | case dnnl_bf16: ((bfloat16_t *)data)[idx] = value; break; |
198 | default: assert(!"bad data type" ); |
199 | } |
200 | } |
201 | |
202 | // Creates a memory object from the underlying buffer of an existing memory |
203 | // object `mem`. The size of `mem` must not be less than the size of `md`. |
204 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || defined(DNNL_WITH_SYCL) |
205 | static int init_memory( |
206 | dnnl_memory_t *ret, const dnnl_memory_desc_t &md, dnnl_memory_t mem) { |
207 | void *handle; |
208 | DNN_SAFE(dnnl_memory_get_data_handle(mem, &handle), CRIT); |
209 | |
210 | dnnl_engine_t engine; |
211 | DNN_SAFE(dnnl_memory_get_engine(mem, &engine), CRIT); |
212 | |
213 | bool is_sycl = is_sycl_engine(engine); |
214 | bool is_opencl = is_opencl_engine(engine); |
215 | |
216 | *ret = nullptr; |
217 | |
218 | if (is_opencl) { |
219 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
220 | dnnl_ocl_interop_memory_kind_t mem_kind; |
221 | DNN_SAFE(dnnl_ocl_interop_memory_get_memory_kind(mem, &mem_kind), CRIT); |
222 | DNN_SAFE(dnnl_ocl_interop_memory_create( |
223 | ret, md, engine, mem_kind, handle), |
224 | CRIT); |
225 | #endif |
226 | } else if (is_sycl) { |
227 | #ifdef DNNL_WITH_SYCL |
228 | dnnl_sycl_interop_memory_kind_t mem_kind; |
229 | DNN_SAFE( |
230 | dnnl_sycl_interop_memory_get_memory_kind(mem, &mem_kind), CRIT); |
231 | DNN_SAFE(dnnl_sycl_interop_memory_create( |
232 | ret, md, engine, mem_kind, handle), |
233 | CRIT); |
234 | #endif |
235 | } |
236 | |
    // Memory must have been initialized by one of the branches above.
    if (!*ret) assert(!"not expected");
239 | |
240 | return OK; |
241 | } |
242 | #endif |
243 | |
244 | void dnn_mem_t::map() const { |
    assert(!is_mapped_ && "memory is already mapped");
246 | is_mapped_ = true; |
247 | |
248 | if (!m_) return; |
249 | auto mem = m_padded_ ? m_padded_ : m_; |
250 | DNN_SAFE_V(dnnl_memory_map_data(mem, &mapped_ptr_)); |
251 | } |
252 | |
253 | void dnn_mem_t::unmap() const { |
    assert(is_mapped_ && "memory is not mapped");
255 | is_mapped_ = false; |
256 | |
257 | if (!m_) return; |
258 | auto mem = m_padded_ ? m_padded_ : m_; |
259 | DNN_SAFE_V(dnnl_memory_unmap_data(mem, mapped_ptr_)); |
260 | mapped_ptr_ = nullptr; |
261 | } |
262 | |
263 | void dnn_mem_t::memset(int value, size_t size) const { |
264 | bool is_opencl = is_opencl_engine(engine_); |
265 | bool is_sycl = is_sycl_engine(engine_); |
266 | auto mem = m_padded_ ? m_padded_ : m_; |
267 | void *mem_handle; |
268 | DNN_SAFE_V(dnnl_memory_get_data_handle(mem, &mem_handle)); |
269 | if (is_opencl) { |
270 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
271 | stream_t stream(engine_); |
272 | switch (memory_kind) { |
273 | case memory_kind_ext_t::buffer: { |
274 | auto buf = static_cast<cl_mem>(mem_handle); |
275 | cl_command_queue queue; |
276 | DNN_SAFE_V(dnnl_ocl_interop_stream_get_command_queue( |
277 | stream, &queue)); |
278 | cl_int err = clEnqueueFillBuffer(queue, buf, &value, |
279 | sizeof(uint8_t), 0, size, 0, nullptr, nullptr); |
280 | if (err != CL_SUCCESS) SAFE_V(FAIL); |
281 | DNN_SAFE_V(dnnl_stream_wait(stream)); |
282 | return; |
283 | } |
284 | case memory_kind_ext_t::usm: |
285 | case memory_kind_ext_t::usm_device: |
286 | case memory_kind_ext_t::usm_shared: { |
287 | DNN_SAFE_V(dnnl::impl::gpu::ocl::usm::memset( |
288 | stream, mem_handle, value, size)); |
289 | DNN_SAFE_V(dnnl_stream_wait(stream)); |
290 | return; |
291 | } |
292 | } |
293 | #endif |
294 | } else if (is_sycl) { |
295 | #ifdef DNNL_WITH_SYCL |
296 | stream_t stream(engine_); |
297 | void *queue_ptr; |
298 | DNN_SAFE_V(dnnl_sycl_interop_stream_get_queue(stream, &queue_ptr)); |
299 | auto &queue = *static_cast<::sycl::queue *>(queue_ptr); |
300 | switch (memory_kind) { |
301 | case memory_kind_ext_t::buffer: { |
302 | auto &buf = *static_cast<::sycl::buffer<uint8_t, 1> *>( |
303 | mem_handle); |
304 | queue.submit([&](::sycl::handler &cgh) { |
305 | #ifdef DNNL_SYCL_INTEROP_USE_SYCL121 |
306 | constexpr auto target_device |
307 | = ::sycl::target::global_buffer; |
308 | #else |
309 | constexpr auto target_device = ::sycl::target::device; |
310 | #endif |
311 | ::sycl::accessor<uint8_t, 1, ::sycl::access::mode::write, |
312 | target_device> |
313 | acc(buf, cgh); |
314 | cgh.fill(acc, static_cast<uint8_t>(value)); |
315 | }); |
316 | DNN_SAFE_V(dnnl_stream_wait(stream)); |
317 | return; |
318 | } |
319 | case memory_kind_ext_t::usm: |
320 | case memory_kind_ext_t::usm_device: |
321 | case memory_kind_ext_t::usm_shared: { |
322 | queue.submit([&](::sycl::handler &cgh) { |
323 | cgh.memset(mem_handle, value, size); |
324 | }); |
325 | DNN_SAFE_V(dnnl_stream_wait(stream)); |
326 | return; |
327 | } |
328 | } |
329 | #endif |
330 | } |
331 | if (is_cpu(engine_)) { |
332 | ::memset(mem_handle, value, size); |
333 | return; |
334 | } |
335 | SAFE_V(FAIL); |
336 | } |
337 | |
338 | dnn_mem_t dnn_mem_t::create_from_host_ptr( |
339 | const dnnl_memory_desc_t &md, dnnl_engine_t engine, void *host_ptr) { |
340 | return dnn_mem_t(md, engine, {true, host_ptr}); |
341 | } |
342 | |
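// Pads GPU allocation sizes with extra canary bytes (in correctness mode
// only) so that out-of-bounds writes can be detected later by
// check_buffer_overwrite().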
343 | size_t dnn_mem_t::pad_memory_size( |
344 | size_t sz, dnnl_engine_kind_t engine_kind, bool *was_padded) { |
345 | if (was_padded) *was_padded = false; |
346 | if (sz == 0 || !is_bench_mode(CORR) || engine_kind == dnnl_cpu) return sz; |
347 | |
348 | const int pad_size = 4096; |
349 | if (was_padded) *was_padded = true; |
350 | return sz + pad_size; |
351 | } |
352 | |
353 | dnnl_memory_desc_t dnn_mem_t::pad_memory_desc(const_dnnl_memory_desc_t md, |
354 | dnnl_engine_kind_t engine_kind, bool *was_padded) { |
355 | if (was_padded) *was_padded = false; |
356 | size_t old_sz = dnnl_memory_desc_get_size(md); |
357 | if (old_sz == 0 || !is_bench_mode(CORR) || engine_kind == dnnl_cpu) |
358 | return nullptr; |
359 | |
360 | size_t sz = pad_memory_size(old_sz, engine_kind, was_padded); |
361 | if (sz == old_sz) return nullptr; |
362 | |
363 | dnnl_memory_desc_t ret; |
364 | dnnl_dims_t dims = {(dnnl_dim_t)sz}; |
365 | DNN_SAFE_V( |
366 | dnnl_memory_desc_create_with_tag(&ret, 1, dims, dnnl_u8, dnnl_x)); |
367 | return ret; |
368 | } |
369 | |
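// Creates a memory descriptor from ndims/dims/data_type and either a format
// tag or explicit strides. When strides_ is non-empty, it takes precedence
// over tag_.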
370 | benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> dnn_mem_t::init_md(int ndims, |
371 | const dnnl_dims_t dims, dnnl_data_type_t data_type, |
372 | const std::string &tag_, const dims_t &strides_) { |
373 | dnnl_memory_desc_t md {}; |
374 | const bool use_strides = !strides_.empty(); |
    // Ignore tag_ when strides_ are explicitly provided.
376 | if (use_strides) { |
377 | std::vector<dnnl_dim_t> strides(strides_); |
378 | DNN_SAFE_V(dnnl_memory_desc_create_with_strides( |
379 | &md, ndims, dims, data_type, strides.data())); |
380 | return md; |
381 | } |
382 | |
383 | auto tag = normalize_tag(tag_, ndims); |
384 | if (tag == tag::undef || tag == tag::any || ndims == 0) { |
385 | dnnl_format_tag_t enum_tag = (tag == tag::undef || ndims == 0) |
386 | ? dnnl_format_tag_undef |
387 | : dnnl_format_tag_any; |
388 | DNN_SAFE_V(dnnl_memory_desc_create_with_tag( |
389 | &md, ndims, dims, data_type, enum_tag)); |
390 | return md; |
391 | } |
392 | |
393 | DNN_SAFE_V(dnnl_memory_desc_create_with_string_tag( |
394 | &md, ndims, dims, data_type, tag.data())); |
395 | |
396 | return md; |
397 | } |
398 | |
399 | int dnn_mem_t::initialize_memory_create_sycl(const handle_info_t &handle_info) { |
400 | #ifdef DNNL_WITH_SYCL |
401 | if (handle_info.is_host_ptr) { |
402 | // Ignore memory_kind with host pointers and force USM. |
403 | DNN_SAFE(dnnl_sycl_interop_memory_create(&m_, md_, engine_, |
404 | dnnl_sycl_interop_usm, handle_info.ptr), |
405 | CRIT); |
406 | return OK; |
407 | } |
408 | |
409 | auto md_padded = pad_memory_desc(md_, engine_kind_, &is_canary_protected_); |
410 | if (!md_padded) md_padded = md_; |
411 | |
412 | switch (memory_kind) { |
413 | case memory_kind_ext_t::usm: |
414 | case memory_kind_ext_t::buffer: { |
415 | dnnl_sycl_interop_memory_kind_t mem_kind |
416 | = (memory_kind == memory_kind_ext_t::usm |
417 | ? dnnl_sycl_interop_usm |
418 | : dnnl_sycl_interop_buffer); |
419 | DNN_SAFE(dnnl_sycl_interop_memory_create(&m_padded_, md_padded, |
420 | engine_, mem_kind, handle_info.ptr), |
421 | CRIT); |
422 | SAFE(init_memory(&m_, md_, m_padded_), CRIT); |
423 | break; |
424 | } |
425 | case memory_kind_ext_t::usm_device: |
426 | case memory_kind_ext_t::usm_shared: { |
427 | SAFE(handle_info.is_allocate() ? OK : FAIL, CRIT); |
428 | is_data_owner_ = true; |
429 | size_t sz = dnnl_memory_desc_get_size(md_padded); |
430 | auto eng = dnnl::engine(engine_, true); |
431 | auto dev = dnnl::sycl_interop::get_device(eng); |
432 | auto ctx = dnnl::sycl_interop::get_context(eng); |
433 | if (memory_kind == memory_kind_ext_t::usm_device) { |
434 | data_ = ::sycl::malloc_device(sz, dev, ctx); |
435 | } else { |
436 | data_ = ::sycl::malloc_shared(sz, dev, ctx); |
437 | } |
438 | DNN_SAFE((sz > 0 && !data_) ? dnnl_out_of_memory : dnnl_success, |
439 | CRIT); |
440 | DNN_SAFE(dnnl_sycl_interop_memory_create(&m_padded_, md_padded, |
441 | engine_, dnnl_sycl_interop_usm, data_), |
442 | CRIT); |
443 | SAFE(init_memory(&m_, md_, m_padded_), CRIT); |
444 | break; |
445 | } |
446 | default: assert(!"not expected" ); |
447 | } |
448 | if (md_padded != md_) DNN_SAFE(dnnl_memory_desc_destroy(md_padded), CRIT); |
449 | |
450 | #else |
451 | (void)handle_info; |
452 | #endif |
453 | return OK; |
454 | } |
455 | |
456 | int dnn_mem_t::initialize_memory_create_opencl( |
457 | const handle_info_t &handle_info) { |
458 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
459 | if (handle_info.is_host_ptr) { |
460 | // Ignore memory_kind with host pointers and force USM. |
461 | DNN_SAFE(dnnl_ocl_interop_memory_create(&m_, md_, engine_, |
462 | dnnl_ocl_interop_usm, handle_info.ptr), |
463 | CRIT); |
464 | return OK; |
465 | } |
466 | |
467 | SAFE(handle_info.is_allocate() ? OK : FAIL, CRIT); |
468 | |
469 | auto md_padded = pad_memory_desc(md_, engine_kind_, &is_canary_protected_); |
470 | if (!md_padded) md_padded = md_; |
471 | |
472 | switch (memory_kind) { |
473 | case memory_kind_ext_t::usm: |
474 | case memory_kind_ext_t::buffer: { |
475 | dnnl_ocl_interop_memory_kind_t mem_kind |
476 | = (memory_kind == memory_kind_ext_t::usm |
477 | ? dnnl_ocl_interop_usm |
478 | : dnnl_ocl_interop_buffer); |
479 | DNN_SAFE(dnnl_ocl_interop_memory_create(&m_padded_, md_padded, |
480 | engine_, mem_kind, handle_info.ptr), |
481 | CRIT); |
482 | SAFE(init_memory(&m_, md_, m_padded_), CRIT); |
483 | break; |
484 | } |
485 | case memory_kind_ext_t::usm_device: |
486 | case memory_kind_ext_t::usm_shared: { |
487 | is_data_owner_ = true; |
488 | size_t sz = dnnl_memory_desc_get_size(md_padded); |
489 | if (memory_kind == memory_kind_ext_t::usm_device) { |
490 | data_ = dnnl::impl::gpu::ocl::usm::malloc_device(engine_, sz); |
491 | } else { |
492 | data_ = dnnl::impl::gpu::ocl::usm::malloc_shared(engine_, sz); |
493 | } |
494 | DNN_SAFE((sz > 0 && !data_) ? dnnl_out_of_memory : dnnl_success, |
495 | CRIT); |
496 | DNN_SAFE(dnnl_ocl_interop_memory_create(&m_padded_, md_padded, |
497 | engine_, dnnl_ocl_interop_usm, data_), |
498 | CRIT); |
499 | SAFE(init_memory(&m_, md_, m_padded_), CRIT); |
500 | break; |
501 | } |
502 | default: assert(!"not expected" ); |
503 | } |
504 | if (md_padded != md_) DNN_SAFE(dnnl_memory_desc_destroy(md_padded), CRIT); |
505 | #else |
506 | (void)handle_info; |
507 | #endif |
508 | return OK; |
509 | } |
510 | |
511 | int dnn_mem_t::initialize_memory_create(const handle_info_t &handle_info) { |
512 | bool is_sycl = is_sycl_engine(engine_); |
513 | bool is_opencl = is_opencl_engine(engine_); |
514 | |
515 | if (handle_info.is_host_ptr) { |
        // A host pointer can be used with CPU memory only.
        // XXX: the assumption is that SYCL can work with native host
        // pointers.
518 | SAFE(is_cpu(engine_) ? OK : FAIL, CRIT); |
519 | } |
520 | |
521 | if (is_cpu(engine_) && handle_info.is_allocate() && !is_sycl) { |
522 | // Allocate memory for native runtime directly. |
523 | is_data_owner_ = true; |
524 | const size_t alignment = 2 * 1024 * 1024; |
525 | size_t sz = dnnl_memory_desc_get_size(md_); |
526 | data_ = zmalloc(sz, alignment); |
527 | DNN_SAFE(!data_ ? dnnl_out_of_memory : dnnl_success, CRIT); |
528 | DNN_SAFE(dnnl_memory_create(&m_, md_, engine_, data_), CRIT); |
529 | } else if (is_sycl) { |
530 | SAFE(initialize_memory_create_sycl(handle_info), CRIT); |
531 | } else if (is_opencl) { |
532 | SAFE(initialize_memory_create_opencl(handle_info), CRIT); |
533 | } else { |
534 | is_data_owner_ = false; |
535 | data_ = nullptr; |
536 | DNN_SAFE(dnnl_memory_create(&m_, md_, engine_, handle_info.ptr), CRIT); |
537 | } |
538 | return OK; |
539 | } |
540 | |
541 | int dnn_mem_t::initialize( |
542 | dnnl_engine_t engine, const handle_info_t &handle_info) { |
543 | is_mapped_ = false; |
544 | engine_ = engine; |
545 | engine_kind_ = query_engine_kind(engine_); |
546 | |
547 | SAFE(initialize_memory_create(handle_info), CRIT); |
548 | |
549 | size_t sz = dnnl_memory_desc_get_size(md_); |
550 | if (is_canary_protected_) sz = pad_memory_size(sz, engine_kind_); |
551 | |
    // Do not fill the memory if its size is zero. Moreover, memset expects a
    // defined pointer; nullptr is not allowed.
554 | if (sz != 0 && handle_info.is_allocate()) { |
        // Fill the memory with a magic number (NaN for fp data types) to
        // catch possible uninitialized accesses.
557 | map(); |
558 | ::memset(mapped_ptr_, dnnl_mem_default_value, sz); |
559 | unmap(); |
560 | } |
561 | |
    // Keep the memory mapped; unmap it only right before execution.
563 | map(); |
564 | |
565 | return OK; |
566 | } |
567 | |
568 | static int cleanup_sycl(const dnnl_engine_t &engine, void *data) { |
569 | #ifdef DNNL_WITH_SYCL |
570 | switch (memory_kind) { |
571 | case memory_kind_ext_t::usm_device: |
572 | case memory_kind_ext_t::usm_shared: { |
573 | auto eng = dnnl::engine(engine, true); |
574 | auto ctx = dnnl::sycl_interop::get_context(eng); |
575 | ::sycl::free(data, ctx); |
576 | break; |
577 | } |
578 | default: break; |
579 | } |
580 | #endif |
581 | return OK; |
582 | } |
583 | |
584 | static int cleanup_opencl(const dnnl_engine_t &engine, void *data) { |
585 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
586 | switch (memory_kind) { |
587 | case memory_kind_ext_t::usm_device: |
588 | case memory_kind_ext_t::usm_shared: |
589 | dnnl::impl::gpu::ocl::usm::free(engine, data); |
590 | break; |
591 | default: break; |
592 | } |
593 | #endif |
594 | return OK; |
595 | } |
596 | |
597 | int dnn_mem_t::cleanup() { |
598 | if (!active_) return OK; |
599 | unmap(); |
600 | DNN_SAFE(dnnl_memory_desc_destroy(md_), CRIT); |
601 | DNN_SAFE(dnnl_memory_destroy(m_), CRIT); |
602 | if (is_data_owner_) { |
603 | if (is_sycl_engine(engine_)) { |
604 | SAFE(cleanup_sycl(engine_, data_), CRIT); |
605 | } else if (is_opencl_engine(engine_)) { |
606 | SAFE(cleanup_opencl(engine_, data_), CRIT); |
607 | } else { |
608 | zfree(data_); |
609 | } |
610 | } |
611 | DNN_SAFE(dnnl_memory_destroy(m_padded_), CRIT); |
612 | return OK; |
613 | } |
614 | |
615 | void dnn_mem_t::set_dt(dnnl_data_type_t dt) const { |
616 | // NOLINTNEXTLINE(readability-make-member-function-const) |
617 | dnnl_memory_desc_set_data_type(md_, dt); |
618 | } |
619 | |
// Queries from the memory descriptor.
621 | int dnn_mem_t::ndims() const { |
622 | return query_md_ndims(md_); |
623 | } |
624 | |
// Can't merge the two functions below because the compiler doesn't like the
// conversion from pointer to reference type.
627 | const dnnl_dims_t &dnn_mem_t::dims() const { |
628 | return query_md_dims(md_); |
629 | } |
630 | |
631 | const dnnl_dims_t &dnn_mem_t::padded_dims() const { |
632 | return query_md_padded_dims(md_); |
633 | } |
634 | |
635 | dnnl_data_type_t dnn_mem_t::dt() const { |
636 | return query_md_data_type(md_); |
637 | } |
638 | |
639 | const dnnl_dims_t &dnn_mem_t::padded_offsets() const { |
640 | return query_md_padded_offsets(md_); |
641 | } |
642 | |
643 | dnnl_dim_t dnn_mem_t::offset0() const { |
644 | return query_md_submemory_offset(md_); |
645 | } |
646 | |
647 | dnnl_format_kind_t dnn_mem_t::format_kind() const { |
648 | return query_md_format_kind(md_); |
649 | } |
650 | |
651 | const dnnl_dims_t &dnn_mem_t::strides() const { |
652 | return query_md_strides(md_); |
653 | } |
654 | |
655 | int dnn_mem_t::inner_nblks() const { |
656 | return query_md_inner_nblks(md_); |
657 | } |
658 | |
659 | const dnnl_dims_t &dnn_mem_t::inner_blks() const { |
660 | return query_md_inner_blks(md_); |
661 | } |
662 | |
663 | const dnnl_dims_t &dnn_mem_t::inner_idxs() const { |
664 | return query_md_inner_idxs(md_); |
665 | } |
666 | |
// Returns the physical offset for a given logical offset. The logical offset
// is represented by the scalar l_offset. If is_pos_padded is true, l_offset
// represents a logical offset in the already padded area.
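//
// Example (illustrative): for dims {2, 3, 4} and l_offset = 17, the loop
// below computes pos = {1, 1, 1} (since 17 == ((1 * 3) + 1) * 4 + 1), which
// md_off_v() then translates into a physical offset.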
670 | static dnnl_dim_t md_off_l(dnnl_dims_t _pos, const dnn_mem_t &mem, |
671 | dnnl_dim_t l_offset, bool is_pos_padded = false) { |
672 | dnnl_dims_t pos; |
673 | const auto &_dims = is_pos_padded ? mem.padded_dims() : mem.dims(); |
674 | for (int rd = 0; rd < mem.ndims(); ++rd) { |
675 | const int d = mem.ndims() - 1 - rd; |
676 | const dnnl_dim_t cur_dim = _dims[d]; |
677 | pos[d] = l_offset % cur_dim; |
678 | if (_pos) _pos[d] = pos[d]; |
679 | l_offset /= cur_dim; |
680 | } |
681 | return md_off_v(mem, pos, is_pos_padded); |
682 | } |
683 | |
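// Checks that the padded area of `mem` contains only zeroes. For each
// dimension d with dims[d] < pdims[d], the tensor is viewed as a 3D volume
// (dim_l, pdims[d], dim_r), and only the slices with index m >= dims[d],
// i.e. the padding, are inspected.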
684 | template <typename T> |
685 | static int check_zero_padding_impl( |
686 | const dnn_mem_t &mem, int arg, res_t *res, int *error_count) { |
687 | const int ndims = mem.ndims(); |
688 | const auto &dims = mem.dims(); |
689 | const auto &pdims = mem.padded_dims(); |
690 | |
691 | if (ndims == 0) return OK; |
692 | if (mem.format_kind() != dnnl_blocked) return OK; |
693 | |
694 | auto product = [](const dnnl_dim_t *beg, const dnnl_dim_t *end) { |
695 | return std::accumulate( |
696 | beg, end, (dnnl_dim_t)1, std::multiplies<dnnl_dim_t>()); |
697 | }; |
698 | |
699 | int errors = 0; |
700 | std::atomic<int> ok(true); |
701 | |
702 | const T *mem_ptr = (const T *)mem; |
703 | |
704 | for (int dim_m_idx = 0; dim_m_idx < ndims; ++dim_m_idx) { |
705 | if (dims[dim_m_idx] == pdims[dim_m_idx]) continue; |
706 | |
707 | auto dim_l = product(pdims, pdims + dim_m_idx); |
708 | auto dim_r = product(pdims + dim_m_idx + 1, pdims + ndims); |
709 | |
710 | benchdnn_parallel_nd(dim_l, dim_r, [&](dnnl_dim_t l, dnnl_dim_t r) { |
711 | for (dnnl_dim_t m = dims[dim_m_idx]; m < pdims[dim_m_idx]; ++m) { |
712 | auto l_idx = (l * pdims[dim_m_idx] + m) * dim_r + r; |
713 | auto idx = md_off_l(nullptr, mem, l_idx, true); |
714 | if (!(mem_ptr[idx] == 0)) ok = false; |
715 | } |
716 | }); |
717 | |
718 | // Run the check one more time to report incorrect elements. This check |
719 | // is sequential. |
720 | if (!ok) { |
721 | for_(dnnl_dim_t l = 0; l < dim_l; ++l) |
722 | for_(dnnl_dim_t m = dims[dim_m_idx]; m < pdims[dim_m_idx]; ++m) |
723 | for (dnnl_dim_t r = 0; r < dim_r; ++r) { |
724 | auto l_idx = (l * pdims[dim_m_idx] + m) * dim_r + r; |
725 | dnnl_dims_t pos = {}; |
726 | auto idx = md_off_l(pos, mem, l_idx, true); |
727 | |
728 | bool idx_ok = (mem_ptr[idx] == 0); |
729 | if (!idx_ok) errors++; |
730 | |
731 | const bool dump = (!idx_ok && (errors < 10 || verbose >= 10)) |
732 | || (verbose >= 99); |
733 | if (dump) { |
734 | BENCHDNN_PRINT(0, |
735 | "[%4ld][arg:%d]" |
736 | "[" IFMT "," IFMT "," IFMT "," IFMT "," IFMT |
737 | "," IFMT "] fp: 0.f dt:% 9.6g \n" , |
738 | (long)idx, arg, pos[0], pos[1], pos[2], pos[3], |
739 | pos[4], pos[5], mem.get_elem(idx)); |
740 | } |
741 | } |
742 | } |
743 | } |
744 | |
745 | if (!ok) { |
746 | BENCHDNN_PRINT(0, "@@@ [arg:%d] check_zero_padding failed\n" , arg); |
747 | if (res) res->state = FAILED; |
748 | } |
749 | |
750 | if (error_count != nullptr) *error_count = errors; |
751 | |
752 | return ok ? OK : FAIL; |
753 | } |
754 | |
755 | int check_zero_padding( |
756 | const dnn_mem_t &mem, int arg, res_t *res, int *error_count) { |
757 | #define CASE(dt, type) \ |
758 | case dt: return check_zero_padding_impl<type>(mem, arg, res, error_count); |
759 | |
760 | switch (mem.dt()) { |
761 | case dnnl_data_type_undef: |
762 | return OK; |
763 | |
764 | CASE(dnnl_bf16, bfloat16_t); |
765 | CASE(dnnl_f16, float16_t); |
766 | CASE(dnnl_f32, float); |
767 | CASE(dnnl_f64, double); |
768 | CASE(dnnl_s32, int32_t); |
769 | CASE(dnnl_s8, int8_t); |
770 | CASE(dnnl_u8, uint8_t); |
771 | |
772 | default: assert(!"bad data_type" ); |
773 | }; |
774 | #undef CASE |
775 | |
776 | return FAIL; |
777 | } |
778 | |
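// Checks that the canary bytes appended past the end of the buffer (see
// pad_memory_size()) still hold dnnl_mem_default_value, i.e. that the
// library did not write out of bounds.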
779 | int check_buffer_overwrite(const dnn_mem_t &mem, int arg, res_t *res) { |
780 | if (!mem.is_canary_protected()) return OK; |
781 | |
782 | size_t sz = mem.size(); |
783 | size_t sz_padded = dnn_mem_t::pad_memory_size(sz, mem.engine_kind()); |
784 | |
785 | auto *mem_ptr = (const uint8_t *)mem; |
786 | for (size_t i = sz; i < sz_padded; i++) { |
787 | if (mem_ptr[i] == dnnl_mem_default_value) continue; |
788 | |
789 | BENCHDNN_PRINT(0, |
790 | "@@@ [arg:%d] check_buffer_overwrite failed. Expected: %d at " |
791 | "byte: %lld but found: %d\n" , |
792 | arg, dnnl_mem_default_value, (long long)i, mem_ptr[i]); |
793 | if (res) res->state = FAILED; |
794 | return FAIL; |
795 | } |
796 | return OK; |
797 | } |
798 | |
// Returns the physical offset for a given logical position. The logical
// position is represented by the array pos. If is_pos_padded is true, pos
// represents a position in the already padded area.
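//
// Example (illustrative): for a 4D memory in an nChw16c-like layout
// (inner_nblks = 1, inner_idxs = {1}, inner_blks = {16}), the physical
// offset of pos = {n, c, h, w} is
//     offset0() + (c % 16) + n * strides()[0] + (c / 16) * strides()[1]
//               + h * strides()[2] + w * strides()[3].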
802 | dnnl_dim_t md_off_v( |
803 | const dnn_mem_t &mem, const dnnl_dims_t pos, bool is_pos_padded) { |
804 | assert(mem.format_kind() == dnnl_blocked); |
805 | |
806 | dnnl_dims_t pos_copy = {0}; |
807 | for (int d = 0; d < mem.ndims(); ++d) |
808 | pos_copy[d] = pos[d] + (is_pos_padded ? 0 : mem.padded_offsets()[d]); |
809 | |
810 | dnnl_dim_t phys_offset = mem.offset0(); |
811 | |
812 | const int nblks = mem.inner_nblks(); |
813 | if (nblks > 0) { |
814 | const auto &inner_idxs = mem.inner_idxs(); |
815 | const auto &inner_blks = mem.inner_blks(); |
816 | dnnl_dim_t blk_stride = 1; |
817 | for (int iblk = nblks - 1; iblk >= 0; --iblk) { |
818 | const int d = inner_idxs[iblk]; |
819 | |
820 | dnnl_dim_t p = pos_copy[d] % inner_blks[iblk]; |
821 | pos_copy[d] /= inner_blks[iblk]; |
822 | |
823 | phys_offset += p * blk_stride; |
824 | blk_stride *= inner_blks[iblk]; |
825 | } |
826 | } |
827 | |
828 | for (int d = 0; d < mem.ndims(); ++d) { |
829 | const dnnl_dim_t p = pos_copy[d]; |
830 | phys_offset += p * mem.strides()[d]; |
831 | } |
832 | |
833 | return phys_offset; |
834 | } |
835 | |
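// Clones a memory descriptor, returning nullptr on failure instead of
// aborting.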
836 | dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md) { |
837 | dnnl_memory_desc_t cloned_md; |
838 | auto status = dnnl_memory_desc_clone(&cloned_md, md); |
839 | if (status != dnnl_success) return nullptr; |
840 | return cloned_md; |
841 | } |
842 | |