/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <atomic>
#include <cctype>
#include <memory>
#include <numeric>
#include <string>

#include "oneapi/dnnl/dnnl.h"

#ifdef DNNL_WITH_SYCL
#include "oneapi/dnnl/dnnl_sycl.hpp"
#endif

#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "oneapi/dnnl/dnnl_ocl.hpp"
#include "src/gpu/ocl/ocl_usm_utils.hpp"
#endif

#include "tests/test_thread.hpp"

#include "dnn_types.hpp"
#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"
#include "utils/dnnl_query.hpp"
#include "utils/parallel.hpp"

extern "C" dnnl_status_t dnnl_memory_desc_create_with_string_tag(
        dnnl_memory_desc_t *, int, const dnnl_dims_t, dnnl_data_type_t,
        const char *);

extern "C" dnnl_status_t dnnl_memory_desc_set_data_type(
        dnnl_memory_desc_t memory_desc, dnnl_data_type_t data_type);

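// The constructors below clone or build a memory descriptor and then allocate
// backing memory on the given engine via initialize(); `active_` is set only
// when both steps succeed.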
dnn_mem_t::dnn_mem_t(const_dnnl_memory_desc_t md, dnnl_engine_t engine,
        const handle_info_t &handle_info) {
    if (query_md_ndims(md) > 0) {
        auto status = dnnl_memory_desc_clone(&md_, md);
        (void)status;
        assert(status == dnnl_success);
        active_ = (initialize(engine, handle_info) == OK);
    }
}

dnn_mem_t::dnn_mem_t(const_dnnl_memory_desc_t md, dnnl_data_type_t dt,
        const std::string &tag, dnnl_engine_t engine) {
    const int ndims = query_md_ndims(md);
    if (ndims > 0) {
        auto md_wrapper = dnn_mem_t::init_md(ndims, query_md_dims(md), dt, tag);
        md_ = md_wrapper.release();
        active_ = (initialize(engine) == OK);
    }
}

dnn_mem_t::dnn_mem_t(int ndims, const dnnl_dims_t dims, dnnl_data_type_t dt,
        const std::string &tag, dnnl_engine_t engine) {
    if (ndims > 0) {
        auto md_wrapper = dnn_mem_t::init_md(ndims, dims, dt, tag);
        md_ = md_wrapper.release();
        active_ = (initialize(engine) == OK);
    }
}

dnn_mem_t::dnn_mem_t(int ndims, const dnnl_dims_t dims, dnnl_data_type_t dt,
        const dnnl_dims_t strides, dnnl_engine_t engine) {
    if (ndims > 0) {
        auto status = dnnl_memory_desc_create_with_strides(
                &md_, ndims, dims, dt, strides);
        (void)status;
        assert(status == dnnl_success);
        active_ = (initialize(engine) == OK);
    }
}

dnn_mem_t::dnn_mem_t(const dnn_mem_t &rhs, dnnl_data_type_t dt,
        const std::string &tag, dnnl_engine_t engine)
    : dnn_mem_t(rhs.md_, dt, tag, engine) {
    if (active_) reorder(rhs);
}

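// Executes a reorder from `src` to `dst` with optional primitive attributes.
// When possible, CPU <-> GPU reorders are rerouted through a CPU reorder on
// mapped pointers to avoid building GPU kernels (see the comment inside).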
int execute_reorder(const dnn_mem_t &src, dnn_mem_t &dst,
        const_dnnl_primitive_attr_t attr) {
    std::shared_ptr<const dnn_mem_t> r_src(&src, [](const dnn_mem_t *) {});
    std::shared_ptr<dnn_mem_t> r_dst(&dst, [](dnn_mem_t *) {});

    dnnl_primitive_desc_t r_pd_ {};
    dnnl_primitive_t prim_ {};

    // Optimization to reduce testing time for GPU.
    //
    // For CPU <-> GPU reorders, the library creates GPU-side kernels.
    // Benchdnn heavily relies on reorders, and this greatly increases
    // execution time because of the big overhead of building OpenCL kernels.
    //
    // First, try to create a CPU reorder for the requested GPU reorder. If
    // that succeeds, create CPU memory objects wrapping the mapped pointers
    // of source and destination and execute the CPU reorder. If the CPU
    // reorder can't be created, just execute a regular GPU reorder.
    //
    // This optimization is skipped when testing the reorder, sum and concat
    // primitives because they are used specifically to test GPU reorders.
#if ((DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL) \
        || (DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL)) \
        && DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    bool is_reorder_related_driver = (driver_name == "reorder"
            || driver_name == "sum" || driver_name == "concat");
    const auto &cpu_engine = get_cpu_engine();
    if (!is_reorder_related_driver
            && (src.engine_kind() == dnnl_gpu
                    || dst.engine_kind() == dnnl_gpu)) {

        dnnl_status_t status = dnnl_reorder_primitive_desc_create(
                &r_pd_, src.md_, cpu_engine, dst.md_, cpu_engine, attr);
        if (status == dnnl_success) {
            // Create CPU memory objects wrapping mapped pointers of source
            // and destination.
            r_src = std::make_shared<dnn_mem_t>(dnn_mem_t::create_from_host_ptr(
                    src.md_, cpu_engine, (void *)src));
            r_dst = std::make_shared<dnn_mem_t>(dnn_mem_t::create_from_host_ptr(
                    dst.md_, cpu_engine, (void *)dst));
        }
    }
#endif

    if (!r_pd_) {
        DNN_SAFE(dnnl_reorder_primitive_desc_create(&r_pd_, src.md_,
                         src.engine(), dst.md_, dst.engine(), attr),
                WARN);
    }
    auto r_pd = make_benchdnn_dnnl_wrapper(r_pd_);
    const auto &scratchpad_md = query_md(r_pd, DNNL_ARG_SCRATCHPAD);
    dnn_mem_t scratchpad(scratchpad_md, src.engine());

    DNN_SAFE(dnnl_primitive_create(&prim_, r_pd), CRIT);
    auto prim = make_benchdnn_dnnl_wrapper(prim_);

    args_t args;
    args.set(DNNL_ARG_FROM, *r_src);
    args.set(DNNL_ARG_TO, *r_dst);
    args.set(DNNL_ARG_SCRATCHPAD, scratchpad);

    return execute_and_wait(prim, args);
}
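
// Reorders the data from `rhs` into this memory object, optionally applying
// primitive attributes `attr`.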
int dnn_mem_t::reorder(const dnn_mem_t &rhs, const_dnnl_primitive_attr_t attr) {
    if (this == &rhs) return OK;
    return execute_reorder(rhs, *this, attr);
}

size_t dnn_mem_t::size() const {
    return dnnl_memory_desc_get_size(md_);
}

size_t dnn_mem_t::sizeof_dt() const {
    return dnnl_data_type_size(dt());
}

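// Reads the element at flat (physical) index `idx` from the underlying buffer
// and converts it to float; set_elem() below performs the reverse conversion
// on store.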
float dnn_mem_t::get_elem(int64_t idx) const {
    void *data = (void *)*this;
    float elem = 0.0;
    switch (dt()) {
        case dnnl_s8: elem = static_cast<int8_t *>(data)[idx]; break;
        case dnnl_u8: elem = static_cast<uint8_t *>(data)[idx]; break;
        case dnnl_s32: elem = static_cast<int32_t *>(data)[idx]; break;
        case dnnl_f32: elem = static_cast<float *>(data)[idx]; break;
        case dnnl_f64: elem = static_cast<double *>(data)[idx]; break;
        case dnnl_f16: elem = static_cast<float16_t *>(data)[idx]; break;
        case dnnl_bf16: elem = static_cast<bfloat16_t *>(data)[idx]; break;
        default: assert(!"bad data type");
    }
    return elem;
}

void dnn_mem_t::set_elem(int64_t idx, float value) const {
    void *data = (void *)*this;
    switch (dt()) {
        case dnnl_s8: ((int8_t *)data)[idx] = value; break;
        case dnnl_u8: ((uint8_t *)data)[idx] = value; break;
        case dnnl_s32: ((int32_t *)data)[idx] = value; break;
        case dnnl_f32: ((float *)data)[idx] = value; break;
        case dnnl_f64: ((double *)data)[idx] = value; break;
        case dnnl_f16: ((float16_t *)data)[idx] = value; break;
        case dnnl_bf16: ((bfloat16_t *)data)[idx] = value; break;
        default: assert(!"bad data type");
    }
}

// Creates a memory object from the underlying buffer of an existing memory
// object `mem`. The size of `mem` must not be less than the size of `md`.
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL || defined(DNNL_WITH_SYCL)
static int init_memory(
        dnnl_memory_t *ret, const dnnl_memory_desc_t &md, dnnl_memory_t mem) {
    void *handle;
    DNN_SAFE(dnnl_memory_get_data_handle(mem, &handle), CRIT);

    dnnl_engine_t engine;
    DNN_SAFE(dnnl_memory_get_engine(mem, &engine), CRIT);

    bool is_sycl = is_sycl_engine(engine);
    bool is_opencl = is_opencl_engine(engine);

    *ret = nullptr;

    if (is_opencl) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        dnnl_ocl_interop_memory_kind_t mem_kind;
        DNN_SAFE(dnnl_ocl_interop_memory_get_memory_kind(mem, &mem_kind), CRIT);
        DNN_SAFE(dnnl_ocl_interop_memory_create(
                         ret, md, engine, mem_kind, handle),
                CRIT);
#endif
    } else if (is_sycl) {
#ifdef DNNL_WITH_SYCL
        dnnl_sycl_interop_memory_kind_t mem_kind;
        DNN_SAFE(
                dnnl_sycl_interop_memory_get_memory_kind(mem, &mem_kind), CRIT);
        DNN_SAFE(dnnl_sycl_interop_memory_create(
                         ret, md, engine, mem_kind, handle),
                CRIT);
#endif
    }

    // The memory must have been created by one of the branches above at this
    // point.
    if (!*ret) assert(!"not expected");

    return OK;
}
#endif

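// Maps the underlying memory (the padded object, if present) to a
// host-accessible pointer stored in mapped_ptr_; unmap() below releases the
// mapping and resets the cached pointer.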
void dnn_mem_t::map() const {
    assert(!is_mapped_ && "memory is already mapped");
    is_mapped_ = true;

    if (!m_) return;
    auto mem = m_padded_ ? m_padded_ : m_;
    DNN_SAFE_V(dnnl_memory_map_data(mem, &mapped_ptr_));
}

void dnn_mem_t::unmap() const {
    assert(is_mapped_ && "memory is not mapped");
    is_mapped_ = false;

    if (!m_) return;
    auto mem = m_padded_ ? m_padded_ : m_;
    DNN_SAFE_V(dnnl_memory_unmap_data(mem, mapped_ptr_));
    mapped_ptr_ = nullptr;
}

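// Fills the first `size` bytes of the underlying storage with `value`,
// dispatching to the OpenCL, SYCL, or native CPU path depending on the engine
// and memory kind.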
void dnn_mem_t::memset(int value, size_t size) const {
    bool is_opencl = is_opencl_engine(engine_);
    bool is_sycl = is_sycl_engine(engine_);
    auto mem = m_padded_ ? m_padded_ : m_;
    void *mem_handle;
    DNN_SAFE_V(dnnl_memory_get_data_handle(mem, &mem_handle));
    if (is_opencl) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        stream_t stream(engine_);
        switch (memory_kind) {
            case memory_kind_ext_t::buffer: {
                auto buf = static_cast<cl_mem>(mem_handle);
                cl_command_queue queue;
                DNN_SAFE_V(dnnl_ocl_interop_stream_get_command_queue(
                        stream, &queue));
                cl_int err = clEnqueueFillBuffer(queue, buf, &value,
                        sizeof(uint8_t), 0, size, 0, nullptr, nullptr);
                if (err != CL_SUCCESS) SAFE_V(FAIL);
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
            case memory_kind_ext_t::usm:
            case memory_kind_ext_t::usm_device:
            case memory_kind_ext_t::usm_shared: {
                DNN_SAFE_V(dnnl::impl::gpu::ocl::usm::memset(
                        stream, mem_handle, value, size));
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
        }
#endif
    } else if (is_sycl) {
#ifdef DNNL_WITH_SYCL
        stream_t stream(engine_);
        void *queue_ptr;
        DNN_SAFE_V(dnnl_sycl_interop_stream_get_queue(stream, &queue_ptr));
        auto &queue = *static_cast<::sycl::queue *>(queue_ptr);
        switch (memory_kind) {
            case memory_kind_ext_t::buffer: {
                auto &buf = *static_cast<::sycl::buffer<uint8_t, 1> *>(
                        mem_handle);
                queue.submit([&](::sycl::handler &cgh) {
#ifdef DNNL_SYCL_INTEROP_USE_SYCL121
                    constexpr auto target_device
                            = ::sycl::target::global_buffer;
#else
                    constexpr auto target_device = ::sycl::target::device;
#endif
                    ::sycl::accessor<uint8_t, 1, ::sycl::access::mode::write,
                            target_device>
                            acc(buf, cgh);
                    cgh.fill(acc, static_cast<uint8_t>(value));
                });
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
            case memory_kind_ext_t::usm:
            case memory_kind_ext_t::usm_device:
            case memory_kind_ext_t::usm_shared: {
                queue.submit([&](::sycl::handler &cgh) {
                    cgh.memset(mem_handle, value, size);
                });
                DNN_SAFE_V(dnnl_stream_wait(stream));
                return;
            }
        }
#endif
    }
    if (is_cpu(engine_)) {
        ::memset(mem_handle, value, size);
        return;
    }
    SAFE_V(FAIL);
}

dnn_mem_t dnn_mem_t::create_from_host_ptr(
        const dnnl_memory_desc_t &md, dnnl_engine_t engine, void *host_ptr) {
    return dnn_mem_t(md, engine, {true, host_ptr});
}

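// Pads the buffer size with extra "canary" bytes so that writes past the end
// of the payload can be detected later by check_buffer_overwrite(). Padding
// is applied only in correctness mode and only for non-CPU engines.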
size_t dnn_mem_t::pad_memory_size(
        size_t sz, dnnl_engine_kind_t engine_kind, bool *was_padded) {
    if (was_padded) *was_padded = false;
    if (sz == 0 || !is_bench_mode(CORR) || engine_kind == dnnl_cpu) return sz;

    const int pad_size = 4096;
    if (was_padded) *was_padded = true;
    return sz + pad_size;
}

dnnl_memory_desc_t dnn_mem_t::pad_memory_desc(const_dnnl_memory_desc_t md,
        dnnl_engine_kind_t engine_kind, bool *was_padded) {
    if (was_padded) *was_padded = false;
    size_t old_sz = dnnl_memory_desc_get_size(md);
    if (old_sz == 0 || !is_bench_mode(CORR) || engine_kind == dnnl_cpu)
        return nullptr;

    size_t sz = pad_memory_size(old_sz, engine_kind, was_padded);
    if (sz == old_sz) return nullptr;

    dnnl_memory_desc_t ret;
    dnnl_dims_t dims = {(dnnl_dim_t)sz};
    DNN_SAFE_V(
            dnnl_memory_desc_create_with_tag(&ret, 1, dims, dnnl_u8, dnnl_x));
    return ret;
}

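// Creates a memory descriptor from dims, data type and either a format tag
// string or explicit strides (strides take precedence over the tag).
// A hypothetical usage sketch, assuming a plain f32 "abcd" layout:
//     auto md = dnn_mem_t::init_md(4, dims, dnnl_f32, "abcd");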
benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> dnn_mem_t::init_md(int ndims,
        const dnnl_dims_t dims, dnnl_data_type_t data_type,
        const std::string &tag_, const dims_t &strides_) {
    dnnl_memory_desc_t md {};
    const bool use_strides = !strides_.empty();
    // Ignore tag_ in case strides_ are explicitly provided.
    if (use_strides) {
        std::vector<dnnl_dim_t> strides(strides_);
        DNN_SAFE_V(dnnl_memory_desc_create_with_strides(
                &md, ndims, dims, data_type, strides.data()));
        return md;
    }

    auto tag = normalize_tag(tag_, ndims);
    if (tag == tag::undef || tag == tag::any || ndims == 0) {
        dnnl_format_tag_t enum_tag = (tag == tag::undef || ndims == 0)
                ? dnnl_format_tag_undef
                : dnnl_format_tag_any;
        DNN_SAFE_V(dnnl_memory_desc_create_with_tag(
                &md, ndims, dims, data_type, enum_tag));
        return md;
    }

    DNN_SAFE_V(dnnl_memory_desc_create_with_string_tag(
            &md, ndims, dims, data_type, tag.data()));

    return md;
}

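// Creates the SYCL memory object(s) for this dnn_mem_t honoring the global
// `memory_kind` setting (buffer, usm, usm_device, usm_shared). For USM
// device/shared kinds the raw allocation is owned by this object.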
int dnn_mem_t::initialize_memory_create_sycl(const handle_info_t &handle_info) {
#ifdef DNNL_WITH_SYCL
    if (handle_info.is_host_ptr) {
        // Ignore memory_kind with host pointers and force USM.
        DNN_SAFE(dnnl_sycl_interop_memory_create(&m_, md_, engine_,
                         dnnl_sycl_interop_usm, handle_info.ptr),
                CRIT);
        return OK;
    }

    auto md_padded = pad_memory_desc(md_, engine_kind_, &is_canary_protected_);
    if (!md_padded) md_padded = md_;

    switch (memory_kind) {
        case memory_kind_ext_t::usm:
        case memory_kind_ext_t::buffer: {
            dnnl_sycl_interop_memory_kind_t mem_kind
                    = (memory_kind == memory_kind_ext_t::usm
                                    ? dnnl_sycl_interop_usm
                                    : dnnl_sycl_interop_buffer);
            DNN_SAFE(dnnl_sycl_interop_memory_create(&m_padded_, md_padded,
                             engine_, mem_kind, handle_info.ptr),
                    CRIT);
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared: {
            SAFE(handle_info.is_allocate() ? OK : FAIL, CRIT);
            is_data_owner_ = true;
            size_t sz = dnnl_memory_desc_get_size(md_padded);
            auto eng = dnnl::engine(engine_, true);
            auto dev = dnnl::sycl_interop::get_device(eng);
            auto ctx = dnnl::sycl_interop::get_context(eng);
            if (memory_kind == memory_kind_ext_t::usm_device) {
                data_ = ::sycl::malloc_device(sz, dev, ctx);
            } else {
                data_ = ::sycl::malloc_shared(sz, dev, ctx);
            }
            DNN_SAFE((sz > 0 && !data_) ? dnnl_out_of_memory : dnnl_success,
                    CRIT);
            DNN_SAFE(dnnl_sycl_interop_memory_create(&m_padded_, md_padded,
                             engine_, dnnl_sycl_interop_usm, data_),
                    CRIT);
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        default: assert(!"not expected");
    }
    if (md_padded != md_) DNN_SAFE(dnnl_memory_desc_destroy(md_padded), CRIT);

#else
    (void)handle_info;
#endif
    return OK;
}

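// OpenCL counterpart of the routine above: creates the memory object(s)
// honoring `memory_kind`, allocating USM device/shared storage when needed.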
int dnn_mem_t::initialize_memory_create_opencl(
        const handle_info_t &handle_info) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (handle_info.is_host_ptr) {
        // Ignore memory_kind with host pointers and force USM.
        DNN_SAFE(dnnl_ocl_interop_memory_create(&m_, md_, engine_,
                         dnnl_ocl_interop_usm, handle_info.ptr),
                CRIT);
        return OK;
    }

    SAFE(handle_info.is_allocate() ? OK : FAIL, CRIT);

    auto md_padded = pad_memory_desc(md_, engine_kind_, &is_canary_protected_);
    if (!md_padded) md_padded = md_;

    switch (memory_kind) {
        case memory_kind_ext_t::usm:
        case memory_kind_ext_t::buffer: {
            dnnl_ocl_interop_memory_kind_t mem_kind
                    = (memory_kind == memory_kind_ext_t::usm
                                    ? dnnl_ocl_interop_usm
                                    : dnnl_ocl_interop_buffer);
            DNN_SAFE(dnnl_ocl_interop_memory_create(&m_padded_, md_padded,
                             engine_, mem_kind, handle_info.ptr),
                    CRIT);
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared: {
            is_data_owner_ = true;
            size_t sz = dnnl_memory_desc_get_size(md_padded);
            if (memory_kind == memory_kind_ext_t::usm_device) {
                data_ = dnnl::impl::gpu::ocl::usm::malloc_device(engine_, sz);
            } else {
                data_ = dnnl::impl::gpu::ocl::usm::malloc_shared(engine_, sz);
            }
            DNN_SAFE((sz > 0 && !data_) ? dnnl_out_of_memory : dnnl_success,
                    CRIT);
            DNN_SAFE(dnnl_ocl_interop_memory_create(&m_padded_, md_padded,
                             engine_, dnnl_ocl_interop_usm, data_),
                    CRIT);
            SAFE(init_memory(&m_, md_, m_padded_), CRIT);
            break;
        }
        default: assert(!"not expected");
    }
    if (md_padded != md_) DNN_SAFE(dnnl_memory_desc_destroy(md_padded), CRIT);
#else
    (void)handle_info;
#endif
    return OK;
}

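// Dispatches memory object creation to the native CPU, SYCL, or OpenCL path
// based on the engine and the requested handle.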
int dnn_mem_t::initialize_memory_create(const handle_info_t &handle_info) {
    bool is_sycl = is_sycl_engine(engine_);
    bool is_opencl = is_opencl_engine(engine_);

    if (handle_info.is_host_ptr) {
        // Host pointers can be used with CPU memory only.
        // XXX: the assumption is that SYCL can work with native host pointers.
        SAFE(is_cpu(engine_) ? OK : FAIL, CRIT);
    }

    if (is_cpu(engine_) && handle_info.is_allocate() && !is_sycl) {
        // Allocate memory for the native runtime directly.
        is_data_owner_ = true;
        const size_t alignment = 2 * 1024 * 1024;
        size_t sz = dnnl_memory_desc_get_size(md_);
        data_ = zmalloc(sz, alignment);
        DNN_SAFE(!data_ ? dnnl_out_of_memory : dnnl_success, CRIT);
        DNN_SAFE(dnnl_memory_create(&m_, md_, engine_, data_), CRIT);
    } else if (is_sycl) {
        SAFE(initialize_memory_create_sycl(handle_info), CRIT);
    } else if (is_opencl) {
        SAFE(initialize_memory_create_opencl(handle_info), CRIT);
    } else {
        is_data_owner_ = false;
        data_ = nullptr;
        DNN_SAFE(dnnl_memory_create(&m_, md_, engine_, handle_info.ptr), CRIT);
    }
    return OK;
}

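// Finishes construction: creates the memory object(s), poisons the storage
// with a magic value to catch uninitialized accesses, and leaves the memory
// mapped until execution time.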
int dnn_mem_t::initialize(
        dnnl_engine_t engine, const handle_info_t &handle_info) {
    is_mapped_ = false;
    engine_ = engine;
    engine_kind_ = query_engine_kind(engine_);

    SAFE(initialize_memory_create(handle_info), CRIT);

    size_t sz = dnnl_memory_desc_get_size(md_);
    if (is_canary_protected_) sz = pad_memory_size(sz, engine_kind_);

    // Do not fill the memory if its size is zero. Moreover, memset expects a
    // defined pointer; nullptr is not allowed.
    if (sz != 0 && handle_info.is_allocate()) {
        // Fill memory with a magic number (NAN for fp data types) to catch
        // possible uninitialized access.
        map();
        ::memset(mapped_ptr_, dnnl_mem_default_value, sz);
        unmap();
    }

    // Keep the memory mapped and unmap it only before execution.
    map();

    return OK;
}

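// Frees USM allocations made by benchdnn for the SYCL and OpenCL runtimes;
// other memory kinds are released when the memory object itself is destroyed.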
static int cleanup_sycl(const dnnl_engine_t &engine, void *data) {
#ifdef DNNL_WITH_SYCL
    switch (memory_kind) {
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared: {
            auto eng = dnnl::engine(engine, true);
            auto ctx = dnnl::sycl_interop::get_context(eng);
            ::sycl::free(data, ctx);
            break;
        }
        default: break;
    }
#endif
    return OK;
}

static int cleanup_opencl(const dnnl_engine_t &engine, void *data) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    switch (memory_kind) {
        case memory_kind_ext_t::usm_device:
        case memory_kind_ext_t::usm_shared:
            dnnl::impl::gpu::ocl::usm::free(engine, data);
            break;
        default: break;
    }
#endif
    return OK;
}

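// Releases everything owned by this object: unmaps the memory, destroys the
// descriptor and memory objects, and frees any USM or host allocation made
// by benchdnn.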
int dnn_mem_t::cleanup() {
    if (!active_) return OK;
    unmap();
    DNN_SAFE(dnnl_memory_desc_destroy(md_), CRIT);
    DNN_SAFE(dnnl_memory_destroy(m_), CRIT);
    if (is_data_owner_) {
        if (is_sycl_engine(engine_)) {
            SAFE(cleanup_sycl(engine_, data_), CRIT);
        } else if (is_opencl_engine(engine_)) {
            SAFE(cleanup_opencl(engine_, data_), CRIT);
        } else {
            zfree(data_);
        }
    }
    DNN_SAFE(dnnl_memory_destroy(m_padded_), CRIT);
    return OK;
}

void dnn_mem_t::set_dt(dnnl_data_type_t dt) const {
    // NOLINTNEXTLINE(readability-make-member-function-const)
    dnnl_memory_desc_set_data_type(md_, dt);
}

// Queries from the memory descriptor.
int dnn_mem_t::ndims() const {
    return query_md_ndims(md_);
}

// Can't merge the two below because the compiler doesn't like the conversion
// from pointer to reference type.
const dnnl_dims_t &dnn_mem_t::dims() const {
    return query_md_dims(md_);
}

const dnnl_dims_t &dnn_mem_t::padded_dims() const {
    return query_md_padded_dims(md_);
}

dnnl_data_type_t dnn_mem_t::dt() const {
    return query_md_data_type(md_);
}

const dnnl_dims_t &dnn_mem_t::padded_offsets() const {
    return query_md_padded_offsets(md_);
}

dnnl_dim_t dnn_mem_t::offset0() const {
    return query_md_submemory_offset(md_);
}

dnnl_format_kind_t dnn_mem_t::format_kind() const {
    return query_md_format_kind(md_);
}

const dnnl_dims_t &dnn_mem_t::strides() const {
    return query_md_strides(md_);
}

int dnn_mem_t::inner_nblks() const {
    return query_md_inner_nblks(md_);
}

const dnnl_dims_t &dnn_mem_t::inner_blks() const {
    return query_md_inner_blks(md_);
}

const dnnl_dims_t &dnn_mem_t::inner_idxs() const {
    return query_md_inner_idxs(md_);
}

// Returns the physical offset for a given logical offset. The logical offset
// is represented by a scalar l_offset. If is_pos_padded is true, l_offset is
// a logical offset into the already padded area.
static dnnl_dim_t md_off_l(dnnl_dims_t _pos, const dnn_mem_t &mem,
        dnnl_dim_t l_offset, bool is_pos_padded = false) {
    dnnl_dims_t pos;
    const auto &_dims = is_pos_padded ? mem.padded_dims() : mem.dims();
    for (int rd = 0; rd < mem.ndims(); ++rd) {
        const int d = mem.ndims() - 1 - rd;
        const dnnl_dim_t cur_dim = _dims[d];
        pos[d] = l_offset % cur_dim;
        if (_pos) _pos[d] = pos[d];
        l_offset /= cur_dim;
    }
    return md_off_v(mem, pos, is_pos_padded);
}

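// Verifies that the padded area of `mem` (elements between dims and
// padded_dims) contains only zeros, reporting up to a few offending elements
// depending on the verbose level.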
template <typename T>
static int check_zero_padding_impl(
        const dnn_mem_t &mem, int arg, res_t *res, int *error_count) {
    const int ndims = mem.ndims();
    const auto &dims = mem.dims();
    const auto &pdims = mem.padded_dims();

    if (ndims == 0) return OK;
    if (mem.format_kind() != dnnl_blocked) return OK;

    auto product = [](const dnnl_dim_t *beg, const dnnl_dim_t *end) {
        return std::accumulate(
                beg, end, (dnnl_dim_t)1, std::multiplies<dnnl_dim_t>());
    };

    int errors = 0;
    std::atomic<int> ok(true);

    const T *mem_ptr = (const T *)mem;

    for (int dim_m_idx = 0; dim_m_idx < ndims; ++dim_m_idx) {
        if (dims[dim_m_idx] == pdims[dim_m_idx]) continue;

        auto dim_l = product(pdims, pdims + dim_m_idx);
        auto dim_r = product(pdims + dim_m_idx + 1, pdims + ndims);

        benchdnn_parallel_nd(dim_l, dim_r, [&](dnnl_dim_t l, dnnl_dim_t r) {
            for (dnnl_dim_t m = dims[dim_m_idx]; m < pdims[dim_m_idx]; ++m) {
                auto l_idx = (l * pdims[dim_m_idx] + m) * dim_r + r;
                auto idx = md_off_l(nullptr, mem, l_idx, true);
                if (!(mem_ptr[idx] == 0)) ok = false;
            }
        });

        // Run the check one more time to report incorrect elements. This
        // check is sequential.
        if (!ok) {
            for_(dnnl_dim_t l = 0; l < dim_l; ++l)
            for_(dnnl_dim_t m = dims[dim_m_idx]; m < pdims[dim_m_idx]; ++m)
            for (dnnl_dim_t r = 0; r < dim_r; ++r) {
                auto l_idx = (l * pdims[dim_m_idx] + m) * dim_r + r;
                dnnl_dims_t pos = {};
                auto idx = md_off_l(pos, mem, l_idx, true);

                bool idx_ok = (mem_ptr[idx] == 0);
                if (!idx_ok) errors++;

                const bool dump = (!idx_ok && (errors < 10 || verbose >= 10))
                        || (verbose >= 99);
                if (dump) {
                    BENCHDNN_PRINT(0,
                            "[%4ld][arg:%d]"
                            "[" IFMT "," IFMT "," IFMT "," IFMT "," IFMT
                            "," IFMT "] fp: 0.f dt:% 9.6g \n",
                            (long)idx, arg, pos[0], pos[1], pos[2], pos[3],
                            pos[4], pos[5], mem.get_elem(idx));
                }
            }
        }
    }

    if (!ok) {
        BENCHDNN_PRINT(0, "@@@ [arg:%d] check_zero_padding failed\n", arg);
        if (res) res->state = FAILED;
    }

    if (error_count != nullptr) *error_count = errors;

    return ok ? OK : FAIL;
}

int check_zero_padding(
        const dnn_mem_t &mem, int arg, res_t *res, int *error_count) {
#define CASE(dt, type) \
    case dt: return check_zero_padding_impl<type>(mem, arg, res, error_count);

    switch (mem.dt()) {
        case dnnl_data_type_undef:
            return OK;

        CASE(dnnl_bf16, bfloat16_t);
        CASE(dnnl_f16, float16_t);
        CASE(dnnl_f32, float);
        CASE(dnnl_f64, double);
        CASE(dnnl_s32, int32_t);
        CASE(dnnl_s8, int8_t);
        CASE(dnnl_u8, uint8_t);

        default: assert(!"bad data_type");
    };
#undef CASE

    return FAIL;
}

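// Verifies that the canary bytes appended by pad_memory_size() still hold the
// default fill value, i.e., that the library did not write past the end of
// the buffer.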
int check_buffer_overwrite(const dnn_mem_t &mem, int arg, res_t *res) {
    if (!mem.is_canary_protected()) return OK;

    size_t sz = mem.size();
    size_t sz_padded = dnn_mem_t::pad_memory_size(sz, mem.engine_kind());

    auto *mem_ptr = (const uint8_t *)mem;
    for (size_t i = sz; i < sz_padded; i++) {
        if (mem_ptr[i] == dnnl_mem_default_value) continue;

        BENCHDNN_PRINT(0,
                "@@@ [arg:%d] check_buffer_overwrite failed. Expected: %d at "
                "byte: %lld but found: %d\n",
                arg, dnnl_mem_default_value, (long long)i, mem_ptr[i]);
        if (res) res->state = FAILED;
        return FAIL;
    }
    return OK;
}

// Returns the physical offset for a given logical offset. The logical offset
// is represented by an array pos. If is_pos_padded is true, pos represents a
// position in the already padded area.
dnnl_dim_t md_off_v(
        const dnn_mem_t &mem, const dnnl_dims_t pos, bool is_pos_padded) {
    assert(mem.format_kind() == dnnl_blocked);

    dnnl_dims_t pos_copy = {0};
    for (int d = 0; d < mem.ndims(); ++d)
        pos_copy[d] = pos[d] + (is_pos_padded ? 0 : mem.padded_offsets()[d]);

    dnnl_dim_t phys_offset = mem.offset0();

    const int nblks = mem.inner_nblks();
    if (nblks > 0) {
        const auto &inner_idxs = mem.inner_idxs();
        const auto &inner_blks = mem.inner_blks();
        dnnl_dim_t blk_stride = 1;
        for (int iblk = nblks - 1; iblk >= 0; --iblk) {
            const int d = inner_idxs[iblk];

            dnnl_dim_t p = pos_copy[d] % inner_blks[iblk];
            pos_copy[d] /= inner_blks[iblk];

            phys_offset += p * blk_stride;
            blk_stride *= inner_blks[iblk];
        }
    }

    for (int d = 0; d < mem.ndims(); ++d) {
        const dnnl_dim_t p = pos_copy[d];
        phys_offset += p * mem.strides()[d];
    }

    return phys_offset;
}

dnnl_memory_desc_t clone_md(const_dnnl_memory_desc_t md) {
    dnnl_memory_desc_t cloned_md;
    auto status = dnnl_memory_desc_clone(&cloned_md, md);
    if (status != dnnl_success) return nullptr;
    return cloned_md;
}