/*******************************************************************************
* Copyright 2017-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm> // for std::reverse and std::copy
#include <climits> // for INT_MAX
#include <functional> // for std::bind and std::placeholders
#include <list>
#include <map> // for std::map
#include <string> // for std::string
#include <utility> // for std::pair
#include <vector> // for std::vector

#include <assert.h>

#include "oneapi/dnnl/dnnl.hpp"
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
#include "oneapi/dnnl/dnnl_ocl.hpp"
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
#include "oneapi/dnnl/dnnl_sycl.hpp"
#endif

#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include "oneapi/dnnl/dnnl_threadpool.h"
#endif

#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
#include "src/common/primitive_cache.hpp"
#endif

#include "cpu/platform.hpp"

#include "tests/test_thread.hpp"

#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"

#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
extern "C" dnnl_status_t dnnl_impl_gpu_set_profiling(int flag);
extern "C" dnnl_status_t dnnl_impl_gpu_reset_profiling();
extern "C" dnnl_status_t dnnl_impl_gpu_get_profile_info(
        uint64_t &time, double &freq, int mode);
#endif
int check_pd_cache(const_dnnl_primitive_desc_t pd) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    int capacity = 0;
    DNN_SAFE(dnnl_get_primitive_cache_capacity(&capacity), CRIT);
    if (capacity && !dnnl::impl::is_pd_in_cache(pd)) {
        BENCHDNN_PRINT(0, "error: %s\n",
                "primitive descriptor is expected to be fetched from "
                "the primitive cache");
        return FAIL;
    }
#endif
    return OK;
}

int check_primitive_cache(dnnl_primitive_t p) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    int capacity = 0;
    DNN_SAFE(dnnl_get_primitive_cache_capacity(&capacity), CRIT);
    if (capacity && !dnnl::impl::is_primitive_in_cache(p)) {
        BENCHDNN_PRINT(0, "error: %s\n",
                "primitive is expected to be fetched from the primitive "
                "cache");
        return FAIL;
    }
#endif
    return OK;
}

size_t set_primitive_cache_capacity_without_clearing(size_t capacity) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    return dnnl::impl::set_primitive_cache_capacity_without_clearing(capacity);
#endif
    return size_t(0);
}

int get_cache_blob_id(
        std::vector<uint8_t> &cache_blob_id, const_dnnl_primitive_desc_t pd) {
    dnnl_dim_t count;
    const uint8_t *c_id;
    DNN_SAFE(dnnl_primitive_desc_query(
                     pd, dnnl_query_cache_blob_id_size_s64, 0, (void *)&count),
            WARN);
    DNN_SAFE(dnnl_primitive_desc_query(
                     pd, dnnl_query_cache_blob_id, 0, (void **)&c_id),
            WARN);
    cache_blob_id = {c_id, c_id + count};
    return OK;
}

int get_cache_blob(std::vector<uint8_t> &cache_blob, dnnl_primitive_t prim) {
    size_t size = 0;
    DNN_SAFE(dnnl_primitive_get_cache_blob(prim, &size, nullptr), WARN);

    cache_blob.resize(size);
    DNN_SAFE(dnnl_primitive_get_cache_blob(prim, &size, cache_blob.data()),
            WARN);
    return OK;
}
struct lru_cache_t {
    lru_cache_t(size_t capacity) : capacity_(capacity) {}

    const std::vector<uint8_t> &get(const std::vector<uint8_t> &key) {
        auto it = cache_mapper_.find(key);
        if (it == cache_mapper_.end()) { return dummy_; }
        cache_list_.splice(cache_list_.begin(), cache_list_, it->second);
        return cache_list_.front().value_;
    }

    void add(const std::vector<uint8_t> &key,
            const std::vector<uint8_t> &value) {
        assert(!cache_mapper_.count(key));
        if (cache_mapper_.size() >= capacity_) {
            cache_mapper_.erase(cache_list_.back().key_);
            cache_list_.pop_back();
        }
        cache_list_.push_front(entry_t(key, value));
        cache_mapper_.insert({key, cache_list_.begin()});
    }

private:
    lru_cache_t(const lru_cache_t &other) = delete;
    lru_cache_t(lru_cache_t &&other) = delete;
    lru_cache_t &operator=(const lru_cache_t &other) = delete;
    lru_cache_t &operator=(lru_cache_t &&other) = delete;

    struct entry_t {
        entry_t(const std::vector<uint8_t> &key,
                const std::vector<uint8_t> &value)
            : key_(key), value_(value) {}
        std::vector<uint8_t> key_;
        std::vector<uint8_t> value_;
    };

    size_t capacity_;
    std::list<entry_t> cache_list_;
    std::map<std::vector<uint8_t>, std::list<entry_t>::iterator> cache_mapper_;

    const std::vector<uint8_t> dummy_;
};

lru_cache_t &get_test_cache() {
    static lru_cache_t cache(1024);
    return cache;
}
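
// A minimal usage sketch of the test-side LRU cache above (illustrative only;
// the real flow lives in test_persistent_cache_api() below). `cache_blob_id`
// and `serialized_blob` are hypothetical byte vectors obtained via
// get_cache_blob_id() and get_cache_blob():
//
//     auto &cache = get_test_cache();
//     const auto &blob = cache.get(cache_blob_id); // empty vector on a miss
//     if (blob.empty()) cache.add(cache_blob_id, serialized_blob);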

int test_persistent_cache_api(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &prim,
        const_dnnl_primitive_desc_t pd, res_t *res) {
    // The persistent cache API is validated for the OpenCL GPU runtime only.
    if (!is_gpu() || DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL) { return OK; }

    // Start testing persistent cache API.
    // 1. Disable the primitive cache to make sure that the next primitive will
    //    be created from the cache blob and not fetched from the primitive
    //    cache.
    const auto old_capacity = set_primitive_cache_capacity_without_clearing(0);
    // 2. Get the cache blob ID to use it as a key for the `test_cache`.
    std::vector<uint8_t> cache_blob_id;
    SAFE(get_cache_blob_id(cache_blob_id, pd), WARN);
    // 3. Check whether a cache blob for the obtained cache blob ID is present
    //    in the `test_cache`.
    //    a) If the cache blob is found, the primitive is created from it.
    //    b) If the cache blob is not found, get it from the previously
    //       created primitive, store it in the cache and create the primitive
    //       from it.
    dnnl_primitive_t p {};
    auto &cache = get_test_cache();
    auto cache_value = cache.get(cache_blob_id);
    if (!cache_value.empty()) {
        const size_t size = cache_value.size();
        const uint8_t *cache_blob = cache_value.data();
        DNN_SAFE(
                dnnl_primitive_create_from_cache_blob(&p, pd, size, cache_blob),
                WARN);
    } else {
        std::vector<uint8_t> cache_blob;
        SAFE(get_cache_blob(cache_blob, prim), WARN);

        // The cross-engine reorder is a special primitive that may contain no
        // kernels, therefore its cache blob is always empty, which is the
        // correct behavior.
        if (cache_blob.empty()) {
            set_primitive_cache_capacity_without_clearing(old_capacity);
            if (res->impl_name.find("cross_engine") != std::string::npos)
                return OK;

            BENCHDNN_PRINT(
                    0, "error: %s\n", "cache blob is not expected to be empty");
            res->state = FAILED;
            return FAIL;
        }

        DNN_SAFE(dnnl_primitive_create_from_cache_blob(
                         &p, pd, cache_blob.size(), cache_blob.data()),
                WARN);
        cache.add(cache_blob_id, cache_blob);
    }
    prim.reset(p);

    // 4. Restore the original primitive cache capacity to make it functional.
    set_primitive_cache_capacity_without_clearing(old_capacity);

    return OK;
}

float round_to_nearest_representable(dnnl_data_type_t dt, float value) {
    switch (dt) {
        case dnnl_f32: break;
        case dnnl_f64: break;
        case dnnl_bf16: value = (float)dnnl::impl::bfloat16_t(value); break;
        case dnnl_f16: value = (float)dnnl::impl::float16_t(value); break;
        case dnnl_s32:
        case dnnl_s8:
        case dnnl_u8: value = maybe_saturate(dt, value); break;
        default: SAFE(FAIL, CRIT);
    }

    return value;
}
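
// Illustrative examples of the conversion above (not exhaustive), assuming the
// usual saturation and rounding semantics of the listed types:
// - round_to_nearest_representable(dnnl_u8, 300.f) saturates to 255.f;
// - round_to_nearest_representable(dnnl_s8, -200.f) saturates to -128.f;
// - round_to_nearest_representable(dnnl_bf16, x) rounds `x` to bf16 precision,
//   i.e., 8 significant bits of mantissa (7 stored bits plus the implicit one).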

// Engine kind used to run oneDNN primitives for testing
dnnl_engine_kind_t engine_tgt_kind = dnnl_cpu;
// Engine index used to run oneDNN primitives for testing
size_t engine_index = 0;
// CPU ISA specific hints: none by default
isa_hints_t hints {isa_hints_t::none};

memory_kind_ext_t memory_kind {default_memory_kind};

void init_isa_settings() {
    if (hints.get() == isa_hints_t::no_hints)
        DNN_SAFE_V(dnnl_set_cpu_isa_hints(dnnl_cpu_isa_no_hints));
    else if (hints.get() == isa_hints_t::prefer_ymm)
        DNN_SAFE_V(dnnl_set_cpu_isa_hints(dnnl_cpu_isa_prefer_ymm));
    else {
        // Do nothing when hints == none
        assert(hints.get() == isa_hints_t::none);
    }
}

args_t &args_t::set(int arg, const dnn_mem_t &mem) {
    args_.emplace_back(arg, &mem);
    return *this;
}

args_t &args_t::set(
        const std::vector<int> &args, const std::vector<dnn_mem_t> &mems) {
    assert(args.size() == mems.size());
    for (size_t i = 0; i < mems.size(); ++i)
        args_.emplace_back(args[i], &mems[i]);
    return *this;
}

const dnn_mem_t &args_t::find(int arg) const {
    static dnn_mem_t empty_stub;
    for (const auto &e : args_) {
        if (e.first == arg) return *(e.second);
    }
    return empty_stub;
}

// Unmap before passing the memory to execute
void execute_unmap_args(
        const args_t &args, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    dnnl_args.resize(args.size());
    for (int i = 0; i < args.size(); ++i) {
        if (args.dnn_mem(i).is_mapped()) args.dnn_mem(i).unmap();

        dnnl_args[i].arg = args.arg(i);
        dnnl_args[i].memory = args.dnn_mem(i).m_;
    }
}

// Map the memory back after execute
void execute_map_args(const args_t &args) {
    for (int i = 0; i < args.size(); ++i)
        if (!args.dnn_mem(i).is_mapped()) args.dnn_mem(i).map();
}

int execute_and_wait(perf_function_t &exec_func, const dnnl_engine_t &engine,
        const args_t &args, res_t *res) {
    stream_t stream(engine);
    std::vector<dnnl_exec_arg_t> dnnl_args;

    execute_unmap_args(args, dnnl_args);

    DNN_SAFE(exec_func(stream, dnnl_args), CRIT);
    DNN_SAFE(dnnl_stream_wait(stream), CRIT);
    if (res) res->state = EXECUTED;

    execute_map_args(args);

    return OK;
}

dnnl_status_t primitive_executor(dnnl_primitive_t prim,
        const dnnl_stream_t &stream,
        const std::vector<dnnl_exec_arg_t> &dnnl_args) {
    return dnnl_primitive_execute(
            prim, stream, (int)dnnl_args.size(), dnnl_args.data());
}

int execute_and_wait(dnnl_primitive_t prim, const args_t &args, res_t *res) {
    perf_function_t exec_func = std::bind(&primitive_executor, prim,
            std::placeholders::_1, std::placeholders::_2);
    auto pd = query_pd(prim);
    auto engine = query_engine(pd);
    return execute_and_wait(exec_func, engine, args, res);
}

void enable_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_set_profiling(1));
#endif
}

void disable_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_reset_profiling());
    DNN_SAFE_V(dnnl_impl_gpu_set_profiling(0));
#endif
}

void reset_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_reset_profiling());
#endif
}

void get_gpu_profiling_info(uint64_t &nsec, double &freq, int mode) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_get_profile_info(nsec, freq, mode));
#endif
}

inline bool should_stop(const timer::timer_t &t) {
    const bool stop = false
            || (fix_times_per_prb && t.times() >= fix_times_per_prb)
            || (!fix_times_per_prb && t.total_ms() >= max_ms_per_prb
                    && t.times() >= min_times_per_prb);
    return stop;
}
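
// Informal restatement of the predicate above: in fixed-count mode
// (fix_times_per_prb > 0) the loop stops once that many stamps were taken;
// otherwise it stops only after both the time budget (max_ms_per_prb) and the
// minimum number of runs (min_times_per_prb) are satisfied.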

inline int measure_perf_individual(timer::timer_t &t, dnnl_stream_t stream,
        perf_function_t &perf_func, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    t.reset();
    while (true) {
        DNN_SAFE(perf_func(stream, dnnl_args), WARN);
        t.stamp();
        if (should_stop(t)) break;
    }
    return OK;
}

inline int measure_perf_aggregate(timer::timer_t &t, dnnl_stream_t stream,
        perf_function_t &perf_func, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    const int max_batch_times = 10000;

    // Warm-up run. It is not measured because the associated kernel may not
    // have been built yet, which would skew the results.
    DNN_SAFE(perf_func(stream, dnnl_args), WARN);
    DNN_SAFE(dnnl_stream_wait(stream), WARN);

    int cur_batch_times
            = fix_times_per_prb ? fix_times_per_prb : min_times_per_prb;

    t.reset();
    reset_gpu_profiling();

    bool is_first_loop = true;
    while (true) {
        for (int i = 0; i < cur_batch_times; i++) {
            DNN_SAFE(perf_func(stream, dnnl_args), WARN);
        }
        DNN_SAFE(dnnl_stream_wait(stream), WARN);

        if (is_bench_mode(PROF)) {
            uint64_t nsec = 0;
            double freq = 0;
            get_gpu_profiling_info(nsec, freq, 0);
            reset_gpu_profiling();
            t.stamp_with_frequency(cur_batch_times, nsec / 1e6, freq);
        } else {
            t.stamp(cur_batch_times);
        }

        if (should_stop(t)) break;

        // Adjust cur_batch_times after the first batch run
        if (is_first_loop) {
            double ms_min = t.ms(timer::timer_t::min);
            // Heuristic: try to use ~5 batch runs for the whole benchmark
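            // A worked example of the formula below with hypothetical numbers:
            // if ~100 ms of the budget remain and the fastest observed run
            // took ms_min = 2 ms, the next batches use 100 / 2 / 5 = 10 runs
            // per batch (clamped to [1, max_batch_times]). If ms_min is 0, the
            // batch size is capped by max_batch_times only.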
            int batch_times_heuristic = (ms_min == 0.0)
                    ? INT_MAX
                    : MAX2(1,
                            (int)((max_ms_per_prb - t.total_ms()) / ms_min
                                    / 5));
            cur_batch_times = MIN2(max_batch_times, batch_times_heuristic);
            is_first_loop = false;
        }
    }

    return OK;
}

int measure_perf(const thr_ctx_t &ctx, res_t *res, perf_function_t &perf_func,
        args_t &args) {
    int ret = OK;
    if (is_bench_mode(PERF)) {
        const auto &engine = get_test_engine();
        stream_t stream(engine, ctx.get_interop_obj());
        std::vector<dnnl_exec_arg_t> dnnl_args;
        execute_unmap_args(args, dnnl_args);

        auto &t = res->timer_map.perf_timer();
        // For non-DPCPP CPU: measure individual iterations.
        // For DPCPP CPU and GPU: measure iterations in batches to hide driver
        // overhead. DPCPP CPU follows the GPU model and is thus handled
        // similarly.
        if (is_cpu() && !is_sycl_engine(engine))
            ret = execute_in_thr_ctx(ctx, measure_perf_individual, t, stream,
                    perf_func, dnnl_args);
        else
            ret = execute_in_thr_ctx(ctx, measure_perf_aggregate, t, stream,
                    perf_func, dnnl_args);

        if (ret == OK) execute_map_args(args);
    }
    return ret;
}

int measure_perf(
        const thr_ctx_t &ctx, res_t *res, dnnl_primitive_t prim, args_t &args) {
    perf_function_t perf_func = std::bind(&primitive_executor, prim,
            std::placeholders::_1, std::placeholders::_2);

    return measure_perf(ctx, res, perf_func, args);
}

void maybe_prepare_runtime_scales(dnn_mem_t &scales_m,
        const attr_t::scale_t &scale, int64_t scale_cnt, const float *scales) {
    if (!scale.runtime) return;

    const int64_t count = scale.policy == policy_t::COMMON ? 1 : scale_cnt;

    scales_m = dnn_mem_t(1, &count, dnnl_f32, tag::x, get_test_engine());
    for (int64_t c = 0; c < count; ++c)
        ((float *)scales_m)[c] = scales[c];
}

void maybe_prepare_runtime_scales_v2(dnn_mem_t &scales_dt, dnn_mem_t &scales_fp,
        const attr_t::scale_t &scale, int64_t scale_cnt, const float *scales) {
    if (!scale.runtime) return;
    maybe_prepare_runtime_scales(scales_dt, scale, scale_cnt, scales);
    const int64_t count = scale.policy == policy_t::COMMON ? 1 : scale_cnt;
    scales_fp = dnn_mem_t(1, &count, dnnl_f32, tag::x, get_cpu_engine());
    for (int64_t c = 0; c < count; ++c)
        ((float *)scales_fp)[c] = ((float *)scales_dt)[c];
}

void maybe_prepare_runtime_zero_points(dnn_mem_t &zero_points_m,
        const attr_t &attr, int arg, int64_t count,
        const int32_t *zero_points) {
    if (!attr.zero_points.runtime(arg)) return;

    const auto e = attr.zero_points.get(arg);
    const int64_t cnt = e.policy == policy_t::COMMON ? 1 : count;

    zero_points_m = dnn_mem_t(1, &cnt, dnnl_s32, tag::x, get_test_engine());
    for (int64_t c = 0; c < cnt; ++c)
        ((int32_t *)zero_points_m)[c] = zero_points[c];
}

void maybe_prepare_runtime_zero_points_v2(dnn_mem_t &zero_points_dt,
        dnn_mem_t &zero_points_fp, const attr_t &attr, int arg, int64_t count,
        const int32_t *zero_points) {
    if (!attr.zero_points.runtime(arg)) return;
    maybe_prepare_runtime_zero_points(
            zero_points_dt, attr, arg, count, zero_points);
    const auto e = attr.zero_points.get(arg);
    const int64_t cnt = e.policy == policy_t::COMMON ? 1 : count;
    zero_points_fp = dnn_mem_t(1, &cnt, dnnl_s32, tag::x, get_cpu_engine());
    for (int64_t c = 0; c < cnt; ++c)
        ((int32_t *)zero_points_fp)[c] = ((int32_t *)zero_points_dt)[c];
}

std::vector<float> prepare_po_vals(const dnn_mem_t &dst_m, const args_t &args,
        const std::vector<std::pair<int, int>> &v_po_masks,
        const size_t dst_off) {
    std::vector<float> v_vals(v_po_masks.size());

    for (size_t d = 0; d < v_po_masks.size(); ++d) {
        const auto po_offset
                = dst_m.get_scale_idx(dst_off, v_po_masks[d].second);
        const float val = args.find(v_po_masks[d].first).get_elem(po_offset);
        v_vals[d] = val;
    }
    return v_vals;
}

bool check_md_consistency_with_tag(
        const_dnnl_memory_desc_t md, const std::string &tag) {
    auto md_new_tag = dnn_mem_t::init_md(
            query_md_ndims(md), query_md_dims(md), query_md_data_type(md), tag);
    return dnnl_memory_desc_equal(md_new_tag, md);
}
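
// For instance (illustrative only): a 4D f32 memory descriptor created with
// the "acdb" (NHWC) layout is reported as consistent with tag "acdb" but not
// with "abcd" (NCHW), since the re-created descriptor would differ in strides.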

void skip_start(res_t *res) {
    if (benchdnn_stat.tests < test_start) {
        res->state = SKIPPED;
        res->reason = SKIP_START;
        return;
    }
}

void skip_unimplemented_data_type(
        const std::vector<dnnl_data_type_t> &v_dt, dir_t dir, res_t *res) {
    const bool has_f64_support = is_f64_supported();
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    using namespace dnnl::impl::cpu::platform;
    // bf16 is supported on AVX512-CORE+
    const bool has_bf16_support = is_gpu()
            || (is_cpu() && has_data_type_support(dnnl_bf16)
                    && IMPLICATION(!(dir & FLAG_INF),
                            has_training_support(dnnl_bf16)));
    const bool has_f16_support = (is_gpu() && (dir & FLAG_FWD))
            || (is_cpu() && has_data_type_support(dnnl_f16)
                    && IMPLICATION(
                            !(dir & FLAG_INF), has_training_support(dnnl_f16)));

#else
    const bool has_bf16_support = is_gpu();
    // f16 is supported on GPU for inference only.
    const bool has_f16_support = is_gpu() && (dir & FLAG_FWD);
#endif

    for (const auto &i_dt : v_dt) {
        bool need_skip = false;
        switch (i_dt) {
            case dnnl_bf16: need_skip = !has_bf16_support; break;
            case dnnl_f16: need_skip = !has_f16_support; break;
            case dnnl_f64: need_skip = !has_f64_support; break;
            default: break;
        }
        if (need_skip) {
            res->state = SKIPPED, res->reason = DATA_TYPE_NOT_SUPPORTED;
            return;
        }
    }
}

void skip_unimplemented_sum_po(
        const attr_t &attr, res_t *res, dnnl_data_type_t dst_dt) {
    const auto &po = attr.post_ops;
    if (po.is_def()) return;

    const int first_sum_idx = po.find(attr_t::post_ops_t::SUM);
    if (first_sum_idx == -1) return;

    const auto sum_dt = po.entry[first_sum_idx].sum.dt;

    for (int idx = 0; idx < po.len(); ++idx) {
        const auto &e = po.entry[idx];
        if (e.is_sum_kind()) {
            // Sum with zero-point is not supported on GPU.
            if (is_gpu() && e.sum.zero_point != 0) {
                res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
                break;
            }
            // On CPU, each sum post-op must use the same data type.
            if (is_cpu() && e.sum.dt != sum_dt) {
                res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
                break;
            }
            // On both CPU and GPU, the sum data type must have the same size
            // as the dst data type.
            if (dst_dt != dnnl_data_type_undef && sum_dt != dnnl_data_type_undef
                    && dnnl_data_type_size(dst_dt)
                            != dnnl_data_type_size(e.sum.dt)) {
                res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
                return;
            }
        }
    }
}

void skip_unimplemented_arg_scale(const attr_t &attr, res_t *res) {
    for (const auto &arg_s : attr.scales.scales) {
        if (arg_s.second.policy != policy_t::COMMON) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }
}

void skip_invalid_inplace(res_t *res, dnnl_data_type_t sdt,
        dnnl_data_type_t ddt, const std::string &stag,
        const std::string &dtag) {
    // Note: the existing implementation of dnn_mem_t cannot track the fact
    // that two different objects pointing to the same SYCL memory must not
    // both be mapped/unmapped.
    // This leads to the restriction that the memory descriptors must coincide,
    // so a single memory object can be used for in-place validation.
    // The general limitation of in-place mode is that the input and output
    // must use the same amount of memory.
    if (sdt != ddt) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }

    if (dtag == tag::any) return;
    if (stag != dtag) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }
}

// This check ensures that attributes don't cause an implementation fallback.
int check_same_pd(const dnnl_primitive_desc_t &pd_no_attr, res_t *res) {
    const auto pd_no_attr_name = query_impl_info(pd_no_attr);
    if (res->impl_name == pd_no_attr_name) return OK;

    res->state = FAILED;
    BENCHDNN_PRINT(0,
            "ERROR: attributes caused impl fallback from [%s] to [%s]\n",
            pd_no_attr_name.c_str(), res->impl_name.c_str());
    return FAIL;
}

bool is_cpu(const dnnl_engine_t &engine) {
    return query_engine_kind(engine) == dnnl_cpu;
}

bool is_gpu(const dnnl_engine_t &engine) {
    return query_engine_kind(engine) == dnnl_gpu;
}

bool is_sycl_engine(const dnnl_engine_t &engine) {
    if (is_cpu(engine)) return DNNL_CPU_RUNTIME == DNNL_RUNTIME_DPCPP;
    if (is_gpu(engine)) return DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP;
    return false;
}

bool is_opencl_engine(const dnnl_engine_t &engine) {
    if (is_gpu(engine)) return DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL;
    return false;
}

bool is_nvidia_gpu(const dnnl_engine_t &engine) {
#ifdef DNNL_WITH_SYCL
    if (!is_gpu(engine)) return false;
    constexpr int nvidia_vendor_id = 0x10DE;
    auto eng = dnnl::engine(engine, true);
    auto device = dnnl::sycl_interop::get_device(eng);
    const auto eng_vendor_id
            = device.get_info<::sycl::info::device::vendor_id>();
    return eng_vendor_id == nvidia_vendor_id;
#endif
    return false;
}

bool is_amd_gpu(const dnnl_engine_t &engine) {
#ifdef DNNL_WITH_SYCL
    if (!is_gpu(engine)) return false;
    constexpr int amd_vendor_id = 0x1002;
    auto eng = dnnl::engine(engine, true);
    auto device = dnnl::sycl_interop::get_device(eng);
    const auto eng_vendor_id
            = device.get_info<::sycl::info::device::vendor_id>();
    return eng_vendor_id == amd_vendor_id;
#endif
    return false;
}

bool is_f64_supported(const dnnl_engine_t &engine) {
    if (!is_gpu(engine)) return false;
    if (is_nvidia_gpu(engine) || is_amd_gpu(engine)) return false;
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
    if (is_sycl_engine(engine)) {
        auto eng = dnnl::engine(engine, true);
        auto dev = dnnl::sycl_interop::get_device(eng);
#ifdef DNNL_SYCL_INTEROP_USE_SYCL121
        return dev.has_extension("cl_khr_fp64");
#else
        return dev.has(::sycl::aspect::fp64);
#endif
    }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (is_opencl_engine(engine)) {
        auto eng = dnnl::engine(engine, true);
        cl_device_id dev = dnnl::ocl_interop::get_device(eng);
        size_t param_size = 0;
        cl_int err = clGetDeviceInfo(
                dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &param_size);
        if (err != CL_SUCCESS) return false;

        std::string extension_string(param_size, '\0');
        err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, param_size,
                &extension_string[0], &param_size);
        if (err != CL_SUCCESS) return false;

        return extension_string.find("cl_khr_fp64") != std::string::npos;
    }
#endif
    return false;
}

#if defined(_WIN32) && !defined(__GNUC__)
#include "windows.h"

static size_t get_cpu_ram_size() {
    MEMORYSTATUSEX s {};
    s.dwLength = sizeof(s);
    GlobalMemoryStatusEx(&s);
    return s.ullTotalPhys;
}
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__QNXNTO__)
#include <unistd.h>
#include <sys/sysctl.h>

static size_t get_cpu_ram_size() {
#ifdef __APPLE__
    int query_ram[] = {CTL_HW, HW_MEMSIZE};
#else
    int query_ram[] = {CTL_HW, HW_PHYSMEM};
#endif
    int query_ram_len = sizeof(query_ram) / sizeof(*query_ram);
    size_t totalram = 0;
    size_t length = sizeof(totalram);

    sysctl(query_ram, query_ram_len, &totalram, &length, NULL, 0);
    return totalram;
}
#else
#include <sys/sysinfo.h>

static size_t get_cpu_ram_size() {
    struct sysinfo s {};
    sysinfo(&s);
    return s.totalram;
}
#endif

static size_t get_gpu_ram_size() {
    // XXX: create a tmp engine to query what we need.
    // It will be removed in the future as part of switching back
    // to the global engine.
    engine_t eng_tmp(engine_tgt_kind);
    dnnl::engine eng(eng_tmp, true);
    if (eng.get_kind() != dnnl::engine::kind::gpu) return 0;

#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    cl_int status = CL_SUCCESS;
    // Get single device attached to the engine.
    engine_t engine_tgt(engine_tgt_kind);
    cl_device_id ocl_device = dnnl::ocl_interop::get_device(eng);

    cl_ulong ram_size = 0;
    status = clGetDeviceInfo(ocl_device, CL_DEVICE_GLOBAL_MEM_SIZE,
            sizeof(cl_ulong), &ram_size, nullptr);
    if (status == CL_SUCCESS) return (size_t)ram_size;
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
    auto sycl_dev = dnnl::sycl_interop::get_device(eng);
    return (size_t)sycl_dev.get_info<::sycl::info::device::global_mem_size>();
#endif
    return 0;
}

struct check_mem_size_args_t {
    check_mem_size_args_t(const_dnnl_primitive_desc_t pd, bool want_input,
            bool add_ref_size = false)
        : pd(pd)
        , want_input(want_input)
        , add_ref_size(add_ref_size)
        , is_scratchpad(false)
        , total_size_device(0)
        , total_size_cpu(0)
        , scratchpad_size(0) {}

    // Input args.
    const_dnnl_primitive_desc_t pd;
    bool want_input;
    bool add_ref_size;
    bool is_scratchpad;

    // Output args.
    size_t total_size_device;
    size_t total_size_cpu;
    size_t scratchpad_size;
};

static int check_total_size(
        const check_mem_size_args_t &check_mem_size_args, res_t *res) {
    static uint64_t cpu_device_capacity = get_cpu_ram_size();
    static uint64_t gpu_device_capacity = get_gpu_ram_size();

    const uint64_t device_max_capacity
            = is_cpu() ? cpu_device_capacity : gpu_device_capacity;
    const uint64_t cpu_max_capacity = cpu_device_capacity;

    // 0.75f is chosen arbitrarily and is subject to change in the future.
    const double capacity_factor = 0.75;
    const double benchdnn_device_limit = capacity_factor * device_max_capacity;
    const double benchdnn_cpu_limit = capacity_factor * cpu_max_capacity;
    assert(benchdnn_device_limit > 0 && benchdnn_cpu_limit > 0);

    const bool fits_device_ram = is_gpu()
            ? (check_mem_size_args.total_size_device <= benchdnn_device_limit)
            : true;
    if (!fits_device_ram) {
        BENCHDNN_PRINT(
                2, "%s\n", "benchdnn: not enough device RAM for a problem.");
        res->state = SKIPPED;
        res->reason = NOT_ENOUGH_RAM;
    }

    auto GB = [](double bytes) { return bytes / powf(2, 30); };

    if (is_gpu()) {
        BENCHDNN_PRINT((!fits_device_ram ? 2 : 6),
                "Requested: %g GB, benchdnn device limit: %g GB, device RAM "
                "capacity: %g GB\n",
                GB(check_mem_size_args.total_size_device),
                GB(benchdnn_device_limit), GB(gpu_device_capacity));
    }

    size_t total_size_cpu = check_mem_size_args.total_size_cpu;
    if (is_cpu()) total_size_cpu += check_mem_size_args.total_size_device;
    bool fits_cpu_ram = total_size_cpu <= benchdnn_cpu_limit;

    if (!fits_cpu_ram) {
        BENCHDNN_PRINT(
                2, "%s\n", "benchdnn: not enough CPU RAM for a problem.");
        // Try to catch a huge scratchpad size requested by the library.
        // Use the following logic:
        //
        //        scratch_size
        //     ---------------------- <= 0.75 (pre-defined threshold).
        //     io_size + scratch_size
        //
        // The 0.75 threshold is experimental and might be adjusted.
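        //
        // A hypothetical worked example: if the I/O memory adds up to 1 GB and
        // the library asks for a 4 GB scratchpad, the ratio is
        // 4 / (1 + 4) = 0.8 > 0.75, so the case is reported as FAILED rather
        // than silently SKIPPED.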
        static constexpr float scratch_trh = 0.75f;
        if (check_mem_size_args.scratchpad_size
                > scratch_trh * total_size_cpu) {
            BENCHDNN_PRINT(2, "%s `%ld` %s `%ld`.\n",
                    "benchdnn: CPU scratchpad size",
                    (long)check_mem_size_args.scratchpad_size,
                    "exceeded a given threshold",
                    (long)(scratch_trh * total_size_cpu));
            res->state = FAILED;
        } else {
            res->state = SKIPPED;
        }
        res->reason = NOT_ENOUGH_RAM;
    }

    BENCHDNN_PRINT((!fits_cpu_ram ? 2 : 6),
            "Requested: %g GB, benchdnn CPU limit: %g GB, CPU RAM capacity: %g "
            "GB\n",
            GB(total_size_cpu), GB(benchdnn_cpu_limit),
            GB(cpu_device_capacity));

    return res->state == FAILED ? FAIL : OK;
}

static void add_md_size(const_dnnl_memory_desc_t md,
        check_mem_size_args_t &check_mem_size_args) {
    const auto mem_size = dnnl_memory_desc_get_size(md);
    // Runtime mem size is not defined.
    if (mem_size == 0 || mem_size == DNNL_RUNTIME_SIZE_VAL) return;

    check_mem_size_args.total_size_device += mem_size; // Original memory size.
    if (!check_mem_size_args.add_ref_size) return;

    // Reference memories are always tag::abx fp32, hence the memory descriptor
    // is re-created to take its size.
    auto ref_md = dnn_mem_t::init_md(
            query_md_ndims(md), query_md_dims(md), dnnl_f32, tag::abx);
    const auto ref_md_size = dnnl_memory_desc_get_size(ref_md);

    // The correctness pass allocates an additional tag::abx f32 memory.
    bool compare_mem_factor = !check_mem_size_args.want_input
            && check_mem_size_args.add_ref_size;

    // All memory is mapped once it is created and unmapped only before
    // primitive execution. Device memory requires an additional buffer for the
    // mapped memory.
    // XXX: In a DPC++ build oneDNN uses USM memory, which shouldn't require an
    // additional buffer, so the map factor should be equal to 0 for DPC++.
    // However, due to a driver issue, oneDNN pretends that shared USM is not
    // accessible on the host, hence mapping allocates extra memory.
    check_mem_size_args.total_size_cpu += !is_cpu() * mem_size; // Map factor.
    if (check_mem_size_args.is_scratchpad) {
        check_mem_size_args.scratchpad_size += mem_size;
    } else {
        check_mem_size_args.total_size_cpu += ref_md_size; // Reference memory.
        // Comparison memory.
        check_mem_size_args.total_size_cpu += compare_mem_factor * ref_md_size;
    }
}

bool is_fwd_prop_kind(dnnl_prop_kind_t prop_kind) {
    return prop_kind == dnnl_forward_training
            || prop_kind == dnnl_forward_inference
            || prop_kind == dnnl_prop_kind_undef;
}

static void get_memory_bytes(check_mem_size_args_t &check_mem_size_args) {
    auto const_pd = check_mem_size_args.pd;
    const int n_idx = check_mem_size_args.want_input
            ? query_n_inputs(const_pd)
            : query_n_outputs(const_pd);
    const auto prop_kind = query_prop_kind(const_pd);
    const bool is_fwd = is_fwd_prop_kind(prop_kind);

#define MD(name) dnnl_query_##name##_md
    std::vector<dnnl_query_t> query_fwd_in_mds {MD(src), MD(weights)};
    std::vector<dnnl_query_t> query_fwd_out_mds {MD(dst), MD(workspace)};

    std::vector<dnnl_query_t> query_bwd_in_mds {
            MD(src), MD(weights), MD(dst), MD(diff_dst), MD(workspace)};
    std::vector<dnnl_query_t> query_bwd_out_mds {
            MD(diff_src), MD(diff_weights)};
#undef MD

    const auto &query_in_mds = is_fwd ? query_fwd_in_mds : query_bwd_in_mds;
    const auto &query_out_mds = is_fwd ? query_fwd_out_mds : query_bwd_out_mds;
    const auto &query_mds
            = check_mem_size_args.want_input ? query_in_mds : query_out_mds;

    for_(const auto query : query_mds)
    for (int idx = 0; idx < n_idx; ++idx) {
        const auto &md = query_md(const_pd, query, idx);
        add_md_size(md, check_mem_size_args);
    }
}

int check_mem_size(const_dnnl_memory_desc_t md, res_t *res) {
    if (!mem_check) return OK;

    check_mem_size_args_t check_mem_size_args(nullptr, false, false);
    check_mem_size_args.total_size_device = dnnl_memory_desc_get_size(md);

    return check_total_size(check_mem_size_args, res);
}

int check_mem_size(const_dnnl_primitive_desc_t const_pd, res_t *res) {
    if (!mem_check) return OK;

    // Get input sizes.
    check_mem_size_args_t check_mem_size_args(const_pd, /* want_input = */ true,
            /* add_ref_size = */ true);
    get_memory_bytes(check_mem_size_args);

    // Get the scratchpad size. Treat it as `want_input=true` to avoid the
    // comparison-memory factor. Since the scratchpad modes are mutually
    // exclusive, sizes for both modes are accumulated: either of them will
    // report a size of 0.
    check_mem_size_args.is_scratchpad = true;
    const auto &scratchpad_md = query_md(const_pd, DNNL_ARG_SCRATCHPAD);
    add_md_size(scratchpad_md, check_mem_size_args);
    check_mem_size_args.is_scratchpad = false;
    check_mem_size_args.total_size_device += query_mem_consumption(const_pd);
    check_mem_size_args.scratchpad_size += query_mem_consumption(const_pd);

    // Get output sizes.
    check_mem_size_args.want_input = false;
    get_memory_bytes(check_mem_size_args);

    return check_total_size(check_mem_size_args, res);
}

int get_memory_footprint(const_dnnl_primitive_desc_t const_pd, res_t *res) {
    check_mem_size_args_t check_mem_in_size_args(
            const_pd, /* want_input = */ true);
    get_memory_bytes(check_mem_in_size_args); // Get input bytes.
    check_mem_size_args_t check_mem_out_size_args(
            const_pd, /* want_input = */ false);
    get_memory_bytes(check_mem_out_size_args); // Get output bytes.

    // Update read bytes with dst bytes in case of sum post-op.
    auto const_attr_po = query_post_ops(const_pd);
    auto po_len = dnnl_post_ops_len(const_attr_po);
    for (int idx = 0; idx < po_len; ++idx) {
        const auto kind = dnnl_post_ops_get_kind(const_attr_po, idx);
        if (kind == dnnl_sum) {
            const auto &dst_md = query_md(const_pd, DNNL_ARG_DST);
            add_md_size(dst_md, check_mem_in_size_args);
        }
    }

    res->ibytes = check_mem_in_size_args.total_size_device;
    res->obytes = check_mem_out_size_args.total_size_device;

    return OK;
}

memory_kind_ext_t str2memory_kind(const char *str) {
#define CASE(param) \
    if (!strcasecmp(#param, str)) return memory_kind_ext_t::param

    CASE(usm);
    CASE(buffer);
    CASE(usm_device);
    CASE(usm_shared);

#undef CASE

    assert(!"not expected");
    return memory_kind_ext_t::usm;
}

static void maybe_print_cpu_engine_error_message() {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
    fprintf(stderr,
            "ERROR: can't create CPU engine. Possible reasons for this error:\n"
            "- Incorrect SYCL_DEVICE_FILTER. The filter must be either unset "
            "or include 'opencl:cpu' devices.\n"
            "- Missing TBB library which is required for the OpenCL CPU "
            "runtime. Check that the TBB library is available on the system.\n"
            "- Missing OpenCL CPU runtime or other issues with the OpenCL CPU "
            "runtime. Check that the output of the `sycl-ls` or `clinfo -l` "
            "commands includes any CPU devices.\n");
#endif
}

engine_t::engine_t(dnnl_engine_kind_t engine_kind) : is_owner_(true) {
    enable_gpu_profiling();
    size_t idx = engine_kind == dnnl_cpu ? 0 : engine_index;
    dnnl_status_t status = dnnl_engine_create(&engine_, engine_kind, idx);
    if (engine_kind == dnnl_cpu && status != dnnl_success)
        maybe_print_cpu_engine_error_message();
    DNN_SAFE_V(status);
}

engine_t::engine_t(dnnl_engine_t engine) : engine_(engine), is_owner_(false) {}

engine_t::engine_t(const engine_t &other) {
    is_owner_ = other.is_owner_;

    if (!is_owner_) {
        engine_ = other.engine_;
        return;
    }

    dnnl_engine_kind_t engine_kind;
    DNN_SAFE_V(dnnl_engine_get_kind(other.engine_, &engine_kind));

    if (engine_kind == dnnl_cpu) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
        void *dev;
        void *ctx;
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_sycl_interop_engine_create(&engine_, dev, ctx));
#else
        DNN_SAFE_V(dnnl_engine_create(&engine_, dnnl_cpu, 0));
#endif
    } else if (engine_kind == dnnl_gpu) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        cl_device_id dev;
        cl_context ctx;
        DNN_SAFE_V(dnnl_ocl_interop_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_ocl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_ocl_interop_engine_create(&engine_, dev, ctx));
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
        void *dev;
        void *ctx;
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_sycl_interop_engine_create(&engine_, dev, ctx));
#endif
    } else {
        assert(!"unsupported engine kind");
    }
}

engine_t::~engine_t() {
    if (is_owner_) DNN_SAFE_V(dnnl_engine_destroy(engine_));
}

stream_t::stream_t(dnnl_engine_t engine, void *interop_obj) {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    if (is_cpu(engine)) {
        auto tp = static_cast<dnnl::threadpool_interop::threadpool_iface *>(
                interop_obj);
        if (tp == nullptr) tp = dnnl::testing::get_threadpool();
        SAFE_V(dnnl_threadpool_interop_stream_create(&stream_, engine, tp));
        return;
    }
#endif
    DNN_SAFE_V(dnnl_stream_create(&stream_, engine, dnnl_stream_default_flags));
}

stream_t::~stream_t() {
    DNN_SAFE_V(dnnl_stream_destroy(stream_));
}

float reorder_rescale_factor() {
    float factor = 1.f;
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    if (is_cpu(get_test_engine()))
        factor = dnnl::impl::cpu::platform::s8s8_weights_scale_factor();
#endif
    return factor;
}

dims_t md2dims(const dnnl_memory_desc_t &md) {
    auto ndims = query_md_ndims(md);
    dims_t dims(ndims, 0);
    for (int d = 0; d < ndims; ++d)
        dims[d] = query_md_dims(md)[d];
    return dims;
}

dnnl_data_type_t deduce_cfg_data_type(
        dnnl_data_type_t in_dt, const attr_t &attr, data_kind_t dk) {
    dnnl_data_type_t dt_ = in_dt;

    if ((dk == SRC || dk == WEI) && dt_ == dnnl_f32) {
        // Update the data type based on the fpmath-mode attribute.
        switch (attr.fpmath_mode) {
            case dnnl_fpmath_mode_strict: break;
            case dnnl_fpmath_mode_bf16: dt_ = dnnl_bf16; break;
            case dnnl_fpmath_mode_tf32: dt_ = dnnl_bf16; break;
            default: assert(!"unsupported_fpmath_mode"); SAFE_V(CRIT);
        }
    } else if (dk == DST) {
        // The sum post-op defines the data type used to fill the destination.
        const int sum_idx = attr.post_ops.find(attr_t::post_ops_t::SUM);
        if (sum_idx >= 0) {
            auto sum_dt = attr.post_ops.entry[sum_idx].sum.dt;
            if (sum_dt != dnnl_data_type_undef) dt_ = sum_dt;
        }
    }

    return dt_;
}
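
// Illustrative examples of the deduction above (not an exhaustive list):
// - SRC/WEI given as f32 with the fpmath-mode attribute set to bf16 (or tf32)
//   are treated as bf16 for filling, presumably so that the reference and the
//   library computations see comparable inputs;
// - DST given as f32 with a sum post-op whose data type is s8 is treated as
//   s8, since the sum post-op defines the pre-existing destination contents.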
1159