1 | /******************************************************************************* |
2 | * Copyright 2017-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <algorithm> // for std::reverse and std::copy
#include <functional> // for std::bind and std::placeholders
#include <list>
#include <map> // for std::map, used by lru_cache_t
#include <string> // for std::string
#include <utility> // for std::pair
#include <vector> // for std::vector

#include <assert.h>
#include <stdint.h> // for uint8_t, uint64_t
25 | |
26 | #include "oneapi/dnnl/dnnl.hpp" |
27 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
28 | #include "oneapi/dnnl/dnnl_ocl.hpp" |
29 | #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP |
30 | #include "oneapi/dnnl/dnnl_sycl.hpp" |
31 | #endif |
32 | |
33 | #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL |
34 | #include "oneapi/dnnl/dnnl_threadpool.h" |
35 | #endif |
36 | |
37 | #ifndef DNNL_DISABLE_PRIMITIVE_CACHE |
38 | #include "src/common/primitive_cache.hpp" |
39 | #endif |
40 | |
41 | #include "cpu/platform.hpp" |
42 | |
43 | #include "tests/test_thread.hpp" |
44 | |
45 | #include "dnnl_common.hpp" |
46 | #include "dnnl_memory.hpp" |
47 | |
48 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \ |
49 | || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL |
50 | extern "C" dnnl_status_t dnnl_impl_gpu_set_profiling(int flag); |
51 | extern "C" dnnl_status_t dnnl_impl_gpu_reset_profiling(); |
52 | extern "C" dnnl_status_t dnnl_impl_gpu_get_profile_info( |
53 | uint64_t &time, double &freq, int mode); |
54 | #endif |
55 | |
// Verifies that `pd` was delivered from the library primitive cache when the
// cache is enabled; returns FAIL if a cache hit was expected but didn't occur.
int check_pd_cache(const_dnnl_primitive_desc_t pd) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    int capacity = 0;
    DNN_SAFE(dnnl_get_primitive_cache_capacity(&capacity), CRIT);
    // The check is meaningful only with a non-zero cache capacity.
    if (capacity && !dnnl::impl::is_pd_in_cache(pd)) {
        BENCHDNN_PRINT(0, "error: %s\n",
                "primitive descriptor is expected to be fetched from "
                "the primitive cache");
        return FAIL;
    }
#endif
    return OK;
}
69 | |
// Verifies that primitive `p` was delivered from the library primitive cache
// when the cache is enabled; returns FAIL if a cache hit was expected.
int check_primitive_cache(dnnl_primitive_t p) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    int capacity = 0;
    DNN_SAFE(dnnl_get_primitive_cache_capacity(&capacity), CRIT);
    // The check is meaningful only with a non-zero cache capacity.
    if (capacity && !dnnl::impl::is_primitive_in_cache(p)) {
        BENCHDNN_PRINT(0, "error: %s\n",
                "primitive is expected to be fetched from the primitive "
                "cache");
        return FAIL;
    }
#endif
    return OK;
}
83 | |
84 | size_t set_primitive_cache_capacity_without_clearing(size_t capacity) { |
85 | #ifndef DNNL_DISABLE_PRIMITIVE_CACHE |
86 | return dnnl::impl::set_primitive_cache_capacity_without_clearing(capacity); |
87 | #endif |
88 | return size_t(0); |
89 | } |
90 | |
// Queries the cache blob ID (the unique identifier of a compiled primitive)
// from `pd` into `cache_blob_id`.
int get_cache_blob_id(
        std::vector<uint8_t> &cache_blob_id, const_dnnl_primitive_desc_t pd) {
    dnnl_dim_t count;
    const uint8_t *c_id;
    // Query the ID size first, then the pointer to the ID bytes.
    DNN_SAFE(dnnl_primitive_desc_query(
                     pd, dnnl_query_cache_blob_id_size_s64, 0, (void *)&count),
            WARN);
    DNN_SAFE(dnnl_primitive_desc_query(
                     pd, dnnl_query_cache_blob_id, 0, (void **)&c_id),
            WARN);
    cache_blob_id = {c_id, c_id + count};
    return OK;
}
104 | |
// Retrieves the serialized cache blob of `prim` into `cache_blob`.
int get_cache_blob(std::vector<uint8_t> &cache_blob, dnnl_primitive_t prim) {
    // Query the blob size first, then fetch the payload.
    size_t size = 0;
    DNN_SAFE(dnnl_primitive_get_cache_blob(prim, &size, nullptr), WARN);

    cache_blob.resize(size);
    DNN_SAFE(dnnl_primitive_get_cache_blob(prim, &size, cache_blob.data()),
            WARN);
    return OK;
}
114 | |
// A minimal least-recently-used cache mapping byte-string keys to byte-string
// values. Implemented as a list ordered from most- to least-recently used,
// plus a map from key to list position for O(log n) lookup.
struct lru_cache_t {
    lru_cache_t(size_t capacity) : capacity_(capacity) {}

    // Returns the value for `key`, or a reference to an empty vector when the
    // key is absent. A hit promotes the entry to most-recently-used.
    const std::vector<uint8_t> &get(const std::vector<uint8_t> &key) {
        auto it = cache_mapper_.find(key);
        if (it == cache_mapper_.end()) { return dummy_; }
        // Move the hit entry to the front; splice preserves iterators, so the
        // map stays valid.
        cache_list_.splice(cache_list_.begin(), cache_list_, it->second);
        return cache_list_.front().value_;
    }

    // Inserts a new entry; `key` must not be present already. When the cache
    // is full, the least-recently-used entry is evicted first.
    void add(const std::vector<uint8_t> &key,
            const std::vector<uint8_t> &value) {
        assert(!cache_mapper_.count(key));
        // A zero-capacity cache stores nothing (also avoids popping from an
        // empty list below).
        if (capacity_ == 0) return;
        if (cache_mapper_.size() >= capacity_ && !cache_list_.empty()) {
            cache_mapper_.erase(cache_list_.back().key_);
            cache_list_.pop_back();
        }
        cache_list_.push_front(entry_t(key, value));
        cache_mapper_.insert({key, cache_list_.begin()});
    }

private:
    lru_cache_t(const lru_cache_t &other) = delete;
    lru_cache_t(lru_cache_t &&other) = delete;
    lru_cache_t &operator=(const lru_cache_t &other) = delete;
    lru_cache_t &operator=(lru_cache_t &&other) = delete;

    struct entry_t {
        entry_t(const std::vector<uint8_t> &key,
                const std::vector<uint8_t> &value)
            : key_(key), value_(value) {}
        std::vector<uint8_t> key_;
        std::vector<uint8_t> value_;
    };

    size_t capacity_;
    // Front = most recently used, back = eviction candidate.
    std::list<entry_t> cache_list_;
    std::map<std::vector<uint8_t>, std::list<entry_t>::iterator> cache_mapper_;

    // Returned by `get` on a miss.
    const std::vector<uint8_t> dummy_;
};
156 | |
157 | lru_cache_t &get_test_cache() { |
158 | static lru_cache_t cache(1024); |
159 | return cache; |
160 | } |
161 | |
162 | int test_persistent_cache_api(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &prim, |
163 | const_dnnl_primitive_desc_t pd, res_t *res) { |
164 | if (!is_gpu() || (is_gpu() && DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL)) { |
165 | return OK; |
166 | } |
167 | |
168 | // Start testing persistent cache API. |
169 | // 1. Disable primitive cache to make sure that the next primitive will |
170 | // be created from the cache blob and not fetched from the primitive cache. |
171 | const auto old_capacity = set_primitive_cache_capacity_without_clearing(0); |
172 | // 2. Get cache blob ID to use it as a key for the `test_cache`. |
173 | std::vector<uint8_t> cache_blob_id; |
174 | SAFE(get_cache_blob_id(cache_blob_id, pd), WARN); |
175 | // 3. Check if a cache blob for the obtained cache blob ID is present in the |
176 | // `test_cache`. |
177 | // a) If the cache blob is found the primitive is created from it. |
178 | // b) If the cache blob is not found then get it from the previously |
179 | // created primitive, store it in the cache and create the primitive |
180 | // from it. |
181 | dnnl_primitive_t p {}; |
182 | auto &cache = get_test_cache(); |
183 | auto cache_value = cache.get(cache_blob_id); |
184 | if (!cache_value.empty()) { |
185 | const size_t size = cache_value.size(); |
186 | const uint8_t *cache_blob = cache_value.data(); |
187 | DNN_SAFE( |
188 | dnnl_primitive_create_from_cache_blob(&p, pd, size, cache_blob), |
189 | WARN); |
190 | } else { |
191 | std::vector<uint8_t> cache_blob; |
192 | SAFE(get_cache_blob(cache_blob, prim), WARN); |
193 | |
194 | // The cross-engine reorder is a special primitive that may contain no |
195 | // kernels therefore the cache blob will always be empty, which is |
196 | // the correct behavior. |
197 | if (cache_blob.empty()) { |
198 | set_primitive_cache_capacity_without_clearing(old_capacity); |
199 | if (res->impl_name.find("cross_engine" ) != std::string::npos) |
200 | return OK; |
201 | |
202 | BENCHDNN_PRINT( |
203 | 0, "error: %s\n" , "cache blob is not expected to be empty" ); |
204 | res->state = FAILED; |
205 | return FAIL; |
206 | } |
207 | |
208 | DNN_SAFE(dnnl_primitive_create_from_cache_blob( |
209 | &p, pd, cache_blob.size(), cache_blob.data()), |
210 | WARN); |
211 | cache.add(cache_blob_id, cache_blob); |
212 | } |
213 | prim.reset(p); |
214 | |
215 | // 4. Restore the original primitive cache capacity to make it functional. |
216 | set_primitive_cache_capacity_without_clearing(old_capacity); |
217 | |
218 | return OK; |
219 | } |
220 | |
// Rounds `value` to the nearest value representable in data type `dt`:
// f32/f64 pass through, bf16/f16 round-trip through the narrow FP type,
// integer types are saturated. Unknown types fail with CRIT.
float round_to_nearest_representable(dnnl_data_type_t dt, float value) {
    switch (dt) {
        case dnnl_f32: break;
        case dnnl_f64: break;
        // Conversion to the narrow type applies its rounding; the cast back
        // to float is exact.
        case dnnl_bf16: value = (float)dnnl::impl::bfloat16_t(value); break;
        case dnnl_f16: value = (float)dnnl::impl::float16_t(value); break;
        case dnnl_s32:
        case dnnl_s8:
        case dnnl_u8: value = maybe_saturate(dt, value); break;
        default: SAFE(FAIL, CRIT);
    }

    return value;
}
235 | |
// Engine kind used to run oneDNN primitives for testing.
dnnl_engine_kind_t engine_tgt_kind = dnnl_cpu;
// Engine index used to run oneDNN primitives for testing.
size_t engine_index = 0;
// CPU ISA specific hints: none by default.
isa_hints_t hints {isa_hints_t::none};

// Memory kind used for test memory objects; starts at the build's
// default_memory_kind.
memory_kind_ext_t memory_kind {default_memory_kind};
244 | |
// Applies the requested CPU ISA hints (from the global `hints`) to the
// library via dnnl_set_cpu_isa_hints.
void init_isa_settings() {
    if (hints.get() == isa_hints_t::no_hints)
        DNN_SAFE_V(dnnl_set_cpu_isa_hints(dnnl_cpu_isa_no_hints));
    else if (hints.get() == isa_hints_t::prefer_ymm)
        DNN_SAFE_V(dnnl_set_cpu_isa_hints(dnnl_cpu_isa_prefer_ymm));
    else {
        // Do nothing when hints == none: keep the library default.
        assert(hints.get() == isa_hints_t::none);
    }
}
255 | |
256 | args_t &args_t::set(int arg, const dnn_mem_t &mem) { |
257 | args_.emplace_back(arg, &mem); |
258 | return *this; |
259 | } |
260 | |
261 | args_t &args_t::set( |
262 | const std::vector<int> &args, const std::vector<dnn_mem_t> &mems) { |
263 | assert(args.size() == mems.size()); |
264 | for (size_t i = 0; i < mems.size(); ++i) |
265 | args_.emplace_back(args[i], &mems[i]); |
266 | return *this; |
267 | } |
268 | |
269 | const dnn_mem_t &args_t::find(int arg) const { |
270 | static dnn_mem_t empty_stub; |
271 | for (const auto &e : args_) { |
272 | if (e.first == arg) return *(e.second); |
273 | } |
274 | return empty_stub; |
275 | } |
276 | |
277 | // Unmap before passing the memory to execute |
278 | void execute_unmap_args( |
279 | const args_t &args, std::vector<dnnl_exec_arg_t> &dnnl_args) { |
280 | dnnl_args.resize(args.size()); |
281 | for (int i = 0; i < args.size(); ++i) { |
282 | if (args.dnn_mem(i).is_mapped()) args.dnn_mem(i).unmap(); |
283 | |
284 | dnnl_args[i].arg = args.arg(i); |
285 | dnnl_args[i].memory = args.dnn_mem(i).m_; |
286 | } |
287 | } |
288 | |
289 | // Map the memory back after execute |
290 | void execute_map_args(const args_t &args) { |
291 | for (int i = 0; i < args.size(); ++i) |
292 | if (!args.dnn_mem(i).is_mapped()) args.dnn_mem(i).map(); |
293 | } |
294 | |
// Executes `exec_func` on a fresh stream of `engine`, waits for completion
// and re-maps the memories afterwards. Sets res->state to EXECUTED on
// success (when `res` is provided).
int execute_and_wait(perf_function_t &exec_func, const dnnl_engine_t &engine,
        const args_t &args, res_t *res) {
    stream_t stream(engine);
    std::vector<dnnl_exec_arg_t> dnnl_args;

    // Memories must be unmapped before the library touches them.
    execute_unmap_args(args, dnnl_args);

    DNN_SAFE(exec_func(stream, dnnl_args), CRIT);
    DNN_SAFE(dnnl_stream_wait(stream), CRIT);
    if (res) res->state = EXECUTED;

    execute_map_args(args);

    return OK;
}
310 | |
311 | dnnl_status_t primitive_executor(dnnl_primitive_t prim, |
312 | const dnnl_stream_t &stream, |
313 | const std::vector<dnnl_exec_arg_t> &dnnl_args) { |
314 | return dnnl_primitive_execute( |
315 | prim, stream, (int)dnnl_args.size(), dnnl_args.data()); |
316 | } |
317 | |
318 | int execute_and_wait(dnnl_primitive_t prim, const args_t &args, res_t *res) { |
319 | perf_function_t exec_func = std::bind(&primitive_executor, prim, |
320 | std::placeholders::_1, std::placeholders::_2); |
321 | auto pd = query_pd(prim); |
322 | auto engine = query_engine(pd); |
323 | return execute_and_wait(exec_func, engine, args, res); |
324 | } |
325 | |
// Enables library-side GPU kernel profiling; no-op unless benchdnn runs in
// profiling bench mode on an OCL/SYCL GPU runtime build.
void enable_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_set_profiling(1));
#endif
}
333 | |
// Disables library-side GPU kernel profiling; resets collected data first.
void disable_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_reset_profiling());
    DNN_SAFE_V(dnnl_impl_gpu_set_profiling(0));
#endif
}
342 | |
// Clears accumulated GPU profiling data (profiling bench mode only).
void reset_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_reset_profiling());
#endif
}
350 | |
// Fetches accumulated GPU profiling time (`nsec`) and frequency (`freq`)
// from the library. Outputs are left untouched outside profiling bench mode
// or on non-GPU runtime builds.
void get_gpu_profiling_info(uint64_t &nsec, double &freq, int mode) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_get_profile_info(nsec, freq, mode));
#endif
}
358 | |
359 | inline bool should_stop(const timer::timer_t &t) { |
360 | const bool stop = false |
361 | || (fix_times_per_prb && t.times() >= fix_times_per_prb) |
362 | || (!fix_times_per_prb && t.total_ms() >= max_ms_per_prb |
363 | && t.times() >= min_times_per_prb); |
364 | return stop; |
365 | } |
366 | |
// Measures performance stamping each iteration separately. Used where
// per-iteration launch overhead is negligible (non-DPCPP CPU, see
// measure_perf).
inline int measure_perf_individual(timer::timer_t &t, dnnl_stream_t stream,
        perf_function_t &perf_func, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    t.reset();
    while (true) {
        DNN_SAFE(perf_func(stream, dnnl_args), WARN);
        t.stamp();
        if (should_stop(t)) break;
    }
    return OK;
}
377 | |
// Measures performance in batches: runs `cur_batch_times` executions between
// stream waits to amortize launch/driver overhead, then stamps the batch as
// a whole. The batch size is re-estimated once after the first batch.
inline int measure_perf_aggregate(timer::timer_t &t, dnnl_stream_t stream,
        perf_function_t &perf_func, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    const int max_batch_times = 10000;

    // Warm-up run, this is not measured due to possibility the associated
    // kernel has not been built and skews the results.
    DNN_SAFE(perf_func(stream, dnnl_args), WARN);
    DNN_SAFE(dnnl_stream_wait(stream), WARN);

    int cur_batch_times
            = fix_times_per_prb ? fix_times_per_prb : min_times_per_prb;

    t.reset();
    reset_gpu_profiling();

    bool is_first_loop = true;
    while (true) {
        for (int i = 0; i < cur_batch_times; i++) {
            DNN_SAFE(perf_func(stream, dnnl_args), WARN);
        }
        DNN_SAFE(dnnl_stream_wait(stream), WARN);

        if (is_bench_mode(PROF)) {
            // Profiling mode: use device-side time/frequency reported by the
            // library instead of host wall-clock time.
            uint64_t nsec = 0;
            double freq = 0;
            get_gpu_profiling_info(nsec, freq, 0);
            reset_gpu_profiling();
            t.stamp_with_frequency(cur_batch_times, nsec / 1e6, freq);
        } else {
            t.stamp(cur_batch_times);
        }

        if (should_stop(t)) break;

        // Adjust cur_batch_times after the first batch run
        if (is_first_loop) {
            double ms_min = t.ms(timer::timer_t::min);
            // Heuristic: try to use ~5 batch runs for the whole benchmark
            int batch_times_heuristic = (ms_min == 0.0)
                    ? INT_MAX
                    : MAX2(1,
                            (int)((max_ms_per_prb - t.total_ms()) / ms_min
                                    / 5));
            cur_batch_times = MIN2(max_batch_times, batch_times_heuristic);
            is_first_loop = false;
        }
    }

    return OK;
}
428 | |
// Measures performance of `perf_func` (performance bench mode only) in the
// requested threading context `ctx`. Picks the individual or the aggregate
// measurement scheme depending on the engine kind/runtime.
int measure_perf(const thr_ctx_t &ctx, res_t *res, perf_function_t &perf_func,
        args_t &args) {
    int ret = OK;
    if (is_bench_mode(PERF)) {
        const auto &engine = get_test_engine();
        stream_t stream(engine, ctx.get_interop_obj());
        std::vector<dnnl_exec_arg_t> dnnl_args;
        execute_unmap_args(args, dnnl_args);

        auto &t = res->timer_map.perf_timer();
        // For non-DPCPP CPU: measure individual iterations.
        // For DPCPP CPU and GPU: measure iterations in batches to hide driver
        // overhead. DPCPP CPU follows the model of GPU, thus, handled similar.
        if (is_cpu() && !is_sycl_engine(engine))
            ret = execute_in_thr_ctx(ctx, measure_perf_individual, t, stream,
                    perf_func, dnnl_args);
        else
            ret = execute_in_thr_ctx(ctx, measure_perf_aggregate, t, stream,
                    perf_func, dnnl_args);

        // Memories are re-mapped only on success.
        if (ret == OK) execute_map_args(args);
    }
    return ret;
}
453 | |
454 | int measure_perf( |
455 | const thr_ctx_t &ctx, res_t *res, dnnl_primitive_t prim, args_t &args) { |
456 | perf_function_t perf_func = std::bind(&primitive_executor, prim, |
457 | std::placeholders::_1, std::placeholders::_2); |
458 | |
459 | return measure_perf(ctx, res, perf_func, args); |
460 | } |
461 | |
462 | void maybe_prepare_runtime_scales(dnn_mem_t &scales_m, |
463 | const attr_t::scale_t &scale, int64_t scale_cnt, const float *scales) { |
464 | if (!scale.runtime) return; |
465 | |
466 | const int64_t count = scale.policy == policy_t::COMMON ? 1 : scale_cnt; |
467 | |
468 | scales_m = dnn_mem_t(1, &count, dnnl_f32, tag::x, get_test_engine()); |
469 | for (int64_t c = 0; c < count; ++c) |
470 | ((float *)scales_m)[c] = scales[c]; |
471 | } |
472 | |
473 | void maybe_prepare_runtime_scales_v2(dnn_mem_t &scales_dt, dnn_mem_t &scales_fp, |
474 | const attr_t::scale_t &scale, int64_t scale_cnt, const float *scales) { |
475 | if (!scale.runtime) return; |
476 | maybe_prepare_runtime_scales(scales_dt, scale, scale_cnt, scales); |
477 | const int64_t count = scale.policy == policy_t::COMMON ? 1 : scale_cnt; |
478 | scales_fp = dnn_mem_t(1, &count, dnnl_f32, tag::x, get_cpu_engine()); |
479 | for (int64_t c = 0; c < count; ++c) |
480 | ((float *)scales_fp)[c] = ((float *)scales_dt)[c]; |
481 | } |
482 | |
483 | void maybe_prepare_runtime_zero_points(dnn_mem_t &zero_points_m, |
484 | const attr_t &attr, int arg, int64_t count, |
485 | const int32_t *zero_points) { |
486 | if (!attr.zero_points.runtime(arg)) return; |
487 | |
488 | const auto e = attr.zero_points.get(arg); |
489 | const int64_t cnt = e.policy == policy_t::COMMON ? 1 : count; |
490 | |
491 | zero_points_m = dnn_mem_t(1, &cnt, dnnl_s32, tag::x, get_test_engine()); |
492 | for (int64_t c = 0; c < cnt; ++c) |
493 | ((int32_t *)zero_points_m)[c] = zero_points[c]; |
494 | } |
495 | |
496 | void maybe_prepare_runtime_zero_points_v2(dnn_mem_t &zero_points_dt, |
497 | dnn_mem_t &zero_points_fp, const attr_t &attr, int arg, int64_t count, |
498 | const int32_t *zero_points) { |
499 | if (!attr.zero_points.runtime(arg)) return; |
500 | maybe_prepare_runtime_zero_points( |
501 | zero_points_dt, attr, arg, count, zero_points); |
502 | const auto e = attr.zero_points.get(arg); |
503 | const int64_t cnt = e.policy == policy_t::COMMON ? 1 : count; |
504 | zero_points_fp = dnn_mem_t(1, &cnt, dnnl_s32, tag::x, get_cpu_engine()); |
505 | for (int64_t c = 0; c < cnt; ++c) |
506 | ((int32_t *)zero_points_fp)[c] = ((int32_t *)zero_points_dt)[c]; |
507 | } |
508 | |
509 | std::vector<float> prepare_po_vals(const dnn_mem_t &dst_m, const args_t &args, |
510 | const std::vector<std::pair<int, int>> &v_po_masks, |
511 | const size_t dst_off) { |
512 | std::vector<float> v_vals(v_po_masks.size()); |
513 | |
514 | for (size_t d = 0; d < v_po_masks.size(); ++d) { |
515 | const auto po_offset |
516 | = dst_m.get_scale_idx(dst_off, v_po_masks[d].second); |
517 | const float val = args.find(v_po_masks[d].first).get_elem(po_offset); |
518 | v_vals[d] = val; |
519 | } |
520 | return v_vals; |
521 | } |
522 | |
523 | bool check_md_consistency_with_tag( |
524 | const_dnnl_memory_desc_t md, const std::string &tag) { |
525 | auto md_new_tag = dnn_mem_t::init_md( |
526 | query_md_ndims(md), query_md_dims(md), query_md_data_type(md), tag); |
527 | return dnnl_memory_desc_equal(md_new_tag, md); |
528 | } |
529 | |
530 | void skip_start(res_t *res) { |
531 | if (benchdnn_stat.tests < test_start) { |
532 | res->state = SKIPPED; |
533 | res->reason = SKIP_START; |
534 | return; |
535 | } |
536 | } |
537 | |
// Marks the case as skipped when any data type in `v_dt` is not supported by
// the current device/build (bf16/f16 depend on hardware and direction, f64
// depends on the device).
void skip_unimplemented_data_type(
        const std::vector<dnnl_data_type_t> &v_dt, dir_t dir, res_t *res) {
    const bool has_f64_support = is_f64_supported();
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    using namespace dnnl::impl::cpu::platform;
    // bf16 is supported on AVX512-CORE+
    const bool has_bf16_support = is_gpu()
            || (is_cpu() && has_data_type_support(dnnl_bf16)
                    && IMPLICATION(!(dir & FLAG_INF),
                            has_training_support(dnnl_bf16)));
    // f16: GPU forward only; CPU requires data type (and, for training
    // directions, training) support.
    const bool has_f16_support = (is_gpu() && (dir & FLAG_FWD))
            || (is_cpu() && has_data_type_support(dnnl_f16)
                    && IMPLICATION(
                            !(dir & FLAG_INF), has_training_support(dnnl_f16)));

#else
    const bool has_bf16_support = is_gpu();
    // f16 is supported on GPU for inference only.
    const bool has_f16_support = is_gpu() && (dir & FLAG_FWD);
#endif

    for (const auto &i_dt : v_dt) {
        bool need_skip = false;
        switch (i_dt) {
            case dnnl_bf16: need_skip = !has_bf16_support; break;
            case dnnl_f16: need_skip = !has_f16_support; break;
            case dnnl_f64: need_skip = !has_f64_support; break;
            default: break;
        }
        if (need_skip) {
            res->state = SKIPPED, res->reason = DATA_TYPE_NOT_SUPPORTED;
            return;
        }
    }
}
573 | |
574 | void skip_unimplemented_sum_po( |
575 | const attr_t &attr, res_t *res, dnnl_data_type_t dst_dt) { |
576 | const auto &po = attr.post_ops; |
577 | if (po.is_def()) return; |
578 | |
579 | const int first_sum_idx = po.find(attr_t::post_ops_t::SUM); |
580 | if (first_sum_idx == -1) return; |
581 | |
582 | const auto sum_dt = po.entry[first_sum_idx].sum.dt; |
583 | |
584 | for (int idx = 0; idx < po.len(); ++idx) { |
585 | const auto &e = po.entry[idx]; |
586 | if (e.is_sum_kind()) { |
587 | // Sum with zero-point is not supported on GPU |
588 | if (is_gpu() && e.sum.zero_point != 0) { |
589 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
590 | break; |
591 | } |
592 | // Each sum must have same data on CPU |
593 | if (is_cpu() && e.sum.dt != sum_dt) { |
594 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
595 | break; |
596 | } |
597 | // Sum must have data type with the same size like dst on both |
598 | if (dst_dt != dnnl_data_type_undef && sum_dt != dnnl_data_type_undef |
599 | && dnnl_data_type_size(dst_dt) |
600 | != dnnl_data_type_size(e.sum.dt)) { |
601 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
602 | return; |
603 | } |
604 | } |
605 | } |
606 | } |
607 | |
608 | void skip_unimplemented_arg_scale(const attr_t &attr, res_t *res) { |
609 | for (const auto &arg_s : attr.scales.scales) { |
610 | if (arg_s.second.policy != policy_t::COMMON) { |
611 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
612 | return; |
613 | } |
614 | } |
615 | } |
616 | |
617 | void skip_invalid_inplace(res_t *res, dnnl_data_type_t sdt, |
618 | dnnl_data_type_t ddt, const std::string &stag, |
619 | const std::string &dtag) { |
620 | // Note: existing implementation of dnn_mem_t doesn't allow to track the |
621 | // fact that two different objects pointing on the same SYCL memory should |
622 | // not map/unmap both objects. |
623 | // This leads to the restriction that memory descriptors should coincide, |
624 | // thus, a single memory object would be used for in-place validation. |
625 | // General limitation of in-place mode is having same amount of memory on |
626 | // input and output. |
627 | if (sdt != ddt) { |
628 | res->state = SKIPPED, res->reason = INVALID_CASE; |
629 | return; |
630 | } |
631 | |
632 | if (dtag == tag::any) return; |
633 | if (stag != dtag) { |
634 | res->state = SKIPPED, res->reason = INVALID_CASE; |
635 | return; |
636 | } |
637 | } |
638 | |
639 | // Check ensures that attributes don't cause implementation fallback |
640 | int check_same_pd(const dnnl_primitive_desc_t &pd_no_attr, res_t *res) { |
641 | const auto pd_no_attr_name = query_impl_info(pd_no_attr); |
642 | if (res->impl_name == pd_no_attr_name) return OK; |
643 | |
644 | res->state = FAILED; |
645 | BENCHDNN_PRINT(0, |
646 | "ERROR: attributes caused impl fallback from [%s] to [%s]\n" , |
647 | pd_no_attr_name.c_str(), res->impl_name.c_str()); |
648 | return FAIL; |
649 | } |
650 | |
// Returns true when `engine` is a CPU engine.
bool is_cpu(const dnnl_engine_t &engine) {
    return query_engine_kind(engine) == dnnl_cpu;
}
654 | |
// Returns true when `engine` is a GPU engine.
bool is_gpu(const dnnl_engine_t &engine) {
    return query_engine_kind(engine) == dnnl_gpu;
}
658 | |
// Returns true when `engine` is backed by the DPC++ (SYCL) runtime for its
// respective engine kind.
bool is_sycl_engine(const dnnl_engine_t &engine) {
    if (is_cpu(engine)) return DNNL_CPU_RUNTIME == DNNL_RUNTIME_DPCPP;
    if (is_gpu(engine)) return DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP;
    return false;
}
664 | |
// Returns true when `engine` is a GPU engine backed by the OpenCL runtime.
bool is_opencl_engine(const dnnl_engine_t &engine) {
    if (is_gpu(engine)) return DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL;
    return false;
}
669 | |
// Returns true when `engine` is an NVIDIA GPU, detected by the PCI vendor ID
// reported by the SYCL device. Always false in non-SYCL builds.
bool is_nvidia_gpu(const dnnl_engine_t &engine) {
#ifdef DNNL_WITH_SYCL
    if (!is_gpu(engine)) return false;
    constexpr int nvidia_vendor_id = 0x10DE;
    // Wrap the C handle into the C++ API to use the SYCL interop.
    auto eng = dnnl::engine(engine, true);
    auto device = dnnl::sycl_interop::get_device(eng);
    const auto eng_vendor_id
            = device.get_info<::sycl::info::device::vendor_id>();
    return eng_vendor_id == nvidia_vendor_id;
#endif
    return false;
}
682 | |
// Returns true when `engine` is an AMD GPU, detected by the PCI vendor ID
// reported by the SYCL device. Always false in non-SYCL builds.
bool is_amd_gpu(const dnnl_engine_t &engine) {
#ifdef DNNL_WITH_SYCL
    if (!is_gpu(engine)) return false;
    constexpr int amd_vendor_id = 0x1002;
    // Wrap the C handle into the C++ API to use the SYCL interop.
    auto eng = dnnl::engine(engine, true);
    auto device = dnnl::sycl_interop::get_device(eng);
    const auto eng_vendor_id
            = device.get_info<::sycl::info::device::vendor_id>();
    return eng_vendor_id == amd_vendor_id;
#endif
    return false;
}
695 | |
// Returns true when the GPU `engine` supports f64. Non-GPU engines return
// false; NVIDIA/AMD devices are excluded unconditionally. Detection goes
// through the SYCL aspect/extension or the OpenCL extension string.
bool is_f64_supported(const dnnl_engine_t &engine) {
    if (!is_gpu(engine)) return false;
    if (is_nvidia_gpu(engine) || is_amd_gpu(engine)) return false;
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
    if (is_sycl_engine(engine)) {
        auto eng = dnnl::engine(engine, true);
        auto dev = dnnl::sycl_interop::get_device(eng);
#ifdef DNNL_SYCL_INTEROP_USE_SYCL121
        return dev.has_extension("cl_khr_fp64");
#else
        return dev.has(::sycl::aspect::fp64);
#endif
    }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (is_opencl_engine(engine)) {
        auto eng = dnnl::engine(engine, true);
        cl_device_id dev = dnnl::ocl_interop::get_device(eng);
        // Query the size of the extension string first, then fetch it.
        size_t param_size = 0;
        cl_int err = clGetDeviceInfo(
                dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &param_size);
        if (err != CL_SUCCESS) return false;

        std::string extension_string(param_size, '\0');
        err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, param_size,
                &extension_string[0], &param_size);
        if (err != CL_SUCCESS) return false;

        return extension_string.find("cl_khr_fp64") != std::string::npos;
    }
#endif
    return false;
}
729 | |
#if defined(_WIN32) && !defined(__GNUC__)
#include "windows.h"

// Returns the total physical CPU RAM in bytes (Windows).
static size_t get_cpu_ram_size() {
    MEMORYSTATUSEX s {};
    s.dwLength = sizeof(s);
    GlobalMemoryStatusEx(&s);
    return s.ullTotalPhys;
}
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__QNXNTO__)
#include <unistd.h>
#include <sys/sysctl.h>

// Returns the total physical CPU RAM in bytes (BSD-like systems).
static size_t get_cpu_ram_size() {
#ifdef __APPLE__
    int query_ram[] = {CTL_HW, HW_MEMSIZE};
#else
    int query_ram[] = {CTL_HW, HW_PHYSMEM};
#endif
    int query_ram_len = sizeof(query_ram) / sizeof(*query_ram);
    size_t totalram = 0;
    size_t length = sizeof(totalram);

    sysctl(query_ram, query_ram_len, &totalram, &length, NULL, 0);
    return totalram;
}
#else
#include <sys/sysinfo.h>

// Returns the total physical CPU RAM in bytes (Linux and others).
static size_t get_cpu_ram_size() {
    struct sysinfo s {};
    sysinfo(&s);
    // sysinfo(2): `totalram` is expressed in units of `mem_unit` bytes,
    // so scale it to get the size in bytes.
    return (size_t)s.totalram * s.mem_unit;
}
#endif
765 | |
766 | static size_t get_gpu_ram_size() { |
767 | // XXX: create a tmp engine to query what we need. |
768 | // It will be removed in the future as part of switching back |
769 | // to the global engine. |
770 | engine_t eng_tmp(engine_tgt_kind); |
771 | dnnl::engine eng(eng_tmp, true); |
772 | if (eng.get_kind() != dnnl::engine::kind::gpu) return 0; |
773 | |
774 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
775 | cl_int status = CL_SUCCESS; |
776 | // Get single device attached to the engine. |
777 | engine_t engine_tgt(engine_tgt_kind); |
778 | cl_device_id ocl_device = dnnl::ocl_interop::get_device(eng); |
779 | |
780 | cl_ulong ram_size = 0; |
781 | status = clGetDeviceInfo(ocl_device, CL_DEVICE_GLOBAL_MEM_SIZE, |
782 | sizeof(cl_ulong), &ram_size, nullptr); |
783 | if (status == CL_SUCCESS) return (size_t)ram_size; |
784 | #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP |
785 | auto sycl_dev = dnnl::sycl_interop::get_device(eng); |
786 | return (size_t)sycl_dev.get_info<::sycl::info::device::global_mem_size>(); |
787 | #endif |
788 | return 0; |
789 | } |
790 | |
791 | struct check_mem_size_args_t { |
792 | check_mem_size_args_t(const_dnnl_primitive_desc_t pd, bool want_input, |
793 | bool add_ref_size = false) |
794 | : pd(pd) |
795 | , want_input(want_input) |
796 | , add_ref_size(add_ref_size) |
797 | , is_scratchpad(false) |
798 | , total_size_device(0) |
799 | , total_size_cpu(0) |
800 | , scratchpad_size(0) {} |
801 | |
802 | // Input args. |
803 | const_dnnl_primitive_desc_t pd; |
804 | bool want_input; |
805 | bool add_ref_size; |
806 | bool is_scratchpad; |
807 | |
808 | // Output args. |
809 | size_t total_size_device; |
810 | size_t total_size_cpu; |
811 | size_t scratchpad_size; |
812 | }; |
813 | |
// Validates that the accumulated memory requirements fit into a fraction of
// the device and CPU RAM. Sets res->state to SKIPPED (NOT_ENOUGH_RAM) when
// the problem is too big, or FAILED when the library requests a scratchpad
// that dominates the total CPU footprint. Returns FAIL only for FAILED.
static int check_total_size(
        const check_mem_size_args_t &check_mem_size_args, res_t *res) {
    // RAM capacities are queried once and cached across calls.
    static uint64_t cpu_device_capacity = get_cpu_ram_size();
    static uint64_t gpu_device_capacity = get_gpu_ram_size();

    const uint64_t device_max_capacity
            = is_cpu() ? cpu_device_capacity : gpu_device_capacity;
    const uint64_t cpu_max_capacity = cpu_device_capacity;

    // 0.75f is taken randomly and is subject to change in future.
    const double capacity_factor = 0.75;
    const double benchdnn_device_limit = capacity_factor * device_max_capacity;
    const double benchdnn_cpu_limit = capacity_factor * cpu_max_capacity;
    assert(benchdnn_device_limit > 0 && benchdnn_cpu_limit > 0);

    const bool fits_device_ram = is_gpu()
            ? (check_mem_size_args.total_size_device <= benchdnn_device_limit)
            : true;
    if (!fits_device_ram) {
        BENCHDNN_PRINT(
                2, "%s\n", "benchdnn: not enough device RAM for a problem.");
        res->state = SKIPPED;
        res->reason = NOT_ENOUGH_RAM;
    }

    auto GB = [](double bytes) { return bytes / powf(2, 30); };

    if (is_gpu()) {
        BENCHDNN_PRINT((!fits_device_ram ? 2 : 6),
                "Requested: %g GB, benchdnn device limit: %g GB, device RAM "
                "capacity: %g GB\n",
                GB(check_mem_size_args.total_size_device),
                GB(benchdnn_device_limit), GB(gpu_device_capacity));
    }

    // On CPU targets "device" memory is host memory as well.
    size_t total_size_cpu = check_mem_size_args.total_size_cpu;
    if (is_cpu()) total_size_cpu += check_mem_size_args.total_size_device;
    bool fits_cpu_ram = total_size_cpu <= benchdnn_cpu_limit;

    if (!fits_cpu_ram) {
        BENCHDNN_PRINT(
                2, "%s\n", "benchdnn: not enough CPU RAM for a problem.");
        // Try to catch a huge scratchpad size requested by the library.
        // Use following logic:
        //     scratch_size
        // ---------------------- <= 0.75 (pre-defined threshold).
        // io_size + scratch_size
        //
        // 0.75 value supposed to be experimental and might be adjusted.
        static constexpr float scratch_trh = 0.75f;
        if (check_mem_size_args.scratchpad_size
                > scratch_trh * total_size_cpu) {
            BENCHDNN_PRINT(2, "%s `%ld` %s `%ld`.\n",
                    "benchdnn: CPU scratchpad size",
                    (long)check_mem_size_args.scratchpad_size,
                    "exceeded a given threshold",
                    (long)(scratch_trh * total_size_cpu));
            res->state = FAILED;
        } else {
            res->state = SKIPPED;
        }
        res->reason = NOT_ENOUGH_RAM;
    }

    BENCHDNN_PRINT((!fits_cpu_ram ? 2 : 6),
            "Requested: %g GB, benchdnn CPU limit: %g GB, CPU RAM capacity: %g "
            "GB\n",
            GB(total_size_cpu), GB(benchdnn_cpu_limit),
            GB(cpu_device_capacity));

    return res->state == FAILED ? FAIL : OK;
}
886 | |
887 | static void add_md_size(const_dnnl_memory_desc_t md, |
888 | check_mem_size_args_t &check_mem_size_args) { |
889 | const auto mem_size = dnnl_memory_desc_get_size(md); |
890 | // Runtime mem size is not defined. |
891 | if (mem_size == 0 || mem_size == DNNL_RUNTIME_SIZE_VAL) return; |
892 | |
893 | check_mem_size_args.total_size_device += mem_size; // Original memory size. |
894 | if (!check_mem_size_args.add_ref_size) return; |
895 | |
896 | // Reference memories are always tag::abx fp32, hence need re-creating |
897 | // memory descriptor and take its size. |
898 | auto ref_md = dnn_mem_t::init_md( |
899 | query_md_ndims(md), query_md_dims(md), dnnl_f32, tag::abx); |
900 | const auto ref_md_size = dnnl_memory_desc_get_size(ref_md); |
901 | |
902 | // Correctness pass allocates additional tag::abx f32 memory. |
903 | bool compare_mem_factor = !check_mem_size_args.want_input |
904 | && check_mem_size_args.add_ref_size; |
905 | |
906 | // All memory is mapped once it is created and unmapped only before |
907 | // primitive execution. Device memory requires additional buffer for mapped |
908 | // memory. |
909 | // XXX: In DPC++ build oneDNN uses USM memory, which shouldn't require an |
910 | // additional buffer, so map factor should be equal to 0 for DPC++. |
911 | // However due to a driver issue oneDNN pretends that shared USM is not |
912 | // accessible on the host, hence map will allocate an extra memory. |
913 | check_mem_size_args.total_size_cpu += !is_cpu() * mem_size; // Map factor. |
914 | if (check_mem_size_args.is_scratchpad) { |
915 | check_mem_size_args.scratchpad_size += mem_size; |
916 | } else { |
917 | check_mem_size_args.total_size_cpu += ref_md_size; // Reference memory. |
918 | // Comparison memory. |
919 | check_mem_size_args.total_size_cpu += compare_mem_factor * ref_md_size; |
920 | } |
921 | } |
922 | |
923 | bool is_fwd_prop_kind(dnnl_prop_kind_t prop_kind) { |
924 | return prop_kind == dnnl_forward_training |
925 | || prop_kind == dnnl_forward_inference |
926 | || prop_kind == dnnl_prop_kind_undef; |
927 | } |
928 | |
929 | static void get_memory_bytes(check_mem_size_args_t &check_mem_size_args) { |
930 | auto const_pd = check_mem_size_args.pd; |
931 | const int n_idx = check_mem_size_args.want_input |
932 | ? query_n_inputs(const_pd) |
933 | : query_n_outputs(const_pd); |
934 | const auto prop_kind = query_prop_kind(const_pd); |
935 | const bool is_fwd = is_fwd_prop_kind(prop_kind); |
936 | |
937 | #define MD(name) dnnl_query_##name##_md |
938 | std::vector<dnnl_query_t> query_fwd_in_mds {MD(src), MD(weights)}; |
939 | std::vector<dnnl_query_t> query_fwd_out_mds {MD(dst), MD(workspace)}; |
940 | |
941 | std::vector<dnnl_query_t> query_bwd_in_mds { |
942 | MD(src), MD(weights), MD(dst), MD(diff_dst), MD(workspace)}; |
943 | std::vector<dnnl_query_t> query_bwd_out_mds { |
944 | MD(diff_src), MD(diff_weights)}; |
945 | #undef MD |
946 | |
947 | const auto &query_in_mds = is_fwd ? query_fwd_in_mds : query_bwd_in_mds; |
948 | const auto &query_out_mds = is_fwd ? query_fwd_out_mds : query_bwd_out_mds; |
949 | const auto &query_mds |
950 | = check_mem_size_args.want_input ? query_in_mds : query_out_mds; |
951 | |
952 | for_(const auto query : query_mds) |
953 | for (int idx = 0; idx < n_idx; ++idx) { |
954 | const auto &md = query_md(const_pd, query, idx); |
955 | add_md_size(md, check_mem_size_args); |
956 | } |
957 | } |
958 | |
959 | int check_mem_size(const_dnnl_memory_desc_t md, res_t *res) { |
960 | if (!mem_check) return OK; |
961 | |
962 | check_mem_size_args_t check_mem_size_args(nullptr, false, false); |
963 | check_mem_size_args.total_size_device = dnnl_memory_desc_get_size(md); |
964 | |
965 | return check_total_size(check_mem_size_args, res); |
966 | } |
967 | |
968 | int check_mem_size(const_dnnl_primitive_desc_t const_pd, res_t *res) { |
969 | if (!mem_check) return OK; |
970 | |
971 | // Get input sizes. |
972 | check_mem_size_args_t check_mem_size_args(const_pd, /* want_input = */ true, |
973 | /* add_ref_size = */ true); |
974 | get_memory_bytes(check_mem_size_args); |
975 | |
976 | // Get scratchpad size. Treat it as `want_input=true` to avoid comparison |
977 | // factor count. Since scratchpad modes are mutual excluded, it takes sizes |
978 | // of both modes since either of them will report 0 size. |
979 | check_mem_size_args.is_scratchpad = true; |
980 | const auto &scratchpad_md = query_md(const_pd, DNNL_ARG_SCRATCHPAD); |
981 | add_md_size(scratchpad_md, check_mem_size_args); |
982 | check_mem_size_args.is_scratchpad = false; |
983 | check_mem_size_args.total_size_device += query_mem_consumption(const_pd); |
984 | check_mem_size_args.scratchpad_size += query_mem_consumption(const_pd); |
985 | |
986 | // Get output sizes. |
987 | check_mem_size_args.want_input = false; |
988 | get_memory_bytes(check_mem_size_args); |
989 | |
990 | return check_total_size(check_mem_size_args, res); |
991 | } |
992 | |
993 | int (const_dnnl_primitive_desc_t const_pd, res_t *res) { |
994 | check_mem_size_args_t check_mem_in_size_args( |
995 | const_pd, /* want_input = */ true); |
996 | get_memory_bytes(check_mem_in_size_args); // Get input bytes. |
997 | check_mem_size_args_t check_mem_out_size_args( |
998 | const_pd, /* want_input = */ false); |
999 | get_memory_bytes(check_mem_out_size_args); // Get output bytes. |
1000 | |
1001 | // Update read bytes with dst bytes in case of sum post-op. |
1002 | auto const_attr_po = query_post_ops(const_pd); |
1003 | auto po_len = dnnl_post_ops_len(const_attr_po); |
1004 | for (int idx = 0; idx < po_len; ++idx) { |
1005 | const auto kind = dnnl_post_ops_get_kind(const_attr_po, idx); |
1006 | if (kind == dnnl_sum) { |
1007 | const auto &dst_md = query_md(const_pd, DNNL_ARG_DST); |
1008 | add_md_size(dst_md, check_mem_in_size_args); |
1009 | } |
1010 | } |
1011 | |
1012 | res->ibytes = check_mem_in_size_args.total_size_device; |
1013 | res->obytes = check_mem_out_size_args.total_size_device; |
1014 | |
1015 | return OK; |
1016 | } |
1017 | |
1018 | memory_kind_ext_t str2memory_kind(const char *str) { |
1019 | #define CASE(param) \ |
1020 | if (!strcasecmp(#param, str)) return memory_kind_ext_t::param |
1021 | |
1022 | CASE(usm); |
1023 | CASE(buffer); |
1024 | CASE(usm_device); |
1025 | CASE(usm_shared); |
1026 | |
1027 | #undef CASE |
1028 | |
1029 | assert(!"not expected" ); |
1030 | return memory_kind_ext_t::usm; |
1031 | } |
1032 | |
// Prints troubleshooting hints when CPU engine creation fails.
// Only relevant for SYCL CPU runtime where engine creation depends on the
// device filter, TBB, and an OpenCL CPU runtime being present; a no-op in
// all other builds.
static void maybe_print_cpu_engine_error_message() {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
    fprintf(stderr,
            "ERROR: can't create CPU engine. Possible reasons for this error:\n"
            "- Incorrect SYCL_DEVICE_FILTER. The filter must be either unset "
            "or include 'opencl:cpu' devices.\n"
            "- Missing TBB library which is required for OpenCL CPU runtime. "
            "Check that TBB library is available in the system.\n"
            "- Missing OpenCL CPU runtime or other issues with OpenCL CPU "
            "runtime. Check that output from `sycl-ls` or `clinfo -l` commands "
            "include any CPU devices.\n" );
#endif
}
1046 | |
1047 | engine_t::engine_t(dnnl_engine_kind_t engine_kind) : is_owner_(true) { |
1048 | enable_gpu_profiling(); |
1049 | size_t idx = engine_kind == dnnl_cpu ? 0 : engine_index; |
1050 | dnnl_status_t status = dnnl_engine_create(&engine_, engine_kind, idx); |
1051 | if (engine_kind == dnnl_cpu && status != dnnl_success) |
1052 | maybe_print_cpu_engine_error_message(); |
1053 | DNN_SAFE_V(status); |
1054 | } |
1055 | |
// Wraps an existing engine without taking ownership; the destructor will not
// destroy it (`is_owner_` is false).
engine_t::engine_t(dnnl_engine_t engine) : engine_(engine), is_owner_(false) {}
1057 | |
// Copy constructor. For a non-owning `other` it simply shares the handle;
// for an owning one it creates a fresh engine on the same device/context so
// both copies can be destroyed independently.
engine_t::engine_t(const engine_t &other) {
    is_owner_ = other.is_owner_;

    // Borrowed handle: share it, nothing to create or destroy.
    if (!is_owner_) {
        engine_ = other.engine_;
        return;
    }

    dnnl_engine_kind_t engine_kind;
    DNN_SAFE_V(dnnl_engine_get_kind(other.engine_, &engine_kind));

    if (engine_kind == dnnl_cpu) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
        // Recreate on the same SYCL device/context as the source engine.
        void *dev;
        void *ctx;
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_sycl_interop_engine_create(&engine_, dev, ctx));
#else
        // Native CPU runtime: a plain CPU engine is always index 0.
        DNN_SAFE_V(dnnl_engine_create(&engine_, dnnl_cpu, 0));
#endif
    } else if (engine_kind == dnnl_gpu) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        // Recreate on the same OpenCL device/context as the source engine.
        cl_device_id dev;
        cl_context ctx;
        DNN_SAFE_V(dnnl_ocl_interop_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_ocl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_ocl_interop_engine_create(&engine_, dev, ctx));
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
        // Recreate on the same SYCL device/context as the source engine.
        void *dev;
        void *ctx;
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_sycl_interop_engine_create(&engine_, dev, ctx));
#endif
    } else {
        assert(!"unsupported engine kind" );
    }
}
1097 | |
// Destroys the engine only if this object created it; borrowed handles
// (non-owning copies) are left alive for their real owner.
engine_t::~engine_t() {
    if (is_owner_) DNN_SAFE_V(dnnl_engine_destroy(engine_));
}
1101 | |
// Creates a stream on `engine`. In threadpool builds, a CPU stream is bound
// to the threadpool passed via `interop_obj` (falling back to the testing
// threadpool when null); otherwise a default-flag stream is created.
stream_t::stream_t(dnnl_engine_t engine, void *interop_obj) {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    if (is_cpu(engine)) {
        // `interop_obj` is an opaque pointer; reinterpret as threadpool iface.
        auto tp = static_cast<dnnl::threadpool_interop::threadpool_iface *>(
                interop_obj);
        if (tp == nullptr) tp = dnnl::testing::get_threadpool();
        SAFE_V(dnnl_threadpool_interop_stream_create(&stream_, engine, tp));
        return;
    }
#endif
    DNN_SAFE_V(dnnl_stream_create(&stream_, engine, dnnl_stream_default_flags));
}
1114 | |
// Streams are always owned by this wrapper; unconditionally destroy.
stream_t::~stream_t() {
    DNN_SAFE_V(dnnl_stream_destroy(stream_));
}
1118 | |
// Returns the weights scale factor the library expects for s8s8 reorders.
// CPU ISAs without native s8s8 support require pre-scaled weights; all other
// configurations use a neutral factor of 1.
float reorder_rescale_factor() {
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    if (is_cpu(get_test_engine()))
        return dnnl::impl::cpu::platform::s8s8_weights_scale_factor();
#endif
    return 1.f;
}
1127 | |
1128 | dims_t md2dims(const dnnl_memory_desc_t &md) { |
1129 | auto ndims = query_md_ndims(md); |
1130 | dims_t dims(ndims, 0); |
1131 | for (int d = 0; d < ndims; ++d) |
1132 | dims[d] = query_md_dims(md)[d]; |
1133 | return dims; |
1134 | } |
1135 | |
1136 | dnnl_data_type_t deduce_cfg_data_type( |
1137 | dnnl_data_type_t in_dt, const attr_t &attr, data_kind_t dk) { |
1138 | dnnl_data_type_t dt_ = in_dt; |
1139 | |
1140 | if ((dk == SRC || dk == WEI) && dt_ == dnnl_f32) { |
1141 | // Update data type based on fpmath-mode attribute |
1142 | switch (attr.fpmath_mode) { |
1143 | case dnnl_fpmath_mode_strict: break; |
1144 | case dnnl_fpmath_mode_bf16: dt_ = dnnl_bf16; break; |
1145 | case dnnl_fpmath_mode_tf32: dt_ = dnnl_bf16; break; |
1146 | default: assert(!"unsupported_fpmath_mode" ); SAFE_V(CRIT); |
1147 | } |
1148 | } else if (dk == DST) { |
1149 | // Sum post-op defines the type of filling destination. |
1150 | const int sum_idx = attr.post_ops.find(attr_t::post_ops_t::SUM); |
1151 | if (sum_idx >= 0) { |
1152 | auto sum_dt = attr.post_ops.entry[sum_idx].sum.dt; |
1153 | if (sum_dt != dnnl_data_type_undef) dt_ = sum_dt; |
1154 | } |
1155 | } |
1156 | |
1157 | return dt_; |
1158 | } |
1159 | |