/*******************************************************************************
* Copyright 2017-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <cstring>

#include <stdlib.h>

#include "oneapi/dnnl/dnnl.h"

// TODO: refactor the driver to avoid using extra flags of a memory descriptor.
#include "common/memory_desc.hpp"

#include "utils/parallel.hpp"

#include "dnn_types.hpp"
#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"

#include "reorder.hpp"

namespace reorder {

// Filling for integers is different due to the problematic int -> float
// conversion, and it doesn't require many different points to be tested.
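// E.g., for an s8 source the SRC stream cycles through {127, -128, 0, 16};
// the DST filling (used only as the sum post-op input) walks its own table
// with a decorrelated index, (idx * 5 / 4) % 4, so dst[i] rarely mirrors
// src[i].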
int fill_memory_int(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto conf = prb->get_conf(kind);

    for (int64_t idx = 0; idx < mem_fp.nelems(); ++idx) {
        const float gen[4] = {
                conf->max, // saturate to max of output data type
                conf->min, // saturate to min of output data type
                0,
                16,
        };

        const int64_t rng = kind == SRC ? (idx % 4) : ((idx * 5 / 4) % 4);
        mem_fp.set_elem(
                idx, round_to_nearest_representable(conf->dt, gen[rng]));
    }

    SAFE(mem_dt.reorder(mem_fp), WARN);
    return OK;
}

int fill_memory_fp(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto conf = prb->get_conf(kind);
    const auto &src_scales = prb->attr.scales.get(DNNL_ARG_FROM);
    const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_TO);
    const int src_scale_mask = attr_t::get_default_mask(src_scales.policy);
    const int dst_scale_mask = attr_t::get_default_mask(dst_scales.policy);

    for (int64_t idx = 0; idx < mem_fp.nelems(); ++idx) {
        float src_scale = 1.f, dst_scale = 1.f;
        if (!src_scales.is_def()) {
            int64_t src_mask_idx = mem_fp.get_scale_idx(idx, src_scale_mask);
            src_scale = prb->src_scales[src_mask_idx];
        }
        if (!dst_scales.is_def()) {
            int64_t dst_mask_idx = mem_fp.get_scale_idx(idx, dst_scale_mask);
            dst_scale = prb->dst_scales[dst_mask_idx];
        }
        const float scale = src_scale / dst_scale;

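        // Src scales are applied as multiplication and dst scales as
        // division, so the effective factor is src_scale / dst_scale (the
        // `scale` above); the values below are chosen relative to it to hit
        // saturation, rounding, and exact-multiplication corner cases.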
        const float gen[7] = {
                conf->max, // saturate to max of output data type
                conf->min, // saturate to min of output data type
                1.6f, // rounding check
                0.2f, // saturate to 0
                1.f / scale, // exact multiplication check
                2.f,
                scale,
        };

        const int64_t rng = kind == SRC ? (idx % 7) : ((idx * 8 / 7) % 7);
        mem_fp.set_elem(
                idx, round_to_nearest_representable(conf->dt, gen[rng]));
    }

    SAFE(mem_dt.reorder(mem_fp), WARN);
    return OK;
}

int fill_memory(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto dt = kind == SRC ? prb->sdt : prb->ddt;
    if (is_integral_dt(dt)) return fill_memory_int(prb, kind, mem_dt, mem_fp);
    return fill_memory_fp(prb, kind, mem_dt, mem_fp);
}

int compare_compensation(const prb_t *prb, dnn_mem_t &mem_s8_comp_ref,
        dnn_mem_t &mem_zp_comp_ref, dnn_mem_t &mem_got, res_t *res) {
    // Note: the following check relies on certain assumptions that hold on
    // CPU but may not hold on GPU. In addition, it is prohibited to work with
    // raw pointers directly for the buffer type of memory.
    if (!is_cpu(get_test_engine())) return FAIL;

    const auto padded_nelems = mem_got.nelems(true);
    // Note: internally the offset is aligned on 4, otherwise it's UB.
    size_t first_comp_offset = div_up(padded_nelems, 4) * 4;
    int *comp_handle
            = reinterpret_cast<int *>((char *)mem_got + first_comp_offset);

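    // Buffer layout assumed by this check (compensation sections exist only
    // when the corresponding flag is set):
    //   [ padded dst data | s8s8 compensation | zero-point compensation ]
    //                      ^ `comp_handle` starts here and is advanced past
    //                        each section as it gets compared below.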
    const auto cmp_compensation = [&](const dnn_mem_t &mem_ref, int comp_mask) {
        // Idea behind this check:
        // Using knowledge from the library where `comp_handle` starts, and
        // that the memory utilizes blocking over OC and G, if present, we
        // wrap the piece of memory described by the shortened tag coming
        // from prb into a separate memory and reorder it to plain, so the
        // comparison is a straight comparison of values in the native plain
        // layout.
        auto comp_md = dnn_mem_t::init_md(mem_ref.ndims(), mem_ref.dims(),
                mem_ref.dt(), trim_tag_by_mask(prb->dtag, comp_mask));
        dnn_mem_t comp_m(comp_md, mem_ref.engine(), {false, comp_handle});

        compare::compare_t cmp;
        cmp.set_zero_trust_percent(100.f); // No sense in zero trust test.
        int status = cmp.compare(mem_ref, comp_m, attr_t(), res);

        // Shift the original compensation pointer for the next compensation.
        comp_handle += comp_m.nelems(true);
        return status;
    };

    if (mem_s8_comp_ref.ndims())
        SAFE(cmp_compensation(mem_s8_comp_ref,
                     prb->get_compensation_mask(FLAG_S8S8_COMP)),
                WARN);
    if (mem_zp_comp_ref.ndims())
        SAFE(cmp_compensation(
                     mem_zp_comp_ref, prb->get_compensation_mask(FLAG_ZP_COMP)),
                WARN);

    return res->state == FAILED ? FAIL : OK;
}

dnnl_status_t init_pd(init_pd_args_t<prb_t> &init_pd_args) {
    const prb_t *prb = init_pd_args.prb;

    auto dims = prb->dims;
    for (int d = 0; d < prb->ndims; ++d)
        if (prb->runtime_dim_mask & (1 << d)) dims[d] = DNNL_RUNTIME_DIM_VAL;

    auto src_d
            = dnn_mem_t::init_md(prb->ndims, dims.data(), prb->sdt, prb->stag);
    auto dst_d
            = dnn_mem_t::init_md(prb->ndims, dims.data(), prb->ddt, prb->dtag);

    // Prepare and assign extra for dst_md.
    auto &extra = static_cast<dnnl_memory_desc_t>(dst_d)->extra;
    extra.flags = dnnl::impl::memory_extra_flags::none;
    if (prb->is_reorder_with_compensation(FLAG_ANY)) {
        for (const auto &i_oflag : prb->oflag) {
            if (i_oflag.first & FLAG_S8S8_COMP) {
                extra.flags |= dnnl::impl::memory_extra_flags::
                        compensation_conv_s8s8;
                extra.compensation_mask = i_oflag.second;

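                // A scale adjustment may accompany s8s8 compensation; on x86
                // CPUs without VNNI the library typically uses a 0.5f factor
                // to avoid overflow in the s8s8 multiply-accumulate path (the
                // exact value comes from reorder_rescale_factor()).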
                const float s8_scale_factor = reorder_rescale_factor();
                const bool need_rescale = s8_scale_factor != 1.f;
                if (need_rescale) {
                    extra.flags |= dnnl::impl::memory_extra_flags::scale_adjust;
                    extra.scale_adjust = s8_scale_factor;
                }
            }
            if (i_oflag.first & FLAG_ZP_COMP) {
                extra.flags |= dnnl::impl::memory_extra_flags::
                        compensation_conv_asymmetric_src;
                extra.asymm_compensation_mask = i_oflag.second;
            }
        }
    }

    auto src_engine = init_pd_args.engine;
    auto dst_engine = init_pd_args.engine;
    if (is_gpu()) {
        switch (prb->cross_engine) {
            case CPU2GPU: src_engine = get_cpu_engine(); break;
            case GPU2CPU: dst_engine = get_cpu_engine(); break;
            default: break;
        }
    }

    auto dnnl_attr = make_benchdnn_dnnl_wrapper(
            create_dnnl_attr(prb->attr, attr_args_t()));

    init_pd_args.is_iterator_supported = false;
    return dnnl_reorder_primitive_desc_create(
            &init_pd_args.pd, src_d, src_engine, dst_d, dst_engine, dnnl_attr);
}

void skip_unimplemented_prb(const prb_t *prb, res_t *res) {
    const auto sdt = prb->sdt;
    const auto ddt = prb->ddt;
    skip_unimplemented_data_type({sdt, ddt}, prb->dir, res);
    skip_unimplemented_sum_po(prb->attr, res);

    bool scales_ok = true;
#if !defined(DNNL_X64) || DNNL_X64 == 0
    {
        // reference reorder supports only a subset of scale policies
        const std::vector<policy_t> supported_policy = {policy_t::COMMON,
                policy_t::PER_DIM_0, policy_t::PER_DIM_1, policy_t::PER_DIM_01};

        // Accumulate across both SRC and DST so that an unsupported policy
        // on either side is caught.
        for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) {
            scales_ok = scales_ok
                    && std::any_of(supported_policy.cbegin(),
                            supported_policy.cend(), [&](const policy_t policy) {
                                return prb->attr.scales.get(arg).policy
                                        == policy;
                            });
        }
    }
#endif
    if (!scales_ok) {
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
    }

    if (prb->is_reorder_with_compensation(FLAG_ANY)) {
        // Compensation is supported only for the s8 dst data type.
        const bool dt_ok = ddt == dnnl_s8;
        // Compensation can be paired with dst scale only.
        const bool attr_ok
                = prb->attr.zero_points.is_def() && prb->attr.post_ops.is_def();
        // Compensation does not support runtime dims.
        const bool rt_ok = prb->runtime_dim_mask == 0;

        // Compensation and scales masks should coincide.
        const auto comp_mask = prb->get_compensation_mask(FLAG_ANY);
        bool masks_ok = true;
        for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) {
            const auto &e = prb->attr.scales.get(arg);
            if (!e.is_def()) {
                int e_mask = attr_t::get_default_mask(e.policy);
                masks_ok = masks_ok && e_mask == comp_mask;
            }
        }

        if (!dt_ok || !attr_ok || !rt_ok || !masks_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }

#if !defined(DNNL_X64) || DNNL_X64 == 0
        // Simple reorder doesn't provide decent coverage for compensated
        // cases. Shut them down unconditionally by default.
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
#endif
    }

    // Destination scale is not supported for runtime dimensions since the
    // implementation logic inverts dst scales and requires a scratchpad for
    // `mask > 0` cases, which is impossible to estimate with runtime dims.
    const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_DST);
    if (!dst_scales.is_def() && attr_t::get_default_mask(dst_scales.policy) > 0
            && prb->runtime_dim_mask != 0) {
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
    }

    // Compensation is supported through the jit reorder only, but the jit
    // reorder doesn't support different masks for source and destination
    // scales.
    const auto &src_scales = prb->attr.scales.get(DNNL_ARG_SRC);
    if (!src_scales.is_def() && !dst_scales.is_def()) {
        if (attr_t::get_default_mask(src_scales.policy)
                        != attr_t::get_default_mask(dst_scales.policy)
                && prb->is_reorder_with_compensation(FLAG_ANY)) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }

    if (is_cpu()) {
        // CPU reorder doesn't support bf16<-->s32 combinations.
        const bool s32_src_ok = IMPLICATION(sdt == dnnl_s32, ddt != dnnl_bf16);
        const bool s32_dst_ok = IMPLICATION(ddt == dnnl_s32, sdt != dnnl_bf16);
        if (!s32_src_ok || !s32_dst_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }

        // CPU f16 reorders only support f16<->f32 combinations.
        const bool f16_src_ok = IMPLICATION(
                sdt == dnnl_f16, ddt == dnnl_f16 || ddt == dnnl_f32);
        const bool f16_dst_ok = IMPLICATION(
                ddt == dnnl_f16, sdt == dnnl_f16 || sdt == dnnl_f32);
        if (!f16_src_ok || !f16_dst_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }

    if (is_gpu()) {
        // GPU does not support run-time dims.
        // Reorders with compensation are not supported by design: zero-point
        // compensation is applied directly in compute kernels, and s8s8
        // compensation is not needed since s8s8 instructions are available
        // in HW.
        if (prb->runtime_dim_mask != 0
                || prb->is_reorder_with_compensation(FLAG_ANY)) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }
}

void skip_invalid_prb(const prb_t *prb, res_t *res) {
    // No sense in cross-engine reorders when one of the devices is switched
    // off.
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE
    auto cross_engine = prb->cross_engine;
    if (cross_engine == CPU2GPU || cross_engine == GPU2CPU)
        res->state = SKIPPED, res->reason = INVALID_CASE;
#endif

    // Zero-points can't be used with the sum post-op.
    if (!prb->attr.zero_points.is_def(DNNL_ARG_DST)
            && prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }

    // Only integral data types can have zero points.
    const bool is_src_zp_ok = is_integral_dt(prb->sdt)
            || prb->attr.zero_points.is_def(DNNL_ARG_SRC);
    const bool is_dst_zp_ok = is_integral_dt(prb->ddt)
            || prb->attr.zero_points.is_def(DNNL_ARG_DST);
    if (!(is_src_zp_ok && is_dst_zp_ok)) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }
}

void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
        const args_t &ref_args) {
    const bool has_s32 = prb->sdt == dnnl_s32 || prb->ddt == dnnl_s32;
    const bool has_s8 = prb->sdt == dnnl_s8 || prb->ddt == dnnl_s8;
    const bool has_u8 = prb->sdt == dnnl_u8 || prb->ddt == dnnl_u8;
    // For u8, 4/7 of inputs become 0; for s32/s8, 3/7 of inputs become 0.
    const float zero_trust_percent
            = has_u8 ? 58.f : (has_s32 || has_s8) ? 43.f : 30.f;
    cmp.set_zero_trust_percent(zero_trust_percent);
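    // (4/7 ~= 57.1% and 3/7 ~= 42.9%, so the 58% / 43% thresholds above leave
    // a small margin over the expected share of zeros.)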

    // Additional check to avoid a false-positive result from the f32->s32
    // conversion with a sum post-op on GPU, which happens when two max_dt
    // values are summed together.
    const auto reorder_add_check
            = [&](const compare::compare_t::driver_check_func_args_t &args) {
                  if (args.dt == dnnl_s32 && args.got == max_dt(args.dt)
                          && is_gpu()) {
                      // 128.f = float(INT_MAX) - BENCHDNN_S32_TO_F32_SAT_CONST;
                      return args.diff == 128.f;
                  }
                  return false;
              };
    cmp.set_driver_check_function(reorder_add_check);
}

int doit(const prb_t *prb, res_t *res) {
    if (bench_mode == LIST) return res->state = LISTED, OK;

    benchdnn_dnnl_wrapper_t<dnnl_primitive_t> prim;
    SAFE(init_prim(prb->ctx_init, prim, init_pd, prb, res), WARN);
    if (res->state == SKIPPED || res->state == UNIMPLEMENTED) return OK;

    auto const_pd = query_pd(prim);

    benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> src_md {}, dst_md {};
    if (prb->runtime_dim_mask != 0) {
        // re-create memory descriptors with defined dims
        src_md = dnn_mem_t::init_md(
                prb->ndims, prb->dims.data(), prb->sdt, prb->stag);
        dst_md = dnn_mem_t::init_md(
                prb->ndims, prb->dims.data(), prb->ddt, prb->dtag);
    } else {
        src_md = clone_md(query_md(const_pd, DNNL_ARG_SRC));
        dst_md = clone_md(query_md(const_pd, DNNL_ARG_DST));
    }
    const auto &scratchpad_md = query_md(const_pd, DNNL_ARG_SCRATCHPAD);

    dnnl_engine_t src_engine
            = query_engine(const_pd, dnnl_query_reorder_src_engine);
    dnnl_engine_t dst_engine
            = query_engine(const_pd, dnnl_query_reorder_dst_engine);
    const auto &ref_engine = get_cpu_engine();

    dnn_mem_t src_fp(src_md, dnnl_f32, tag::abx, ref_engine);
    dnn_mem_t src_dt(src_md, src_engine);

    dnn_mem_t scratchpad_dt(scratchpad_md, src_engine);

    dnn_mem_t dst_fp(dst_md, dnnl_f32, tag::abx, ref_engine);
    dnn_mem_t dst_dt(dst_md, dst_engine);

    SAFE(fill_memory(prb, SRC, src_dt, src_fp), WARN);

    const bool has_sum
            = prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) >= 0;
    if (has_sum) { SAFE(fill_memory(prb, DST, dst_dt, dst_fp), WARN); }

    const int src_mask = attr_t::get_default_mask(
            prb->attr.scales.get(DNNL_ARG_SRC).policy);
    const int dst_mask = attr_t::get_default_mask(
            prb->attr.scales.get(DNNL_ARG_DST).policy);
    dnn_mem_t src_scales, dst_scales;
    dnn_mem_t src_zero_points_m, dst_zero_points_m;

    maybe_prepare_runtime_scales(src_scales, prb->attr.scales.get(DNNL_ARG_SRC),
            prb->nelems(src_mask), prb->src_scales);
    maybe_prepare_runtime_scales(dst_scales, prb->attr.scales.get(DNNL_ARG_DST),
            prb->nelems(dst_mask), prb->dst_scales);
    maybe_prepare_runtime_zero_points(
            src_zero_points_m, prb->attr, DNNL_ARG_SRC, 1, prb->src_zp);
    maybe_prepare_runtime_zero_points(
            dst_zero_points_m, prb->attr, DNNL_ARG_DST, 1, prb->dst_zp);
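    // Note: scale and zero-point values are not baked into the primitive;
    // they are supplied at execution time as separate memories bound to the
    // DNNL_ARG_ATTR_SCALES / DNNL_ARG_ATTR_ZERO_POINTS arguments below.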

    args_t args, ref_args;

    args.set(DNNL_ARG_FROM, src_dt);
    args.set(DNNL_ARG_TO, dst_dt);
    args.set(DNNL_ARG_SCRATCHPAD, scratchpad_dt);
    args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales);
    args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales);
    args.set(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zero_points_m);
    args.set(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zero_points_m);

    SAFE(execute_and_wait(prim, args, res), WARN);

    if (is_bench_mode(CORR)) {
        const auto assign_comp_mem = [&](dnn_mem_t &m, flag_bit_t flag) {
            if (prb->is_reorder_with_compensation(flag)) {
                dims_t dims = prb->get_compensation_dims(flag);
                int ndims = static_cast<int>(dims.size());
                auto md = dnn_mem_t::init_md(
                        ndims, dims.data(), dnnl_s32, tag::abx);
                m = dnn_mem_t(md, ref_engine);
            }
            return OK;
        };

        dnn_mem_t dst_s8_comp_ref, dst_zp_comp_ref;
        assign_comp_mem(dst_s8_comp_ref, FLAG_S8S8_COMP);
        assign_comp_mem(dst_zp_comp_ref, FLAG_ZP_COMP);

        ref_args.set(DNNL_ARG_FROM, src_fp);
        ref_args.set(DNNL_ARG_TO, dst_fp);
        ref_args.set(DNNL_ARG_SRC_1, dst_s8_comp_ref); // Additional input
        ref_args.set(DNNL_ARG_SRC_2, dst_zp_comp_ref); // Additional input
        ref_args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales);
        ref_args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales);

        // Remove the extra desc so that reorders with compensation get a
        // proper reorder from blocked layout to plain for comparison.
        dnnl::impl::memory_extra_desc_t empty_extra {};
        const auto orig_dst_extra = dst_dt.md_->extra;
        dst_dt.md_->extra = empty_extra;

        // Validate the main reorder part.
        check_correctness(prb, {DST}, args, ref_args, setup_cmp, res);

        // Restore extra for compensation comparison and performance mode.
        dst_dt.md_->extra = orig_dst_extra;

        // Validate the compensated reorder part.
        if (prb->is_reorder_with_compensation(FLAG_ANY)) {
            compare_compensation(
                    prb, dst_s8_comp_ref, dst_zp_comp_ref, dst_dt, res);
        }
    }

    return measure_perf(prb->ctx_exe, res, prim, args);
}

} // namespace reorder