/*******************************************************************************
* Copyright 2017-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <cstring>

#include <stdlib.h>

#include "oneapi/dnnl/dnnl.h"

// TODO: refactor the driver to avoid using extra flags of a memory descriptor.
#include "common/memory_desc.hpp"

#include "utils/parallel.hpp"

#include "dnn_types.hpp"
#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"

#include "reorder.hpp"

namespace reorder {

// Filling for integers is different due to the problematic int -> float
// conversion, and it doesn't require many different points to be tested.
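// E.g., for an s8 source the SRC stream cycles through {127, -128, 0, 16};
// the DST filling (used only as the sum post-op input) walks its own table
// with a decorrelated index, (idx * 5 / 4) % 4, so dst[i] rarely mirrors
// src[i].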
int fill_memory_int(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto conf = prb->get_conf(kind);

    for (int64_t idx = 0; idx < mem_fp.nelems(); ++idx) {
        const float gen[4] = {
                conf->max, // saturate to max of output data type
                conf->min, // saturate to min of output data type
                0,
                16,
        };

        const int64_t rng = kind == SRC ? (idx % 4) : ((idx * 5 / 4) % 4);
        mem_fp.set_elem(
                idx, round_to_nearest_representable(conf->dt, gen[rng]));
    }

    SAFE(mem_dt.reorder(mem_fp), WARN);
    return OK;
}

int fill_memory_fp(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto conf = prb->get_conf(kind);
    const auto &src_scales = prb->attr.scales.get(DNNL_ARG_FROM);
    const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_TO);
    const int src_scale_mask = attr_t::get_default_mask(src_scales.policy);
    const int dst_scale_mask = attr_t::get_default_mask(dst_scales.policy);

    for (int64_t idx = 0; idx < mem_fp.nelems(); ++idx) {
        float src_scale = 1.f, dst_scale = 1.f;
        if (!src_scales.is_def()) {
            int64_t src_mask_idx = mem_fp.get_scale_idx(idx, src_scale_mask);
            src_scale = prb->src_scales[src_mask_idx];
        }
        if (!dst_scales.is_def()) {
            int64_t dst_mask_idx = mem_fp.get_scale_idx(idx, dst_scale_mask);
            dst_scale = prb->dst_scales[dst_mask_idx];
        }
        const float scale = src_scale / dst_scale;

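        // Src scales are applied as multiplication and dst scales as
        // division, so the effective factor is src_scale / dst_scale (the
        // `scale` above); the values below are chosen relative to it to hit
        // saturation, rounding, and exact-multiplication corner cases.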
        const float gen[7] = {
                conf->max, // saturate to max of output data type
                conf->min, // saturate to min of output data type
                1.6f, // rounding check
                0.2f, // saturate to 0
                1.f / scale, // exact multiplication check
                2.f,
                scale,
        };

        const int64_t rng = kind == SRC ? (idx % 7) : ((idx * 8 / 7) % 7);
        mem_fp.set_elem(
                idx, round_to_nearest_representable(conf->dt, gen[rng]));
    }

    SAFE(mem_dt.reorder(mem_fp), WARN);
    return OK;
}

int fill_memory(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto dt = kind == SRC ? prb->sdt : prb->ddt;
    if (is_integral_dt(dt)) return fill_memory_int(prb, kind, mem_dt, mem_fp);
    return fill_memory_fp(prb, kind, mem_dt, mem_fp);
}

int compare_compensation(const prb_t *prb, dnn_mem_t &mem_s8_comp_ref,
        dnn_mem_t &mem_zp_comp_ref, dnn_mem_t &mem_got, res_t *res) {
    // Note: the following check relies on certain assumptions that hold on
    // CPU but may not hold on GPU. In addition, it is prohibited to work with
    // raw pointers directly for the buffer type of memory.
    if (!is_cpu(get_test_engine())) return FAIL;

    const auto padded_nelems = mem_got.nelems(true);
    // Note: internally the offset is aligned on 4, otherwise it's UB.
    size_t first_comp_offset = div_up(padded_nelems, 4) * 4;
    int *comp_handle
            = reinterpret_cast<int *>((char *)mem_got + first_comp_offset);

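    // Buffer layout assumed by this check (compensation sections exist only
    // when the corresponding flag is set):
    //   [ padded dst data | s8s8 compensation | zero-point compensation ]
    //                      ^ `comp_handle` starts here and is advanced past
    //                        each section as it gets compared below.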
    const auto cmp_compensation = [&](const dnn_mem_t &mem_ref, int comp_mask) {
        // Idea behind this check:
        // Using knowledge from the library where `comp_handle` starts, and
        // that the memory utilizes blocking over OC and G, if present, we
        // wrap the piece of memory described by the shortened tag coming
        // from prb into a separate memory and reorder it to plain, so the
        // comparison is a straight comparison of values in the native plain
        // layout.
        auto comp_md = dnn_mem_t::init_md(mem_ref.ndims(), mem_ref.dims(),
                mem_ref.dt(), trim_tag_by_mask(prb->dtag, comp_mask));
        dnn_mem_t comp_m(comp_md, mem_ref.engine(), {false, comp_handle});

        compare::compare_t cmp;
        cmp.set_zero_trust_percent(100.f); // No sense in zero trust test.
        int status = cmp.compare(mem_ref, comp_m, attr_t(), res);

        // Shift the original compensation pointer for the next compensation.
        comp_handle += comp_m.nelems(true);
        return status;
    };

    if (mem_s8_comp_ref.ndims())
        SAFE(cmp_compensation(mem_s8_comp_ref,
                     prb->get_compensation_mask(FLAG_S8S8_COMP)),
                WARN);
    if (mem_zp_comp_ref.ndims())
        SAFE(cmp_compensation(
                     mem_zp_comp_ref, prb->get_compensation_mask(FLAG_ZP_COMP)),
                WARN);

    return res->state == FAILED ? FAIL : OK;
}

dnnl_status_t init_pd(init_pd_args_t<prb_t> &init_pd_args) {
    const prb_t *prb = init_pd_args.prb;

    auto dims = prb->dims;
    for (int d = 0; d < prb->ndims; ++d)
        if (prb->runtime_dim_mask & (1 << d)) dims[d] = DNNL_RUNTIME_DIM_VAL;

    auto src_d
            = dnn_mem_t::init_md(prb->ndims, dims.data(), prb->sdt, prb->stag);
    auto dst_d
            = dnn_mem_t::init_md(prb->ndims, dims.data(), prb->ddt, prb->dtag);

    // Prepare and assign extra for dst_md.
    auto &extra = static_cast<dnnl_memory_desc_t>(dst_d)->extra;
    extra.flags = dnnl::impl::memory_extra_flags::none;
    if (prb->is_reorder_with_compensation(FLAG_ANY)) {
        for (const auto &i_oflag : prb->oflag) {
            if (i_oflag.first & FLAG_S8S8_COMP) {
                extra.flags |= dnnl::impl::memory_extra_flags::
                        compensation_conv_s8s8;
                extra.compensation_mask = i_oflag.second;

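                // A scale adjustment may accompany s8s8 compensation; on x86
                // CPUs without VNNI the library typically uses a 0.5f factor
                // to avoid overflow in the s8s8 multiply-accumulate path (the
                // exact value comes from reorder_rescale_factor()).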
                const float s8_scale_factor = reorder_rescale_factor();
                const bool need_rescale = s8_scale_factor != 1.f;
                if (need_rescale) {
                    extra.flags |= dnnl::impl::memory_extra_flags::scale_adjust;
                    extra.scale_adjust = s8_scale_factor;
                }
            }
            if (i_oflag.first & FLAG_ZP_COMP) {
                extra.flags |= dnnl::impl::memory_extra_flags::
                        compensation_conv_asymmetric_src;
                extra.asymm_compensation_mask = i_oflag.second;
            }
        }
    }

    auto src_engine = init_pd_args.engine;
    auto dst_engine = init_pd_args.engine;
    if (is_gpu()) {
        switch (prb->cross_engine) {
            case CPU2GPU: src_engine = get_cpu_engine(); break;
            case GPU2CPU: dst_engine = get_cpu_engine(); break;
            default: break;
        }
    }

    auto dnnl_attr = make_benchdnn_dnnl_wrapper(
            create_dnnl_attr(prb->attr, attr_args_t()));

    init_pd_args.is_iterator_supported = false;
    return dnnl_reorder_primitive_desc_create(
            &init_pd_args.pd, src_d, src_engine, dst_d, dst_engine, dnnl_attr);
}

void skip_unimplemented_prb(const prb_t *prb, res_t *res) {
    const auto sdt = prb->sdt;
    const auto ddt = prb->ddt;
    skip_unimplemented_data_type({sdt, ddt}, prb->dir, res);
    skip_unimplemented_sum_po(prb->attr, res);

    bool scales_ok = true;
#if !defined(DNNL_X64) || DNNL_X64 == 0
    {
        // reference reorder supports only a subset of scale policies
        const std::vector<policy_t> supported_policy = {policy_t::COMMON,
                policy_t::PER_DIM_0, policy_t::PER_DIM_1, policy_t::PER_DIM_01};

        // Accumulate across both SRC and DST so that an unsupported policy
        // on either side is caught.
        for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) {
            scales_ok = scales_ok
                    && std::any_of(supported_policy.cbegin(),
                            supported_policy.cend(), [&](const policy_t policy) {
                                return prb->attr.scales.get(arg).policy
                                        == policy;
                            });
        }
    }
#endif
    if (!scales_ok) {
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
    }

    if (prb->is_reorder_with_compensation(FLAG_ANY)) {
        // Compensation is supported only for the s8 dst data type.
        const bool dt_ok = ddt == dnnl_s8;
        // Compensation can be paired with dst scale only.
        const bool attr_ok
                = prb->attr.zero_points.is_def() && prb->attr.post_ops.is_def();
        // Compensation does not support runtime dims.
        const bool rt_ok = prb->runtime_dim_mask == 0;

        // Compensation and scales masks should coincide.
        const auto comp_mask = prb->get_compensation_mask(FLAG_ANY);
        bool masks_ok = true;
        for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) {
            const auto &e = prb->attr.scales.get(arg);
            if (!e.is_def()) {
                int e_mask = attr_t::get_default_mask(e.policy);
                masks_ok = masks_ok && e_mask == comp_mask;
            }
        }

        if (!dt_ok || !attr_ok || !rt_ok || !masks_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }

#if !defined(DNNL_X64) || DNNL_X64 == 0
        // Simple reorder doesn't provide decent coverage for compensated
        // cases. Shut them down unconditionally by default.
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
#endif
    }

    // Destination scale is not supported for runtime dimensions since the
    // implementation logic inverts dst scales and requires a scratchpad for
    // `mask > 0` cases, which is impossible to estimate with runtime dims.
    const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_DST);
    if (!dst_scales.is_def() && attr_t::get_default_mask(dst_scales.policy) > 0
            && prb->runtime_dim_mask != 0) {
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
    }

    // Compensation is supported through the jit reorder only, but the jit
    // reorder doesn't support different masks for source and destination
    // scales.
    const auto &src_scales = prb->attr.scales.get(DNNL_ARG_SRC);
    if (!src_scales.is_def() && !dst_scales.is_def()) {
        if (attr_t::get_default_mask(src_scales.policy)
                        != attr_t::get_default_mask(dst_scales.policy)
                && prb->is_reorder_with_compensation(FLAG_ANY)) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }

    if (is_cpu()) {
        // CPU reorder doesn't support bf16<-->s32 combinations.
        const bool s32_src_ok = IMPLICATION(sdt == dnnl_s32, ddt != dnnl_bf16);
        const bool s32_dst_ok = IMPLICATION(ddt == dnnl_s32, sdt != dnnl_bf16);
        if (!s32_src_ok || !s32_dst_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }

        // CPU f16 reorders only support f16<->f32 combinations.
        const bool f16_src_ok = IMPLICATION(
                sdt == dnnl_f16, ddt == dnnl_f16 || ddt == dnnl_f32);
        const bool f16_dst_ok = IMPLICATION(
                ddt == dnnl_f16, sdt == dnnl_f16 || sdt == dnnl_f32);
        if (!f16_src_ok || !f16_dst_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }

    if (is_gpu()) {
        // GPU does not support run-time dims.
        // Reorders with compensation are not supported by design: zero-point
        // compensation is applied directly in compute kernels, and s8s8
        // compensation is not needed since s8s8 instructions are available
        // in HW.
        if (prb->runtime_dim_mask != 0
                || prb->is_reorder_with_compensation(FLAG_ANY)) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }
}

void skip_invalid_prb(const prb_t *prb, res_t *res) {
    // No sense in cross-engine reorders when one of the devices is switched
    // off.
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE
    auto cross_engine = prb->cross_engine;
    if (cross_engine == CPU2GPU || cross_engine == GPU2CPU)
        res->state = SKIPPED, res->reason = INVALID_CASE;
#endif

    // Zero-points can't be used with the sum post-op.
    if (!prb->attr.zero_points.is_def(DNNL_ARG_DST)
            && prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }

    // Only integral data types can have zero points.
    const bool is_src_zp_ok = is_integral_dt(prb->sdt)
            || prb->attr.zero_points.is_def(DNNL_ARG_SRC);
    const bool is_dst_zp_ok = is_integral_dt(prb->ddt)
            || prb->attr.zero_points.is_def(DNNL_ARG_DST);
    if (!(is_src_zp_ok && is_dst_zp_ok)) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }
}

void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
        const args_t &ref_args) {
    const bool has_s32 = prb->sdt == dnnl_s32 || prb->ddt == dnnl_s32;
    const bool has_s8 = prb->sdt == dnnl_s8 || prb->ddt == dnnl_s8;
    const bool has_u8 = prb->sdt == dnnl_u8 || prb->ddt == dnnl_u8;
    // For u8, 4/7 of inputs become 0; for s32/s8, 3/7 of inputs become 0.
    const float zero_trust_percent
            = has_u8 ? 58.f : (has_s32 || has_s8) ? 43.f : 30.f;
    cmp.set_zero_trust_percent(zero_trust_percent);
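    // (4/7 ~= 57.1% and 3/7 ~= 42.9%, so the 58% / 43% thresholds above leave
    // a small margin over the expected share of zeros.)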

    // Additional check to avoid a false-positive result from the f32->s32
    // conversion with a sum post-op on GPU, which happens when two max_dt
    // values are summed together.
    const auto reorder_add_check
            = [&](const compare::compare_t::driver_check_func_args_t &args) {
                  if (args.dt == dnnl_s32 && args.got == max_dt(args.dt)
                          && is_gpu()) {
                      // 128.f = float(INT_MAX) - BENCHDNN_S32_TO_F32_SAT_CONST;
                      return args.diff == 128.f;
                  }
                  return false;
              };
    cmp.set_driver_check_function(reorder_add_check);
}

int doit(const prb_t *prb, res_t *res) {
    if (bench_mode == LIST) return res->state = LISTED, OK;

    benchdnn_dnnl_wrapper_t<dnnl_primitive_t> prim;
    SAFE(init_prim(prb->ctx_init, prim, init_pd, prb, res), WARN);
    if (res->state == SKIPPED || res->state == UNIMPLEMENTED) return OK;

    auto const_pd = query_pd(prim);

    benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> src_md {}, dst_md {};
    if (prb->runtime_dim_mask != 0) {
        // re-create memory descriptors with defined dims
        src_md = dnn_mem_t::init_md(
                prb->ndims, prb->dims.data(), prb->sdt, prb->stag);
        dst_md = dnn_mem_t::init_md(
                prb->ndims, prb->dims.data(), prb->ddt, prb->dtag);
    } else {
        src_md = clone_md(query_md(const_pd, DNNL_ARG_SRC));
        dst_md = clone_md(query_md(const_pd, DNNL_ARG_DST));
    }
    const auto &scratchpad_md = query_md(const_pd, DNNL_ARG_SCRATCHPAD);

    dnnl_engine_t src_engine
            = query_engine(const_pd, dnnl_query_reorder_src_engine);
    dnnl_engine_t dst_engine
            = query_engine(const_pd, dnnl_query_reorder_dst_engine);
    const auto &ref_engine = get_cpu_engine();

    dnn_mem_t src_fp(src_md, dnnl_f32, tag::abx, ref_engine);
    dnn_mem_t src_dt(src_md, src_engine);

    dnn_mem_t scratchpad_dt(scratchpad_md, src_engine);

    dnn_mem_t dst_fp(dst_md, dnnl_f32, tag::abx, ref_engine);
    dnn_mem_t dst_dt(dst_md, dst_engine);

    SAFE(fill_memory(prb, SRC, src_dt, src_fp), WARN);

    const bool has_sum
            = prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) >= 0;
    if (has_sum) { SAFE(fill_memory(prb, DST, dst_dt, dst_fp), WARN); }

    const int src_mask = attr_t::get_default_mask(
            prb->attr.scales.get(DNNL_ARG_SRC).policy);
    const int dst_mask = attr_t::get_default_mask(
            prb->attr.scales.get(DNNL_ARG_DST).policy);
    dnn_mem_t src_scales, dst_scales;
    dnn_mem_t src_zero_points_m, dst_zero_points_m;

    maybe_prepare_runtime_scales(src_scales, prb->attr.scales.get(DNNL_ARG_SRC),
            prb->nelems(src_mask), prb->src_scales);
    maybe_prepare_runtime_scales(dst_scales, prb->attr.scales.get(DNNL_ARG_DST),
            prb->nelems(dst_mask), prb->dst_scales);
    maybe_prepare_runtime_zero_points(
            src_zero_points_m, prb->attr, DNNL_ARG_SRC, 1, prb->src_zp);
    maybe_prepare_runtime_zero_points(
            dst_zero_points_m, prb->attr, DNNL_ARG_DST, 1, prb->dst_zp);
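    // Note: scale and zero-point values are not baked into the primitive;
    // they are supplied at execution time as separate memories bound to the
    // DNNL_ARG_ATTR_SCALES / DNNL_ARG_ATTR_ZERO_POINTS arguments below.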

    args_t args, ref_args;

    args.set(DNNL_ARG_FROM, src_dt);
    args.set(DNNL_ARG_TO, dst_dt);
    args.set(DNNL_ARG_SCRATCHPAD, scratchpad_dt);
    args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales);
    args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales);
    args.set(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zero_points_m);
    args.set(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zero_points_m);

    SAFE(execute_and_wait(prim, args, res), WARN);

    if (is_bench_mode(CORR)) {
        const auto assign_comp_mem = [&](dnn_mem_t &m, flag_bit_t flag) {
            if (prb->is_reorder_with_compensation(flag)) {
                dims_t dims = prb->get_compensation_dims(flag);
                int ndims = static_cast<int>(dims.size());
                auto md = dnn_mem_t::init_md(
                        ndims, dims.data(), dnnl_s32, tag::abx);
                m = dnn_mem_t(md, ref_engine);
            }
            return OK;
        };

        dnn_mem_t dst_s8_comp_ref, dst_zp_comp_ref;
        assign_comp_mem(dst_s8_comp_ref, FLAG_S8S8_COMP);
        assign_comp_mem(dst_zp_comp_ref, FLAG_ZP_COMP);

        ref_args.set(DNNL_ARG_FROM, src_fp);
        ref_args.set(DNNL_ARG_TO, dst_fp);
        ref_args.set(DNNL_ARG_SRC_1, dst_s8_comp_ref); // Additional input
        ref_args.set(DNNL_ARG_SRC_2, dst_zp_comp_ref); // Additional input
        ref_args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales);
        ref_args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales);

        // Remove the extra desc so that reorders with compensation get a
        // proper reorder from blocked layout to plain for comparison.
        dnnl::impl::memory_extra_desc_t empty_extra {};
        const auto orig_dst_extra = dst_dt.md_->extra;
        dst_dt.md_->extra = empty_extra;

        // Validate the main reorder part.
        check_correctness(prb, {DST}, args, ref_args, setup_cmp, res);

        // Restore extra for compensation comparison and performance mode.
        dst_dt.md_->extra = orig_dst_extra;

        // Validate the compensated reorder part.
        if (prb->is_reorder_with_compensation(FLAG_ANY)) {
            compare_compensation(
                    prb, dst_s8_comp_ref, dst_zp_comp_ref, dst_dt, res);
        }
    }

    return measure_perf(prb->ctx_exe, res, prim, args);
}

} // namespace reorder