/*******************************************************************************
* Copyright 2017-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <cstring>

#include <stdlib.h>

#include "oneapi/dnnl/dnnl.h"

// TODO: refactor the driver to avoid using extra flags of a memory descriptor.
#include "common/memory_desc.hpp"

#include "utils/parallel.hpp"

#include "dnn_types.hpp"
#include "dnnl_common.hpp"
#include "dnnl_memory.hpp"

#include "reorder.hpp"

namespace reorder {

// Filling for integers is different due to the problematic int -> float
// conversion, and it doesn't require many different points to be tested.
int fill_memory_int(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto conf = prb->get_conf(kind);

    for (int64_t idx = 0; idx < mem_fp.nelems(); ++idx) {
        const float gen[4] = {
                conf->max, // saturate to max of output data type
                conf->min, // saturate to min of output data type
                0,
                16,
        };

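        // DST uses a different index stride than SRC so that the two fill
        // patterns don't coincide element-for-element (relevant when DST is
        // also filled, e.g., for the sum post-op).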
        const int64_t rng = kind == SRC ? (idx % 4) : ((idx * 5 / 4) % 4);
        mem_fp.set_elem(
                idx, round_to_nearest_representable(conf->dt, gen[rng]));
    }

    SAFE(mem_dt.reorder(mem_fp), WARN);
    return OK;
}

int fill_memory_fp(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto conf = prb->get_conf(kind);
    const auto &src_scales = prb->attr.scales.get(DNNL_ARG_FROM);
    const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_TO);
    const int src_scale_mask = attr_t::get_default_mask(src_scales.policy);
    const int dst_scale_mask = attr_t::get_default_mask(dst_scales.policy);
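    // The effective per-element factor is src_scale / dst_scale; the scale
    // masks map each linear element index onto its scale entry.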

    for (int64_t idx = 0; idx < mem_fp.nelems(); ++idx) {
        float src_scale = 1.f, dst_scale = 1.f;
        if (!src_scales.is_def()) {
            int64_t src_mask_idx = mem_fp.get_scale_idx(idx, src_scale_mask);
            src_scale = prb->src_scales[src_mask_idx];
        }
        if (!dst_scales.is_def()) {
            int64_t dst_mask_idx = mem_fp.get_scale_idx(idx, dst_scale_mask);
            dst_scale = prb->dst_scales[dst_mask_idx];
        }
        const float scale = src_scale / dst_scale;

        const float gen[7] = {
                conf->max, // saturate to max of output data type
                conf->min, // saturate to min of output data type
                1.6f, // rounding check
                0.2f, // saturate to 0
                1.f / scale, // exact multiplication check
                2.f,
                scale,
        };

        const int64_t rng = kind == SRC ? (idx % 7) : ((idx * 8 / 7) % 7);
        mem_fp.set_elem(
                idx, round_to_nearest_representable(conf->dt, gen[rng]));
    }

    SAFE(mem_dt.reorder(mem_fp), WARN);
    return OK;
}

int fill_memory(const prb_t *prb, data_kind_t kind, dnn_mem_t &mem_dt,
        dnn_mem_t &mem_fp) {
    const auto dt = kind == SRC ? prb->sdt : prb->ddt;
    if (is_integral_dt(dt)) return fill_memory_int(prb, kind, mem_dt, mem_fp);
    return fill_memory_fp(prb, kind, mem_dt, mem_fp);
}

int compare_compensation(const prb_t *prb, dnn_mem_t &mem_s8_comp_ref,
        dnn_mem_t &mem_zp_comp_ref, dnn_mem_t &mem_got, res_t *res) {
    // Note: the following check relies on certain assumptions that hold on
    // CPU but may not hold on GPU. In addition, it is prohibited to work
    // with raw pointers directly for buffer-based memory.
    if (!is_cpu(get_test_engine())) return FAIL;

    const auto padded_nelems = mem_got.nelems(true);
    // Note: internally the offset is aligned on 4; otherwise it's UB.
    size_t first_comp_offset = div_up(padded_nelems, 4) * 4;
    int *comp_handle
            = reinterpret_cast<int *>((char *)mem_got + first_comp_offset);
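    // `comp_handle` now points at the first compensation buffer, which is
    // stored right past the (padded) tensor data.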

    const auto cmp_compensation = [&](const dnn_mem_t &mem_ref, int comp_mask) {
        // Idea behind this check:
        // Using knowledge from the library of where `comp_handle` starts, and
        // that the memory uses blocking over OC (and G, if present), we wrap
        // the piece of memory described by the shortened tag from prb into a
        // separate memory object and reorder it to a plain layout, so the
        // comparison is a straight check of values in the native plain layout.
        auto comp_md = dnn_mem_t::init_md(mem_ref.ndims(), mem_ref.dims(),
                mem_ref.dt(), trim_tag_by_mask(prb->dtag, comp_mask));
        dnn_mem_t comp_m(comp_md, mem_ref.engine(), {false, comp_handle});

        compare::compare_t cmp;
        cmp.set_zero_trust_percent(100.f); // No sense in zero trust test.
        int status = cmp.compare(mem_ref, comp_m, attr_t(), res);

        // Shift original compensation pointer for next compensation
        comp_handle += comp_m.nelems(true);
        return status;
    };

    if (mem_s8_comp_ref.ndims())
        SAFE(cmp_compensation(mem_s8_comp_ref,
                     prb->get_compensation_mask(FLAG_S8S8_COMP)),
                WARN);
    if (mem_zp_comp_ref.ndims())
        SAFE(cmp_compensation(
                     mem_zp_comp_ref, prb->get_compensation_mask(FLAG_ZP_COMP)),
                WARN);

    return res->state == FAILED ? FAIL : OK;
}

dnnl_status_t init_pd(init_pd_args_t<prb_t> &init_pd_args) {
    const prb_t *prb = init_pd_args.prb;

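    // Dims covered by the runtime mask are replaced with the
    // DNNL_RUNTIME_DIM_VAL placeholder; real values are supplied at execution
    // time.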
    auto dims = prb->dims;
    for (int d = 0; d < prb->ndims; ++d)
        if (prb->runtime_dim_mask & (1 << d)) dims[d] = DNNL_RUNTIME_DIM_VAL;

    auto src_d
            = dnn_mem_t::init_md(prb->ndims, dims.data(), prb->sdt, prb->stag);
    auto dst_d
            = dnn_mem_t::init_md(prb->ndims, dims.data(), prb->ddt, prb->dtag);

    // Prepare and assign extra for dst_md.
    auto &extra = static_cast<dnnl_memory_desc_t>(dst_d)->extra;
    extra.flags = dnnl::impl::memory_extra_flags::none;
    if (prb->is_reorder_with_compensation(FLAG_ANY)) {
        for (const auto &i_oflag : prb->oflag) {
            if (i_oflag.first & FLAG_S8S8_COMP) {
                extra.flags |= dnnl::impl::memory_extra_flags::
                        compensation_conv_s8s8;
                extra.compensation_mask = i_oflag.second;

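                // Some ISAs need s8s8 weights pre-scaled to avoid
                // intermediate overflow; reorder_rescale_factor() reports the
                // required adjustment (1.f means none is needed).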
                const float s8_scale_factor = reorder_rescale_factor();
                const bool need_rescale = s8_scale_factor != 1.f;
                if (need_rescale) {
                    extra.flags |= dnnl::impl::memory_extra_flags::scale_adjust;
                    extra.scale_adjust = s8_scale_factor;
                }
            }
            if (i_oflag.first & FLAG_ZP_COMP) {
                extra.flags |= dnnl::impl::memory_extra_flags::
                        compensation_conv_asymmetric_src;
                extra.asymm_compensation_mask = i_oflag.second;
            }
        }
    }

    auto src_engine = init_pd_args.engine;
    auto dst_engine = init_pd_args.engine;
    if (is_gpu()) {
        switch (prb->cross_engine) {
            case CPU2GPU: src_engine = get_cpu_engine(); break;
            case GPU2CPU: dst_engine = get_cpu_engine(); break;
            default: break;
        }
    }

    auto dnnl_attr = make_benchdnn_dnnl_wrapper(
            create_dnnl_attr(prb->attr, attr_args_t()));

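    // Reorder primitive descriptors are created directly rather than through
    // an implementation iterator, so iteration is disabled here.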
    init_pd_args.is_iterator_supported = false;
    return dnnl_reorder_primitive_desc_create(
            &init_pd_args.pd, src_d, src_engine, dst_d, dst_engine, dnnl_attr);
}

void skip_unimplemented_prb(const prb_t *prb, res_t *res) {
    const auto sdt = prb->sdt;
    const auto ddt = prb->ddt;
    skip_unimplemented_data_type({sdt, ddt}, prb->dir, res);
    skip_unimplemented_sum_po(prb->attr, res);

    bool scales_ok = true;
#if !defined(DNNL_X64) || DNNL_X64 == 0
    {
        // reference reorder supports only a subset of scale policies
        const std::vector<policy_t> supported_policy = {policy_t::COMMON,
                policy_t::PER_DIM_0, policy_t::PER_DIM_1, policy_t::PER_DIM_01};

        for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) {
            scales_ok = scales_ok
                    && std::any_of(supported_policy.cbegin(),
                            supported_policy.cend(), [&](const policy_t policy) {
                                return prb->attr.scales.get(arg).policy
                                        == policy;
                            });
        }
    }
#endif
    if (!scales_ok) {
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
    }

    if (prb->is_reorder_with_compensation(FLAG_ANY)) {
        // Compensation is supported only for the s8 dst data type.
        const bool dt_ok = ddt == dnnl_s8;
        // Compensation can be paired with dst scale only; zero-points and
        // post-ops must be empty.
        const bool attr_ok
                = prb->attr.zero_points.is_def() && prb->attr.post_ops.is_def();
        // Compensation does not support runtime dims.
        const bool rt_ok = prb->runtime_dim_mask == 0;

        // Compensation and scale masks should coincide.
        const auto comp_mask = prb->get_compensation_mask(FLAG_ANY);
        bool masks_ok = true;
        for (auto arg : {DNNL_ARG_SRC, DNNL_ARG_DST}) {
            const auto &e = prb->attr.scales.get(arg);
            if (!e.is_def()) {
                int e_mask = attr_t::get_default_mask(e.policy);
                masks_ok = masks_ok && e_mask == comp_mask;
            }
        }

        if (!dt_ok || !attr_ok || !rt_ok || !masks_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }

#if !defined(DNNL_X64) || DNNL_X64 == 0
        // Simple reorder doesn't provide decent coverage for compensated cases.
        // Shut them down unconditionally by default.
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
#endif
    }

    // Destination scale is not supported for runtime dimensions since the
    // implementation logic inverts dst scales and requires a scratchpad for
    // `mask > 0` cases, whose size is impossible to estimate with rt dims.
    const auto &dst_scales = prb->attr.scales.get(DNNL_ARG_DST);
    if (!dst_scales.is_def() && attr_t::get_default_mask(dst_scales.policy) > 0
            && prb->runtime_dim_mask != 0) {
        res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
        return;
    }

    // Compensation is supported through jit reorder only, but jit reorder
    // doesn't support different masks for source and destination scales.
    const auto &src_scales = prb->attr.scales.get(DNNL_ARG_SRC);
    if (!src_scales.is_def() && !dst_scales.is_def()) {
        if (attr_t::get_default_mask(src_scales.policy)
                        != attr_t::get_default_mask(dst_scales.policy)
                && prb->is_reorder_with_compensation(FLAG_ANY)) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }

    if (is_cpu()) {
        // CPU reorder doesn't support bf16<-->s32 combinations.
        const bool s32_src_ok = IMPLICATION(sdt == dnnl_s32, ddt != dnnl_bf16);
        const bool s32_dst_ok = IMPLICATION(ddt == dnnl_s32, sdt != dnnl_bf16);
        if (!s32_src_ok || !s32_dst_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }

        // CPU f16 reorders only support f16<->f32 combinations.
        const bool f16_src_ok = IMPLICATION(
                sdt == dnnl_f16, ddt == dnnl_f16 || ddt == dnnl_f32);
        const bool f16_dst_ok = IMPLICATION(
                ddt == dnnl_f16, sdt == dnnl_f16 || sdt == dnnl_f32);
        if (!f16_src_ok || !f16_dst_ok) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }

    if (is_gpu()) {
        // GPU does not support run-time dims.
        // Reorders w/ compensation are not supported by design: zp_comp is
        // computed directly in the kernels, and s8s8 instructions are
        // available natively in HW.
        if (prb->runtime_dim_mask != 0
                || prb->is_reorder_with_compensation(FLAG_ANY)) {
            res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED;
            return;
        }
    }
}

void skip_invalid_prb(const prb_t *prb, res_t *res) {
    // No sense in cross-engine reorders when one of the devices is switched
    // off.
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_NONE \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_NONE
    auto cross_engine = prb->cross_engine;
    if (cross_engine == CPU2GPU || cross_engine == GPU2CPU)
        res->state = SKIPPED, res->reason = INVALID_CASE;
#endif

    // Zero-points can't be used with sum post-op.
    if (!prb->attr.zero_points.is_def(DNNL_ARG_DST)
            && prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) != -1) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }

    // Only integral data types can have zero points.
    const bool is_src_zp_ok = is_integral_dt(prb->sdt)
            || prb->attr.zero_points.is_def(DNNL_ARG_SRC);
    const bool is_dst_zp_ok = is_integral_dt(prb->ddt)
            || prb->attr.zero_points.is_def(DNNL_ARG_DST);
    if (!(is_src_zp_ok && is_dst_zp_ok)) {
        res->state = SKIPPED, res->reason = INVALID_CASE;
        return;
    }
}

void setup_cmp(compare::compare_t &cmp, const prb_t *prb, data_kind_t kind,
        const args_t &ref_args) {
    const bool has_s32 = prb->sdt == dnnl_s32 || prb->ddt == dnnl_s32;
    const bool has_s8 = prb->sdt == dnnl_s8 || prb->ddt == dnnl_s8;
    const bool has_u8 = prb->sdt == dnnl_u8 || prb->ddt == dnnl_u8;
    // For u8, 4/7 of the inputs become 0; for s32/s8, 3/7 of the inputs
    // become 0.
    const float zero_trust_percent
            = has_u8 ? 58.f : (has_s32 || has_s8) ? 43.f : 30.f;
    cmp.set_zero_trust_percent(zero_trust_percent);

    // Additional check to avoid a false-positive result from the f32->s32
    // conversion when a sum post-op on GPU adds two max_dt values together.
    const auto reorder_add_check
            = [&](const compare::compare_t::driver_check_func_args_t &args) {
                  if (args.dt == dnnl_s32 && args.got == max_dt(args.dt)
                          && is_gpu()) {
                      // 128.f = float(INT_MAX) - BENCHDNN_S32_TO_F32_SAT_CONST;
                      return args.diff == 128.f;
                  }
                  return false;
              };
    cmp.set_driver_check_function(reorder_add_check);
}

int doit(const prb_t *prb, res_t *res) {
    if (bench_mode == LIST) return res->state = LISTED, OK;

    benchdnn_dnnl_wrapper_t<dnnl_primitive_t> prim;
    SAFE(init_prim(prb->ctx_init, prim, init_pd, prb, res), WARN);
    if (res->state == SKIPPED || res->state == UNIMPLEMENTED) return OK;

    auto const_pd = query_pd(prim);

    benchdnn_dnnl_wrapper_t<dnnl_memory_desc_t> src_md {}, dst_md {};
    if (prb->runtime_dim_mask != 0) {
        // re-create memory descriptors with defined dims
        src_md = dnn_mem_t::init_md(
                prb->ndims, prb->dims.data(), prb->sdt, prb->stag);
        dst_md = dnn_mem_t::init_md(
                prb->ndims, prb->dims.data(), prb->ddt, prb->dtag);
    } else {
        src_md = clone_md(query_md(const_pd, DNNL_ARG_SRC));
        dst_md = clone_md(query_md(const_pd, DNNL_ARG_DST));
    }
    const auto &scratchpad_md = query_md(const_pd, DNNL_ARG_SCRATCHPAD);

    dnnl_engine_t src_engine
            = query_engine(const_pd, dnnl_query_reorder_src_engine);
    dnnl_engine_t dst_engine
            = query_engine(const_pd, dnnl_query_reorder_dst_engine);
    const auto &ref_engine = get_cpu_engine();

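    // Reference memories are plain (abx) f32 on the CPU engine regardless of
    // the tested configuration.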
    dnn_mem_t src_fp(src_md, dnnl_f32, tag::abx, ref_engine);
    dnn_mem_t src_dt(src_md, src_engine);

    dnn_mem_t scratchpad_dt(scratchpad_md, src_engine);

    dnn_mem_t dst_fp(dst_md, dnnl_f32, tag::abx, ref_engine);
    dnn_mem_t dst_dt(dst_md, dst_engine);

    SAFE(fill_memory(prb, SRC, src_dt, src_fp), WARN);

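    // DST needs pre-filling only when a sum post-op reads its original
    // contents.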
    const bool has_sum
            = prb->attr.post_ops.find(attr_t::post_ops_t::kind_t::SUM) >= 0;
    if (has_sum) { SAFE(fill_memory(prb, DST, dst_dt, dst_fp), WARN); }

    const int src_mask = attr_t::get_default_mask(
            prb->attr.scales.get(DNNL_ARG_SRC).policy);
    const int dst_mask = attr_t::get_default_mask(
            prb->attr.scales.get(DNNL_ARG_DST).policy);
    dnn_mem_t src_scales, dst_scales;
    dnn_mem_t src_zero_points_m, dst_zero_points_m;

    maybe_prepare_runtime_scales(src_scales, prb->attr.scales.get(DNNL_ARG_SRC),
            prb->nelems(src_mask), prb->src_scales);
    maybe_prepare_runtime_scales(dst_scales, prb->attr.scales.get(DNNL_ARG_DST),
            prb->nelems(dst_mask), prb->dst_scales);
    maybe_prepare_runtime_zero_points(
            src_zero_points_m, prb->attr, DNNL_ARG_SRC, 1, prb->src_zp);
    maybe_prepare_runtime_zero_points(
            dst_zero_points_m, prb->attr, DNNL_ARG_DST, 1, prb->dst_zp);

    args_t args, ref_args;

    args.set(DNNL_ARG_FROM, src_dt);
    args.set(DNNL_ARG_TO, dst_dt);
    args.set(DNNL_ARG_SCRATCHPAD, scratchpad_dt);
    args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales);
    args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales);
    args.set(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zero_points_m);
    args.set(DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zero_points_m);

    SAFE(execute_and_wait(prim, args, res), WARN);

    if (is_bench_mode(CORR)) {
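        // Reference compensation buffers are plain s32 tensors whose shape is
        // implied by the compensation mask.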
        const auto assign_comp_mem = [&](dnn_mem_t &m, flag_bit_t flag) {
            if (prb->is_reorder_with_compensation(flag)) {
                dims_t dims = prb->get_compensation_dims(flag);
                int ndims = static_cast<int>(dims.size());
                auto md = dnn_mem_t::init_md(
                        ndims, dims.data(), dnnl_s32, tag::abx);
                m = dnn_mem_t(md, ref_engine);
            }
            return OK;
        };

        dnn_mem_t dst_s8_comp_ref, dst_zp_comp_ref;
        assign_comp_mem(dst_s8_comp_ref, FLAG_S8S8_COMP);
        assign_comp_mem(dst_zp_comp_ref, FLAG_ZP_COMP);

        ref_args.set(DNNL_ARG_FROM, src_fp);
        ref_args.set(DNNL_ARG_TO, dst_fp);
        ref_args.set(DNNL_ARG_SRC_1, dst_s8_comp_ref); // Additional input
        ref_args.set(DNNL_ARG_SRC_2, dst_zp_comp_ref); // Additional input
        ref_args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales);
        ref_args.set(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales);

        // Remove the extra desc so that reorders with compensation get a
        // proper reorder from the blocked layout to plain for comparison.
        dnnl::impl::memory_extra_desc_t empty_extra {};
        const auto orig_dst_extra = dst_dt.md_->extra;
        dst_dt.md_->extra = empty_extra;

        // Validate main reorder part.
        check_correctness(prb, {DST}, args, ref_args, setup_cmp, res);

        // Restore extra for compensation comparison and performance mode.
        dst_dt.md_->extra = orig_dst_extra;

        // Validate compensated reorder part.
        if (prb->is_reorder_with_compensation(FLAG_ANY)) {
            compare_compensation(
                    prb, dst_s8_comp_ref, dst_zp_comp_ref, dst_dt, res);
        }
    }

    return measure_perf(prb->ctx_exe, res, prim, args);
}

} // namespace reorder