1/*******************************************************************************
2* Copyright 2020-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#include <algorithm>
18#include <atomic>
19#include <cmath>
20#include <sstream>
21#include <string>
22
23#include "utils/parallel.hpp"
24
25#include "common.hpp"
26#include "utils/compare.hpp"
27#include "utils/norm.hpp"
28
29#include "eltwise/eltwise.hpp"
30
31namespace compare {
32
33static void dump_point_values(const dnnl_memory_desc_t &md, data_kind_t kind,
34 int64_t l_offset, float exp_f32, float exp, float got, float diff,
35 float rel_diff) {
36 std::stringstream ss;
37 dims_t l_dims = md2dims(md);
38 dims_t dims_idx = off2dims_idx(l_dims, l_offset);
39 ss << dims_idx;
40 std::string ind_str = ss.str();
41
42 std::string skind;
43 if (kind != DAT_TOTAL) skind = "[" + std::string(data_kind2str(kind)) + "]";
44
45 BENCHDNN_PRINT(0,
46 "[%4ld]%s[%s] exp_f32:%12g exp:%12g got:%12g diff:%8g rdiff:%8g\n",
47 (long)l_offset, skind.c_str(), ind_str.c_str(), exp_f32, exp, got,
48 diff, rel_diff);
49}
50
51static void dump_norm_values(const diff_norm_t &diff_norm, data_kind_t kind) {
52 std::string skind;
53 if (kind != DAT_TOTAL) skind = "[" + std::string(data_kind2str(kind)) + "]";
54
55 BENCHDNN_PRINT(0,
56 "%s[L0] = %g\n"
57 "%s[L1] exp:%8g got:%8g diff:%8g rel_diff:%8g\n"
58 "%s[L2] exp:%8g got:%8g diff:%8g rel_diff:%8g\n"
59 "%s[L8] exp:%8g got:%8g diff:%8g rel_diff:%8g\n",
60 skind.c_str(), diff_norm.rel_diff(norm_t::L0), skind.c_str(),
61 diff_norm.a_[norm_t::L1], diff_norm.b_[norm_t::L1],
62 diff_norm.diff_[norm_t::L1], diff_norm.rel_diff(norm_t::L1),
63 skind.c_str(), diff_norm.a_[norm_t::L2], diff_norm.b_[norm_t::L2],
64 diff_norm.diff_[norm_t::L2], diff_norm.rel_diff(norm_t::L2),
65 skind.c_str(), diff_norm.a_[norm_t::L8], diff_norm.b_[norm_t::L8],
66 diff_norm.diff_[norm_t::L8], diff_norm.rel_diff(norm_t::L8));
67}
68
69static bool has_binary_comparison_po(const attr_t &attr) {
70 const auto &po = attr.post_ops;
71 if (po.is_def()) return false;
72
73 using alg_t = attr_t::post_ops_t::kind_t;
74 static const std::vector<alg_t> cmp_alg = {alg_t::MAX, alg_t::MIN,
75 alg_t::GE, alg_t::GT, alg_t::LE, alg_t::LT, alg_t::EQ, alg_t::NE};
76
77 for (int idx = 0; idx < po.len(); ++idx) {
78 const auto &e = po.entry[idx];
79 if (!e.is_binary_kind()) continue;
80
81 if (std::any_of(cmp_alg.cbegin(), cmp_alg.cend(),
82 [&](const alg_t alg) { return e.kind == alg; }))
83 return true;
84 }
85 return false;
86}
87
// Tells whether `a` and `b` are the same non-finite value: either both are
// NaN, or both are infinities carrying the same sign. Any other combination
// (including two equal finite values) yields false.
bool compare_extreme_values(float a, float b) {
    const bool both_nan = std::isnan(a) && std::isnan(b);
    const bool same_sign_inf = std::isinf(a) && std::isinf(b)
            && std::signbit(a) == std::signbit(b);
    return both_nan || same_sign_inf;
}
94
95compare_t::driver_check_func_args_t::driver_check_func_args_t(
96 const dnn_mem_t &exp_mem, const dnn_mem_t &got_f32, const int64_t i,
97 const dnnl_data_type_t data_type, const float trh)
98 : dt(data_type)
99 , idx(i)
100 , exp_f32(exp_mem.get_elem(idx))
101 , exp(round_to_nearest_representable(dt, exp_f32))
102 , got(got_f32.get_elem(idx))
103 , diff(fabsf(exp - got))
104 , rel_diff(diff / (fabsf(exp) > FLT_MIN ? fabsf(exp) : 1))
105 , trh(trh) {}
106
107int compare_t::compare_norm(const dnn_mem_t &exp_mem, const dnn_mem_t &got_mem,
108 const attr_t &attr, res_t *res) const {
109 const auto nelems = got_mem.nelems();
110 if (nelems == 0) {
111 if (res->state == EXECUTED) res->state = PASSED;
112 return OK;
113 }
114
115 res->total = nelems;
116
117 dnn_mem_t got_f32(got_mem, dnnl_f32, tag::abx, get_cpu_engine());
118 const auto dt = got_mem.dt();
119
120 diff_norm_t diff_norm;
121 const bool need_dump = verbose >= 99;
122 for (int64_t i = 0; i < nelems; ++i) {
123 driver_check_func_args_t args(exp_mem, got_f32, i, dt, trh_);
124
125 if (std::isnan(args.exp_f32) && is_integral_dt(dt)) {
126 // Don't include integer max values into norm as they make it
127 // irrelevant for validation.
128 ;
129 } else if (is_cpu() && dt == dnnl_s32 && args.exp == max_dt(dnnl_s32)
130 && args.got >= BENCHDNN_S32_TO_F32_SAT_CONST
131 && args.got < max_dt(dnnl_s32)) {
132 // Don't include f32->s32 saturation values into norm as they make
133 // it irrelevant for validation.
134 ;
135 } else {
136 diff_norm.update(args.exp, args.got);
137 }
138
139 if (need_dump)
140 dump_point_values(got_mem.md_, kind_, i, args.exp_f32, args.exp,
141 args.got, args.diff, args.rel_diff);
142 }
143 diff_norm.done();
144
145 bool ok = diff_norm.rel_diff(norm_t::L2) <= trh_;
146 if (!ok) res->errors = 1;
147
148 const bool dump = need_dump || !ok;
149 if (dump) dump_norm_values(diff_norm, kind_);
150
151 if (res->errors) res->state = FAILED;
152 if (res->state == EXECUTED) res->state = PASSED;
153
154 return res->state == FAILED ? FAIL : OK;
155}
156
157int compare_t::compare_p2p(const dnn_mem_t &exp_mem, const dnn_mem_t &got_mem,
158 const attr_t &attr, res_t *res) const {
159 const auto nelems = got_mem.nelems();
160 if (nelems == 0) {
161 if (res->state == EXECUTED) res->state = PASSED;
162 return OK;
163 }
164
165 res->total = nelems;
166
167 dnn_mem_t got_f32(got_mem, dnnl_f32, tag::abx, get_cpu_engine());
168 const auto dt = got_mem.dt();
169 const bool has_eltwise = attr.post_ops.eltwise_index() != -1;
170 const bool has_exp_eltwise
171 = attr.post_ops.find(attr_t::post_ops_t::kind_t::EXP) >= 0;
172 const bool has_dst_scale = !attr.scales.get(DNNL_ARG_DST).is_def();
173
174 // Atomics to be updated in parallel section, non-atomics - in sequential.
175 std::atomic<bool> all_ok(true);
176 std::atomic<int64_t> zeros(0);
177 int64_t n_errors = 0;
178 volatile bool from_parallel = true;
179 const bool need_dump = verbose >= 99;
180
181 const auto compare_point_values = [&](int64_t i) {
182 driver_check_func_args_t args(exp_mem, got_f32, i, dt, trh_);
183
184 bool ok = args.diff == 0.f;
185 if (std::isnan(args.exp_f32) && is_integral_dt(dt)) {
186 // Relax output requirements for this case, since different backends
187 // may implement NaN fp32 -> int32 conversion in a different manner.
188 ok = true;
189 }
190 // If fast check failed, go through all of them.
191 if (!ok) {
192 // Standard check for relative diff is under set threshold...
193 ok = (fabsf(args.exp) > 1e-5f ? args.rel_diff : args.diff) <= trh_;
194 // If not, check that both are NaNs or infinity with same sign...
195 if (!ok) ok = compare::compare_extreme_values(args.exp, args.got);
196 // If not, use hack to check not fully correct s32 saturation on
197 // cpu...
198 if (!ok && is_cpu() && dt == dnnl_s32
199 && args.exp == max_dt(dnnl_s32))
200 ok = args.got >= BENCHDNN_S32_TO_F32_SAT_CONST
201 && args.got < max_dt(dnnl_s32);
202 // If not, check driver additional checks if set...
203 if (!ok && driver_check_func_) ok = driver_check_func_(args);
204 // If not, check if there are eltwise post-ops, use very relaxed
205 // comparison since we can't control inputs for each driver finely
206 // or validate if the output value from operation satisfies the
207 // check for catastrophic cancellation (see eltwise additional check
208 // function). We rely on validation of pure eltwise and let some
209 // big rdiff errors slip away hoping that absolute error is good
210 // enough.
211 if (!ok && has_eltwise) {
212 const float experimental_tolerated_trh
213 = std::max(epsilon_dt(dt), 2e-5f);
214 ok = args.diff <= experimental_tolerated_trh;
215 }
216 // For eltwise it also may happen that threshold is really small,
217 // but absolute difference is really big. Also exponent is a special
218 // transcendental post-op that has accuracy issues with older isa.
219 if (!ok && has_eltwise
220 && (fabsf(args.exp) > 1e+5f || has_exp_eltwise)) {
221 ok = args.rel_diff <= std::max(epsilon_dt(dt), 5e-6f);
222 }
223 // Attr dst scale is used as a divisor to quantize data to dt.
224 // Implementation might decide to pre-compute inverse value and
225 // multiply on it in kernel. This difference might result in a
226 // slight error comparing to a division operation.
227 if (!ok && has_dst_scale) {
228 const float experimental_tolerated_trh
229 = std::max(epsilon_dt(dt), 1e-5f);
230 ok = args.rel_diff <= experimental_tolerated_trh;
231 }
232 // Binary MAX, MIN and comparison operations post-ops may return
233 // different results for different backends when NaN is one of
234 // inputs. Depending on its position and implementation, either
235 // first or second operand may be returned.
236 if (!ok && has_binary_comparison_po(attr) && op_output_has_nans_)
237 ok = true;
238 // Some drivers (like pooling or resampling) on integer data types
239 // may result in sporadic order of operations. This may cause a
240 // difference around `x.5f` value, and can be rounded either way to
241 // `x` or `x + 1` which can't be fixed by filling.
242 if (!ok && is_integral_dt(args.dt)) {
243 // Check that original value is close to x.5f.
244 static constexpr float small_eps = 9e-6;
245 const float floor_val = floorf(args.exp_f32);
246 const float ceil_val = ceilf(args.exp_f32);
247 if (fabsf((floor_val + 0.5f) - args.exp_f32) < small_eps) {
248 // If it is, check exp and got values are on opposite sides.
249 if (args.exp == floor_val) {
250 ok = args.got == ceil_val;
251 } else if (args.exp == ceil_val) {
252 ok = args.got == floor_val;
253 }
254 }
255 }
256 }
257 // Update zero stats for mistrust testing.
258 if (from_parallel && fabsf(args.got) == 0) zeros++;
259
260 if (!ok && all_ok) all_ok = false;
261 if (!ok && !from_parallel) n_errors++;
262
263 const bool dump
264 = need_dump || (!ok && (n_errors < 10 || verbose >= 10));
265 if (!from_parallel && dump)
266 dump_point_values(got_mem.md_, kind_, i, args.exp_f32, args.exp,
267 args.got, args.diff, args.rel_diff);
268 };
269
270 // parallel comparison to speed up the process
271 benchdnn_parallel_nd(nelems, compare_point_values);
272
273 // serial comparison with enabled dumping when needed for nicer output.
274 if (!all_ok || need_dump) {
275 from_parallel = false;
276 for (int64_t i = 0; i < nelems; ++i)
277 compare_point_values(i);
278 }
279
280 // Set state to FAILED in case of any errors.
281 if (n_errors) res->errors = n_errors, res->state = FAILED;
282 // State could be already FAILED, check zero trust for non-FAILED only.
283 if (res->state != FAILED) {
284 const auto zeros_percent = 100.f * zeros / nelems;
285 if (nelems >= 10 && zeros_percent > zero_trust_percent_) {
286 res->state = MISTRUSTED;
287 std::string skind;
288 if (kind_ != DAT_TOTAL)
289 skind = "[" + std::string(data_kind2str(kind_)) + "]";
290 BENCHDNN_PRINT(2,
291 "No trust stats [%s]: z:%2.0f%% (>%2.0f%%) (z: %ld, "
292 "total: %ld)\n",
293 skind.c_str(), zeros_percent, zero_trust_percent_,
294 (long)zeros.load(), (long)nelems);
295 }
296 }
297 // Set PASSED if no failure in current or previous checks happened and test
298 // can be trusted.
299 if (res->state == EXECUTED) res->state = PASSED;
300
301 return res->state == FAILED ? FAIL : OK;
302}
303
304int compare_t::compare(const dnn_mem_t &exp_mem, const dnn_mem_t &got_mem,
305 const attr_t &attr, res_t *res) const {
306 if (use_norm_) return compare_norm(exp_mem, got_mem, attr, res);
307 return compare_p2p(exp_mem, got_mem, attr, res);
308}
309
310} // namespace compare
311