1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include <algorithm> |
18 | #include <atomic> |
19 | #include <cmath> |
20 | #include <sstream> |
21 | #include <string> |
22 | |
23 | #include "utils/parallel.hpp" |
24 | |
25 | #include "common.hpp" |
26 | #include "utils/compare.hpp" |
27 | #include "utils/norm.hpp" |
28 | |
29 | #include "eltwise/eltwise.hpp" |
30 | |
31 | namespace compare { |
32 | |
33 | static void dump_point_values(const dnnl_memory_desc_t &md, data_kind_t kind, |
34 | int64_t l_offset, float exp_f32, float exp, float got, float diff, |
35 | float rel_diff) { |
36 | std::stringstream ss; |
37 | dims_t l_dims = md2dims(md); |
38 | dims_t dims_idx = off2dims_idx(l_dims, l_offset); |
39 | ss << dims_idx; |
40 | std::string ind_str = ss.str(); |
41 | |
42 | std::string skind; |
43 | if (kind != DAT_TOTAL) skind = "[" + std::string(data_kind2str(kind)) + "]" ; |
44 | |
45 | BENCHDNN_PRINT(0, |
46 | "[%4ld]%s[%s] exp_f32:%12g exp:%12g got:%12g diff:%8g rdiff:%8g\n" , |
47 | (long)l_offset, skind.c_str(), ind_str.c_str(), exp_f32, exp, got, |
48 | diff, rel_diff); |
49 | } |
50 | |
51 | static void dump_norm_values(const diff_norm_t &diff_norm, data_kind_t kind) { |
52 | std::string skind; |
53 | if (kind != DAT_TOTAL) skind = "[" + std::string(data_kind2str(kind)) + "]" ; |
54 | |
55 | BENCHDNN_PRINT(0, |
56 | "%s[L0] = %g\n" |
57 | "%s[L1] exp:%8g got:%8g diff:%8g rel_diff:%8g\n" |
58 | "%s[L2] exp:%8g got:%8g diff:%8g rel_diff:%8g\n" |
59 | "%s[L8] exp:%8g got:%8g diff:%8g rel_diff:%8g\n" , |
60 | skind.c_str(), diff_norm.rel_diff(norm_t::L0), skind.c_str(), |
61 | diff_norm.a_[norm_t::L1], diff_norm.b_[norm_t::L1], |
62 | diff_norm.diff_[norm_t::L1], diff_norm.rel_diff(norm_t::L1), |
63 | skind.c_str(), diff_norm.a_[norm_t::L2], diff_norm.b_[norm_t::L2], |
64 | diff_norm.diff_[norm_t::L2], diff_norm.rel_diff(norm_t::L2), |
65 | skind.c_str(), diff_norm.a_[norm_t::L8], diff_norm.b_[norm_t::L8], |
66 | diff_norm.diff_[norm_t::L8], diff_norm.rel_diff(norm_t::L8)); |
67 | } |
68 | |
69 | static bool has_binary_comparison_po(const attr_t &attr) { |
70 | const auto &po = attr.post_ops; |
71 | if (po.is_def()) return false; |
72 | |
73 | using alg_t = attr_t::post_ops_t::kind_t; |
74 | static const std::vector<alg_t> cmp_alg = {alg_t::MAX, alg_t::MIN, |
75 | alg_t::GE, alg_t::GT, alg_t::LE, alg_t::LT, alg_t::EQ, alg_t::NE}; |
76 | |
77 | for (int idx = 0; idx < po.len(); ++idx) { |
78 | const auto &e = po.entry[idx]; |
79 | if (!e.is_binary_kind()) continue; |
80 | |
81 | if (std::any_of(cmp_alg.cbegin(), cmp_alg.cend(), |
82 | [&](const alg_t alg) { return e.kind == alg; })) |
83 | return true; |
84 | } |
85 | return false; |
86 | } |
87 | |
// Returns `true` when `a` and `b` are the same kind of non-finite value:
// either both are NaN, or both are infinities of the same sign.
// Returns `false` in every other case, including equal finite values.
bool compare_extreme_values(float a, float b) {
    const bool both_nan = std::isnan(a) && std::isnan(b);
    const bool same_signed_inf = std::isinf(a) && std::isinf(b)
            && std::signbit(a) == std::signbit(b);
    return both_nan || same_signed_inf;
}
94 | |
95 | compare_t::driver_check_func_args_t::driver_check_func_args_t( |
96 | const dnn_mem_t &exp_mem, const dnn_mem_t &got_f32, const int64_t i, |
97 | const dnnl_data_type_t data_type, const float trh) |
98 | : dt(data_type) |
99 | , idx(i) |
100 | , exp_f32(exp_mem.get_elem(idx)) |
101 | , exp(round_to_nearest_representable(dt, exp_f32)) |
102 | , got(got_f32.get_elem(idx)) |
103 | , diff(fabsf(exp - got)) |
104 | , rel_diff(diff / (fabsf(exp) > FLT_MIN ? fabsf(exp) : 1)) |
105 | , trh(trh) {} |
106 | |
107 | int compare_t::compare_norm(const dnn_mem_t &exp_mem, const dnn_mem_t &got_mem, |
108 | const attr_t &attr, res_t *res) const { |
109 | const auto nelems = got_mem.nelems(); |
110 | if (nelems == 0) { |
111 | if (res->state == EXECUTED) res->state = PASSED; |
112 | return OK; |
113 | } |
114 | |
115 | res->total = nelems; |
116 | |
117 | dnn_mem_t got_f32(got_mem, dnnl_f32, tag::abx, get_cpu_engine()); |
118 | const auto dt = got_mem.dt(); |
119 | |
120 | diff_norm_t diff_norm; |
121 | const bool need_dump = verbose >= 99; |
122 | for (int64_t i = 0; i < nelems; ++i) { |
123 | driver_check_func_args_t args(exp_mem, got_f32, i, dt, trh_); |
124 | |
125 | if (std::isnan(args.exp_f32) && is_integral_dt(dt)) { |
126 | // Don't include integer max values into norm as they make it |
127 | // irrelevant for validation. |
128 | ; |
129 | } else if (is_cpu() && dt == dnnl_s32 && args.exp == max_dt(dnnl_s32) |
130 | && args.got >= BENCHDNN_S32_TO_F32_SAT_CONST |
131 | && args.got < max_dt(dnnl_s32)) { |
132 | // Don't include f32->s32 saturation values into norm as they make |
133 | // it irrelevant for validation. |
134 | ; |
135 | } else { |
136 | diff_norm.update(args.exp, args.got); |
137 | } |
138 | |
139 | if (need_dump) |
140 | dump_point_values(got_mem.md_, kind_, i, args.exp_f32, args.exp, |
141 | args.got, args.diff, args.rel_diff); |
142 | } |
143 | diff_norm.done(); |
144 | |
145 | bool ok = diff_norm.rel_diff(norm_t::L2) <= trh_; |
146 | if (!ok) res->errors = 1; |
147 | |
148 | const bool dump = need_dump || !ok; |
149 | if (dump) dump_norm_values(diff_norm, kind_); |
150 | |
151 | if (res->errors) res->state = FAILED; |
152 | if (res->state == EXECUTED) res->state = PASSED; |
153 | |
154 | return res->state == FAILED ? FAIL : OK; |
155 | } |
156 | |
157 | int compare_t::compare_p2p(const dnn_mem_t &exp_mem, const dnn_mem_t &got_mem, |
158 | const attr_t &attr, res_t *res) const { |
159 | const auto nelems = got_mem.nelems(); |
160 | if (nelems == 0) { |
161 | if (res->state == EXECUTED) res->state = PASSED; |
162 | return OK; |
163 | } |
164 | |
165 | res->total = nelems; |
166 | |
167 | dnn_mem_t got_f32(got_mem, dnnl_f32, tag::abx, get_cpu_engine()); |
168 | const auto dt = got_mem.dt(); |
169 | const bool has_eltwise = attr.post_ops.eltwise_index() != -1; |
170 | const bool has_exp_eltwise |
171 | = attr.post_ops.find(attr_t::post_ops_t::kind_t::EXP) >= 0; |
172 | const bool has_dst_scale = !attr.scales.get(DNNL_ARG_DST).is_def(); |
173 | |
174 | // Atomics to be updated in parallel section, non-atomics - in sequential. |
175 | std::atomic<bool> all_ok(true); |
176 | std::atomic<int64_t> zeros(0); |
177 | int64_t n_errors = 0; |
178 | volatile bool from_parallel = true; |
179 | const bool need_dump = verbose >= 99; |
180 | |
181 | const auto compare_point_values = [&](int64_t i) { |
182 | driver_check_func_args_t args(exp_mem, got_f32, i, dt, trh_); |
183 | |
184 | bool ok = args.diff == 0.f; |
185 | if (std::isnan(args.exp_f32) && is_integral_dt(dt)) { |
186 | // Relax output requirements for this case, since different backends |
187 | // may implement NaN fp32 -> int32 conversion in a different manner. |
188 | ok = true; |
189 | } |
190 | // If fast check failed, go through all of them. |
191 | if (!ok) { |
192 | // Standard check for relative diff is under set threshold... |
193 | ok = (fabsf(args.exp) > 1e-5f ? args.rel_diff : args.diff) <= trh_; |
194 | // If not, check that both are NaNs or infinity with same sign... |
195 | if (!ok) ok = compare::compare_extreme_values(args.exp, args.got); |
196 | // If not, use hack to check not fully correct s32 saturation on |
197 | // cpu... |
198 | if (!ok && is_cpu() && dt == dnnl_s32 |
199 | && args.exp == max_dt(dnnl_s32)) |
200 | ok = args.got >= BENCHDNN_S32_TO_F32_SAT_CONST |
201 | && args.got < max_dt(dnnl_s32); |
202 | // If not, check driver additional checks if set... |
203 | if (!ok && driver_check_func_) ok = driver_check_func_(args); |
204 | // If not, check if there are eltwise post-ops, use very relaxed |
205 | // comparison since we can't control inputs for each driver finely |
206 | // or validate if the output value from operation satisfies the |
207 | // check for catastrophic cancellation (see eltwise additional check |
208 | // function). We rely on validation of pure eltwise and let some |
209 | // big rdiff errors slip away hoping that absolute error is good |
210 | // enough. |
211 | if (!ok && has_eltwise) { |
212 | const float experimental_tolerated_trh |
213 | = std::max(epsilon_dt(dt), 2e-5f); |
214 | ok = args.diff <= experimental_tolerated_trh; |
215 | } |
216 | // For eltwise it also may happen that threshold is really small, |
217 | // but absolute difference is really big. Also exponent is a special |
218 | // transcendental post-op that has accuracy issues with older isa. |
219 | if (!ok && has_eltwise |
220 | && (fabsf(args.exp) > 1e+5f || has_exp_eltwise)) { |
221 | ok = args.rel_diff <= std::max(epsilon_dt(dt), 5e-6f); |
222 | } |
223 | // Attr dst scale is used as a divisor to quantize data to dt. |
224 | // Implementation might decide to pre-compute inverse value and |
225 | // multiply on it in kernel. This difference might result in a |
226 | // slight error comparing to a division operation. |
227 | if (!ok && has_dst_scale) { |
228 | const float experimental_tolerated_trh |
229 | = std::max(epsilon_dt(dt), 1e-5f); |
230 | ok = args.rel_diff <= experimental_tolerated_trh; |
231 | } |
232 | // Binary MAX, MIN and comparison operations post-ops may return |
233 | // different results for different backends when NaN is one of |
234 | // inputs. Depending on its position and implementation, either |
235 | // first or second operand may be returned. |
236 | if (!ok && has_binary_comparison_po(attr) && op_output_has_nans_) |
237 | ok = true; |
238 | // Some drivers (like pooling or resampling) on integer data types |
239 | // may result in sporadic order of operations. This may cause a |
240 | // difference around `x.5f` value, and can be rounded either way to |
241 | // `x` or `x + 1` which can't be fixed by filling. |
242 | if (!ok && is_integral_dt(args.dt)) { |
243 | // Check that original value is close to x.5f. |
244 | static constexpr float small_eps = 9e-6; |
245 | const float floor_val = floorf(args.exp_f32); |
246 | const float ceil_val = ceilf(args.exp_f32); |
247 | if (fabsf((floor_val + 0.5f) - args.exp_f32) < small_eps) { |
248 | // If it is, check exp and got values are on opposite sides. |
249 | if (args.exp == floor_val) { |
250 | ok = args.got == ceil_val; |
251 | } else if (args.exp == ceil_val) { |
252 | ok = args.got == floor_val; |
253 | } |
254 | } |
255 | } |
256 | } |
257 | // Update zero stats for mistrust testing. |
258 | if (from_parallel && fabsf(args.got) == 0) zeros++; |
259 | |
260 | if (!ok && all_ok) all_ok = false; |
261 | if (!ok && !from_parallel) n_errors++; |
262 | |
263 | const bool dump |
264 | = need_dump || (!ok && (n_errors < 10 || verbose >= 10)); |
265 | if (!from_parallel && dump) |
266 | dump_point_values(got_mem.md_, kind_, i, args.exp_f32, args.exp, |
267 | args.got, args.diff, args.rel_diff); |
268 | }; |
269 | |
270 | // parallel comparison to speed up the process |
271 | benchdnn_parallel_nd(nelems, compare_point_values); |
272 | |
273 | // serial comparison with enabled dumping when needed for nicer output. |
274 | if (!all_ok || need_dump) { |
275 | from_parallel = false; |
276 | for (int64_t i = 0; i < nelems; ++i) |
277 | compare_point_values(i); |
278 | } |
279 | |
280 | // Set state to FAILED in case of any errors. |
281 | if (n_errors) res->errors = n_errors, res->state = FAILED; |
282 | // State could be already FAILED, check zero trust for non-FAILED only. |
283 | if (res->state != FAILED) { |
284 | const auto zeros_percent = 100.f * zeros / nelems; |
285 | if (nelems >= 10 && zeros_percent > zero_trust_percent_) { |
286 | res->state = MISTRUSTED; |
287 | std::string skind; |
288 | if (kind_ != DAT_TOTAL) |
289 | skind = "[" + std::string(data_kind2str(kind_)) + "]" ; |
290 | BENCHDNN_PRINT(2, |
291 | "No trust stats [%s]: z:%2.0f%% (>%2.0f%%) (z: %ld, " |
292 | "total: %ld)\n" , |
293 | skind.c_str(), zeros_percent, zero_trust_percent_, |
294 | (long)zeros.load(), (long)nelems); |
295 | } |
296 | } |
297 | // Set PASSED if no failure in current or previous checks happened and test |
298 | // can be trusted. |
299 | if (res->state == EXECUTED) res->state = PASSED; |
300 | |
301 | return res->state == FAILED ? FAIL : OK; |
302 | } |
303 | |
304 | int compare_t::compare(const dnn_mem_t &exp_mem, const dnn_mem_t &got_mem, |
305 | const attr_t &attr, res_t *res) const { |
306 | if (use_norm_) return compare_norm(exp_mem, got_mem, attr, res); |
307 | return compare_p2p(exp_mem, got_mem, attr, res); |
308 | } |
309 | |
310 | } // namespace compare |
311 | |