1 | /******************************************************************************* |
2 | * Copyright 2017-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <algorithm> // for std::reverse and std::copy
#include <functional> // for std::bind and std::placeholders
#include <list>
#include <map> // for std::map, used by lru_cache_t
#include <string> // for std::string
#include <utility> // for std::pair
#include <vector> // for std::vector

#include <assert.h>
#include <stdint.h> // for uint8_t, uint64_t
25 | |
26 | #include "oneapi/dnnl/dnnl.hpp" |
27 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
28 | #include "oneapi/dnnl/dnnl_ocl.hpp" |
29 | #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP |
30 | #include "oneapi/dnnl/dnnl_sycl.hpp" |
31 | #endif |
32 | |
33 | #if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL |
34 | #include "oneapi/dnnl/dnnl_threadpool.h" |
35 | #endif |
36 | |
37 | #ifndef DNNL_DISABLE_PRIMITIVE_CACHE |
38 | #include "src/common/primitive_cache.hpp" |
39 | #endif |
40 | |
41 | #include "cpu/platform.hpp" |
42 | |
43 | #include "tests/test_thread.hpp" |
44 | |
45 | #include "dnnl_common.hpp" |
46 | #include "dnnl_memory.hpp" |
47 | |
48 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \ |
49 | || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL |
50 | extern "C" dnnl_status_t dnnl_impl_gpu_set_profiling(int flag); |
51 | extern "C" dnnl_status_t dnnl_impl_gpu_reset_profiling(); |
52 | extern "C" dnnl_status_t dnnl_impl_gpu_get_profile_info( |
53 | uint64_t &time, double &freq, int mode); |
54 | #endif |
55 | |
// Verifies that `pd` was delivered from the library primitive cache when the
// cache is enabled; returns FAIL if a cache hit was expected but didn't occur.
int check_pd_cache(const_dnnl_primitive_desc_t pd) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    int capacity = 0;
    DNN_SAFE(dnnl_get_primitive_cache_capacity(&capacity), CRIT);
    // The check is meaningful only with a non-zero cache capacity.
    if (capacity && !dnnl::impl::is_pd_in_cache(pd)) {
        BENCHDNN_PRINT(0, "error: %s\n",
                "primitive descriptor is expected to be fetched from "
                "the primitive cache");
        return FAIL;
    }
#endif
    return OK;
}
69 | |
// Verifies that primitive `p` was delivered from the library primitive cache
// when the cache is enabled; returns FAIL if a cache hit was expected.
int check_primitive_cache(dnnl_primitive_t p) {
#ifndef DNNL_DISABLE_PRIMITIVE_CACHE
    int capacity = 0;
    DNN_SAFE(dnnl_get_primitive_cache_capacity(&capacity), CRIT);
    // The check is meaningful only with a non-zero cache capacity.
    if (capacity && !dnnl::impl::is_primitive_in_cache(p)) {
        BENCHDNN_PRINT(0, "error: %s\n",
                "primitive is expected to be fetched from the primitive "
                "cache");
        return FAIL;
    }
#endif
    return OK;
}
83 | |
84 | size_t set_primitive_cache_capacity_without_clearing(size_t capacity) { |
85 | #ifndef DNNL_DISABLE_PRIMITIVE_CACHE |
86 | return dnnl::impl::set_primitive_cache_capacity_without_clearing(capacity); |
87 | #endif |
88 | return size_t(0); |
89 | } |
90 | |
// Queries the cache blob ID (the unique identifier of a compiled primitive)
// from `pd` into `cache_blob_id`.
int get_cache_blob_id(
        std::vector<uint8_t> &cache_blob_id, const_dnnl_primitive_desc_t pd) {
    dnnl_dim_t count;
    const uint8_t *c_id;
    // Query the ID size first, then the pointer to the ID bytes.
    DNN_SAFE(dnnl_primitive_desc_query(
                     pd, dnnl_query_cache_blob_id_size_s64, 0, (void *)&count),
            WARN);
    DNN_SAFE(dnnl_primitive_desc_query(
                     pd, dnnl_query_cache_blob_id, 0, (void **)&c_id),
            WARN);
    cache_blob_id = {c_id, c_id + count};
    return OK;
}
104 | |
// Retrieves the serialized cache blob of `prim` into `cache_blob`.
int get_cache_blob(std::vector<uint8_t> &cache_blob, dnnl_primitive_t prim) {
    // Query the blob size first, then fetch the payload.
    size_t size = 0;
    DNN_SAFE(dnnl_primitive_get_cache_blob(prim, &size, nullptr), WARN);

    cache_blob.resize(size);
    DNN_SAFE(dnnl_primitive_get_cache_blob(prim, &size, cache_blob.data()),
            WARN);
    return OK;
}
114 | |
// A minimal least-recently-used cache mapping byte-string keys to byte-string
// values. Implemented as a list ordered from most- to least-recently used,
// plus a map from key to list position for O(log n) lookup.
struct lru_cache_t {
    lru_cache_t(size_t capacity) : capacity_(capacity) {}

    // Returns the value for `key`, or a reference to an empty vector when the
    // key is absent. A hit promotes the entry to most-recently-used.
    const std::vector<uint8_t> &get(const std::vector<uint8_t> &key) {
        auto it = cache_mapper_.find(key);
        if (it == cache_mapper_.end()) { return dummy_; }
        // Move the hit entry to the front; splice preserves iterators, so the
        // map stays valid.
        cache_list_.splice(cache_list_.begin(), cache_list_, it->second);
        return cache_list_.front().value_;
    }

    // Inserts a new entry; `key` must not be present already. When the cache
    // is full, the least-recently-used entry is evicted first.
    void add(const std::vector<uint8_t> &key,
            const std::vector<uint8_t> &value) {
        assert(!cache_mapper_.count(key));
        // A zero-capacity cache stores nothing (also avoids popping from an
        // empty list below).
        if (capacity_ == 0) return;
        if (cache_mapper_.size() >= capacity_ && !cache_list_.empty()) {
            cache_mapper_.erase(cache_list_.back().key_);
            cache_list_.pop_back();
        }
        cache_list_.push_front(entry_t(key, value));
        cache_mapper_.insert({key, cache_list_.begin()});
    }

private:
    lru_cache_t(const lru_cache_t &other) = delete;
    lru_cache_t(lru_cache_t &&other) = delete;
    lru_cache_t &operator=(const lru_cache_t &other) = delete;
    lru_cache_t &operator=(lru_cache_t &&other) = delete;

    struct entry_t {
        entry_t(const std::vector<uint8_t> &key,
                const std::vector<uint8_t> &value)
            : key_(key), value_(value) {}
        std::vector<uint8_t> key_;
        std::vector<uint8_t> value_;
    };

    size_t capacity_;
    // Front = most recently used, back = eviction candidate.
    std::list<entry_t> cache_list_;
    std::map<std::vector<uint8_t>, std::list<entry_t>::iterator> cache_mapper_;

    // Returned by `get` on a miss.
    const std::vector<uint8_t> dummy_;
};
156 | |
157 | lru_cache_t &get_test_cache() { |
158 | static lru_cache_t cache(1024); |
159 | return cache; |
160 | } |
161 | |
162 | int test_persistent_cache_api(benchdnn_dnnl_wrapper_t<dnnl_primitive_t> &prim, |
163 | const_dnnl_primitive_desc_t pd, res_t *res) { |
164 | if (!is_gpu() || (is_gpu() && DNNL_GPU_RUNTIME != DNNL_RUNTIME_OCL)) { |
165 | return OK; |
166 | } |
167 | |
168 | // Start testing persistent cache API. |
169 | // 1. Disable primitive cache to make sure that the next primitive will |
170 | // be created from the cache blob and not fetched from the primitive cache. |
171 | const auto old_capacity = set_primitive_cache_capacity_without_clearing(0); |
172 | // 2. Get cache blob ID to use it as a key for the `test_cache`. |
173 | std::vector<uint8_t> cache_blob_id; |
174 | SAFE(get_cache_blob_id(cache_blob_id, pd), WARN); |
175 | // 3. Check if a cache blob for the obtained cache blob ID is present in the |
176 | // `test_cache`. |
177 | // a) If the cache blob is found the primitive is created from it. |
178 | // b) If the cache blob is not found then get it from the previously |
179 | // created primitive, store it in the cache and create the primitive |
180 | // from it. |
181 | dnnl_primitive_t p {}; |
182 | auto &cache = get_test_cache(); |
183 | auto cache_value = cache.get(cache_blob_id); |
184 | if (!cache_value.empty()) { |
185 | const size_t size = cache_value.size(); |
186 | const uint8_t *cache_blob = cache_value.data(); |
187 | DNN_SAFE( |
188 | dnnl_primitive_create_from_cache_blob(&p, pd, size, cache_blob), |
189 | WARN); |
190 | } else { |
191 | std::vector<uint8_t> cache_blob; |
192 | SAFE(get_cache_blob(cache_blob, prim), WARN); |
193 | |
194 | // The cross-engine reorder is a special primitive that may contain no |
195 | // kernels therefore the cache blob will always be empty, which is |
196 | // the correct behavior. |
197 | if (cache_blob.empty()) { |
198 | set_primitive_cache_capacity_without_clearing(old_capacity); |
199 | if (res->impl_name.find("cross_engine" ) != std::string::npos) |
200 | return OK; |
201 | |
202 | BENCHDNN_PRINT( |
203 | 0, "error: %s\n" , "cache blob is not expected to be empty" ); |
204 | res->state = FAILED; |
205 | return FAIL; |
206 | } |
207 | |
208 | DNN_SAFE(dnnl_primitive_create_from_cache_blob( |
209 | &p, pd, cache_blob.size(), cache_blob.data()), |
210 | WARN); |
211 | cache.add(cache_blob_id, cache_blob); |
212 | } |
213 | prim.reset(p); |
214 | |
215 | // 4. Restore the original primitive cache capacity to make it functional. |
216 | set_primitive_cache_capacity_without_clearing(old_capacity); |
217 | |
218 | return OK; |
219 | } |
220 | |
// Rounds `value` to the nearest value representable in data type `dt`:
// f32/f64 pass through, bf16/f16 round-trip through the narrow FP type,
// integer types are saturated. Unknown types fail with CRIT.
float round_to_nearest_representable(dnnl_data_type_t dt, float value) {
    switch (dt) {
        case dnnl_f32: break;
        case dnnl_f64: break;
        // Conversion to the narrow type applies its rounding; the cast back
        // to float is exact.
        case dnnl_bf16: value = (float)dnnl::impl::bfloat16_t(value); break;
        case dnnl_f16: value = (float)dnnl::impl::float16_t(value); break;
        case dnnl_s32:
        case dnnl_s8:
        case dnnl_u8: value = maybe_saturate(dt, value); break;
        default: SAFE(FAIL, CRIT);
    }

    return value;
}
235 | |
// Engine kind used to run oneDNN primitives for testing.
dnnl_engine_kind_t engine_tgt_kind = dnnl_cpu;
// Engine index used to run oneDNN primitives for testing.
size_t engine_index = 0;
// CPU ISA specific hints: none by default.
isa_hints_t hints {isa_hints_t::none};

// Memory kind used for test memory objects; starts at the build's
// default_memory_kind.
memory_kind_ext_t memory_kind {default_memory_kind};
244 | |
// Applies the requested CPU ISA hints (from the global `hints`) to the
// library via dnnl_set_cpu_isa_hints.
void init_isa_settings() {
    if (hints.get() == isa_hints_t::no_hints)
        DNN_SAFE_V(dnnl_set_cpu_isa_hints(dnnl_cpu_isa_no_hints));
    else if (hints.get() == isa_hints_t::prefer_ymm)
        DNN_SAFE_V(dnnl_set_cpu_isa_hints(dnnl_cpu_isa_prefer_ymm));
    else {
        // Do nothing when hints == none: keep the library default.
        assert(hints.get() == isa_hints_t::none);
    }
}
255 | |
256 | args_t &args_t::set(int arg, const dnn_mem_t &mem) { |
257 | args_.emplace_back(arg, &mem); |
258 | return *this; |
259 | } |
260 | |
261 | args_t &args_t::set( |
262 | const std::vector<int> &args, const std::vector<dnn_mem_t> &mems) { |
263 | assert(args.size() == mems.size()); |
264 | for (size_t i = 0; i < mems.size(); ++i) |
265 | args_.emplace_back(args[i], &mems[i]); |
266 | return *this; |
267 | } |
268 | |
269 | const dnn_mem_t &args_t::find(int arg) const { |
270 | static dnn_mem_t empty_stub; |
271 | for (const auto &e : args_) { |
272 | if (e.first == arg) return *(e.second); |
273 | } |
274 | return empty_stub; |
275 | } |
276 | |
277 | // Unmap before passing the memory to execute |
278 | void execute_unmap_args( |
279 | const args_t &args, std::vector<dnnl_exec_arg_t> &dnnl_args) { |
280 | dnnl_args.resize(args.size()); |
281 | for (int i = 0; i < args.size(); ++i) { |
282 | if (args.dnn_mem(i).is_mapped()) args.dnn_mem(i).unmap(); |
283 | |
284 | dnnl_args[i].arg = args.arg(i); |
285 | dnnl_args[i].memory = args.dnn_mem(i).m_; |
286 | } |
287 | } |
288 | |
289 | // Map the memory back after execute |
290 | void execute_map_args(const args_t &args) { |
291 | for (int i = 0; i < args.size(); ++i) |
292 | if (!args.dnn_mem(i).is_mapped()) args.dnn_mem(i).map(); |
293 | } |
294 | |
// Executes `exec_func` on a fresh stream of `engine`, waits for completion
// and re-maps the memories afterwards. Sets res->state to EXECUTED on
// success (when `res` is provided).
int execute_and_wait(perf_function_t &exec_func, const dnnl_engine_t &engine,
        const args_t &args, res_t *res) {
    stream_t stream(engine);
    std::vector<dnnl_exec_arg_t> dnnl_args;

    // Memories must be unmapped before the library touches them.
    execute_unmap_args(args, dnnl_args);

    DNN_SAFE(exec_func(stream, dnnl_args), CRIT);
    DNN_SAFE(dnnl_stream_wait(stream), CRIT);
    if (res) res->state = EXECUTED;

    execute_map_args(args);

    return OK;
}
310 | |
311 | dnnl_status_t primitive_executor(dnnl_primitive_t prim, |
312 | const dnnl_stream_t &stream, |
313 | const std::vector<dnnl_exec_arg_t> &dnnl_args) { |
314 | return dnnl_primitive_execute( |
315 | prim, stream, (int)dnnl_args.size(), dnnl_args.data()); |
316 | } |
317 | |
318 | int execute_and_wait(dnnl_primitive_t prim, const args_t &args, res_t *res) { |
319 | perf_function_t exec_func = std::bind(&primitive_executor, prim, |
320 | std::placeholders::_1, std::placeholders::_2); |
321 | auto pd = query_pd(prim); |
322 | auto engine = query_engine(pd); |
323 | return execute_and_wait(exec_func, engine, args, res); |
324 | } |
325 | |
// Enables library-side GPU kernel profiling; no-op unless benchdnn runs in
// profiling bench mode on an OCL/SYCL GPU runtime build.
void enable_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_set_profiling(1));
#endif
}
333 | |
// Disables library-side GPU kernel profiling; resets collected data first.
void disable_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_reset_profiling());
    DNN_SAFE_V(dnnl_impl_gpu_set_profiling(0));
#endif
}
342 | |
// Clears accumulated GPU profiling data (profiling bench mode only).
void reset_gpu_profiling() {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_reset_profiling());
#endif
}
350 | |
// Fetches accumulated GPU profiling time (`nsec`) and frequency (`freq`)
// from the library. Outputs are left untouched outside profiling bench mode
// or on non-GPU runtime builds.
void get_gpu_profiling_info(uint64_t &nsec, double &freq, int mode) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL \
        || DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
    if (!is_bench_mode(PROF)) return;
    DNN_SAFE_V(dnnl_impl_gpu_get_profile_info(nsec, freq, mode));
#endif
}
358 | |
359 | inline bool should_stop(const timer::timer_t &t) { |
360 | const bool stop = false |
361 | || (fix_times_per_prb && t.times() >= fix_times_per_prb) |
362 | || (!fix_times_per_prb && t.total_ms() >= max_ms_per_prb |
363 | && t.times() >= min_times_per_prb); |
364 | return stop; |
365 | } |
366 | |
// Measures performance stamping each iteration separately. Used where
// per-iteration launch overhead is negligible (non-DPCPP CPU, see
// measure_perf).
inline int measure_perf_individual(timer::timer_t &t, dnnl_stream_t stream,
        perf_function_t &perf_func, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    t.reset();
    while (true) {
        DNN_SAFE(perf_func(stream, dnnl_args), WARN);
        t.stamp();
        if (should_stop(t)) break;
    }
    return OK;
}
377 | |
// Measures performance in batches: runs `cur_batch_times` executions between
// stream waits to amortize launch/driver overhead, then stamps the batch as
// a whole. The batch size is re-estimated once after the first batch.
inline int measure_perf_aggregate(timer::timer_t &t, dnnl_stream_t stream,
        perf_function_t &perf_func, std::vector<dnnl_exec_arg_t> &dnnl_args) {
    const int max_batch_times = 10000;

    // Warm-up run, this is not measured due to possibility the associated
    // kernel has not been built and skews the results.
    DNN_SAFE(perf_func(stream, dnnl_args), WARN);
    DNN_SAFE(dnnl_stream_wait(stream), WARN);

    int cur_batch_times
            = fix_times_per_prb ? fix_times_per_prb : min_times_per_prb;

    t.reset();
    reset_gpu_profiling();

    bool is_first_loop = true;
    while (true) {
        for (int i = 0; i < cur_batch_times; i++) {
            DNN_SAFE(perf_func(stream, dnnl_args), WARN);
        }
        DNN_SAFE(dnnl_stream_wait(stream), WARN);

        if (is_bench_mode(PROF)) {
            // Profiling mode: use device-side time/frequency reported by the
            // library instead of host wall-clock time.
            uint64_t nsec = 0;
            double freq = 0;
            get_gpu_profiling_info(nsec, freq, 0);
            reset_gpu_profiling();
            t.stamp_with_frequency(cur_batch_times, nsec / 1e6, freq);
        } else {
            t.stamp(cur_batch_times);
        }

        if (should_stop(t)) break;

        // Adjust cur_batch_times after the first batch run
        if (is_first_loop) {
            double ms_min = t.ms(timer::timer_t::min);
            // Heuristic: try to use ~5 batch runs for the whole benchmark
            int batch_times_heuristic = (ms_min == 0.0)
                    ? INT_MAX
                    : MAX2(1,
                            (int)((max_ms_per_prb - t.total_ms()) / ms_min
                                    / 5));
            cur_batch_times = MIN2(max_batch_times, batch_times_heuristic);
            is_first_loop = false;
        }
    }

    return OK;
}
428 | |
// Measures performance of `perf_func` (performance bench mode only) in the
// requested threading context `ctx`. Picks the individual or the aggregate
// measurement scheme depending on the engine kind/runtime.
int measure_perf(const thr_ctx_t &ctx, res_t *res, perf_function_t &perf_func,
        args_t &args) {
    int ret = OK;
    if (is_bench_mode(PERF)) {
        const auto &engine = get_test_engine();
        stream_t stream(engine, ctx.get_interop_obj());
        std::vector<dnnl_exec_arg_t> dnnl_args;
        execute_unmap_args(args, dnnl_args);

        auto &t = res->timer_map.perf_timer();
        // For non-DPCPP CPU: measure individual iterations.
        // For DPCPP CPU and GPU: measure iterations in batches to hide driver
        // overhead. DPCPP CPU follows the model of GPU, thus, handled similar.
        if (is_cpu() && !is_sycl_engine(engine))
            ret = execute_in_thr_ctx(ctx, measure_perf_individual, t, stream,
                    perf_func, dnnl_args);
        else
            ret = execute_in_thr_ctx(ctx, measure_perf_aggregate, t, stream,
                    perf_func, dnnl_args);

        // Memories are re-mapped only on success.
        if (ret == OK) execute_map_args(args);
    }
    return ret;
}
453 | |
454 | int measure_perf( |
455 | const thr_ctx_t &ctx, res_t *res, dnnl_primitive_t prim, args_t &args) { |
456 | perf_function_t perf_func = std::bind(&primitive_executor, prim, |
457 | std::placeholders::_1, std::placeholders::_2); |
458 | |
459 | return measure_perf(ctx, res, perf_func, args); |
460 | } |
461 | |
462 | void maybe_prepare_runtime_scales(dnn_mem_t &scales_m, |
463 | const attr_t::scale_t &scale, int64_t scale_cnt, const float *scales) { |
464 | if (!scale.runtime) return; |
465 | |
466 | const int64_t count = scale.policy == policy_t::COMMON ? 1 : scale_cnt; |
467 | |
468 | scales_m = dnn_mem_t(1, &count, dnnl_f32, tag::x, get_test_engine()); |
469 | for (int64_t c = 0; c < count; ++c) |
470 | ((float *)scales_m)[c] = scales[c]; |
471 | } |
472 | |
473 | void maybe_prepare_runtime_scales_v2(dnn_mem_t &scales_dt, dnn_mem_t &scales_fp, |
474 | const attr_t::scale_t &scale, int64_t scale_cnt, const float *scales) { |
475 | if (!scale.runtime) return; |
476 | maybe_prepare_runtime_scales(scales_dt, scale, scale_cnt, scales); |
477 | const int64_t count = scale.policy == policy_t::COMMON ? 1 : scale_cnt; |
478 | scales_fp = dnn_mem_t(1, &count, dnnl_f32, tag::x, get_cpu_engine()); |
479 | for (int64_t c = 0; c < count; ++c) |
480 | ((float *)scales_fp)[c] = ((float *)scales_dt)[c]; |
481 | } |
482 | |
483 | void maybe_prepare_runtime_zero_points(dnn_mem_t &zero_points_m, |
484 | const attr_t &attr, int arg, int64_t count, |
485 | const int32_t *zero_points) { |
486 | if (!attr.zero_points.runtime(arg)) return; |
487 | |
488 | const auto e = attr.zero_points.get(arg); |
489 | const int64_t cnt = e.policy == policy_t::COMMON ? 1 : count; |
490 | |
491 | zero_points_m = dnn_mem_t(1, &cnt, dnnl_s32, tag::x, get_test_engine()); |
492 | for (int64_t c = 0; c < cnt; ++c) |
493 | ((int32_t *)zero_points_m)[c] = zero_points[c]; |
494 | } |
495 | |
496 | void maybe_prepare_runtime_zero_points_v2(dnn_mem_t &zero_points_dt, |
497 | dnn_mem_t &zero_points_fp, const attr_t &attr, int arg, int64_t count, |
498 | const int32_t *zero_points) { |
499 | if (!attr.zero_points.runtime(arg)) return; |
500 | maybe_prepare_runtime_zero_points( |
501 | zero_points_dt, attr, arg, count, zero_points); |
502 | const auto e = attr.zero_points.get(arg); |
503 | const int64_t cnt = e.policy == policy_t::COMMON ? 1 : count; |
504 | zero_points_fp = dnn_mem_t(1, &cnt, dnnl_s32, tag::x, get_cpu_engine()); |
505 | for (int64_t c = 0; c < cnt; ++c) |
506 | ((int32_t *)zero_points_fp)[c] = ((int32_t *)zero_points_dt)[c]; |
507 | } |
508 | |
509 | std::vector<float> prepare_po_vals(const dnn_mem_t &dst_m, const args_t &args, |
510 | const std::vector<std::pair<int, int>> &v_po_masks, |
511 | const size_t dst_off) { |
512 | std::vector<float> v_vals(v_po_masks.size()); |
513 | |
514 | for (size_t d = 0; d < v_po_masks.size(); ++d) { |
515 | const auto po_offset |
516 | = dst_m.get_scale_idx(dst_off, v_po_masks[d].second); |
517 | const float val = args.find(v_po_masks[d].first).get_elem(po_offset); |
518 | v_vals[d] = val; |
519 | } |
520 | return v_vals; |
521 | } |
522 | |
523 | bool check_md_consistency_with_tag( |
524 | const_dnnl_memory_desc_t md, const std::string &tag) { |
525 | auto md_new_tag = dnn_mem_t::init_md( |
526 | query_md_ndims(md), query_md_dims(md), query_md_data_type(md), tag); |
527 | return dnnl_memory_desc_equal(md_new_tag, md); |
528 | } |
529 | |
530 | void skip_start(res_t *res) { |
531 | if (benchdnn_stat.tests < test_start) { |
532 | res->state = SKIPPED; |
533 | res->reason = SKIP_START; |
534 | return; |
535 | } |
536 | } |
537 | |
// Marks the case as skipped when any data type in `v_dt` is not supported by
// the current device/build (bf16/f16 depend on hardware and direction, f64
// depends on the device).
void skip_unimplemented_data_type(
        const std::vector<dnnl_data_type_t> &v_dt, dir_t dir, res_t *res) {
    const bool has_f64_support = is_f64_supported();
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    using namespace dnnl::impl::cpu::platform;
    // bf16 is supported on AVX512-CORE+
    const bool has_bf16_support = is_gpu()
            || (is_cpu() && has_data_type_support(dnnl_bf16)
                    && IMPLICATION(!(dir & FLAG_INF),
                            has_training_support(dnnl_bf16)));
    // f16: GPU forward only; CPU requires data type (and, for training
    // directions, training) support.
    const bool has_f16_support = (is_gpu() && (dir & FLAG_FWD))
            || (is_cpu() && has_data_type_support(dnnl_f16)
                    && IMPLICATION(
                            !(dir & FLAG_INF), has_training_support(dnnl_f16)));

#else
    const bool has_bf16_support = is_gpu();
    // f16 is supported on GPU for inference only.
    const bool has_f16_support = is_gpu() && (dir & FLAG_FWD);
#endif

    for (const auto &i_dt : v_dt) {
        bool need_skip = false;
        switch (i_dt) {
            case dnnl_bf16: need_skip = !has_bf16_support; break;
            case dnnl_f16: need_skip = !has_f16_support; break;
            case dnnl_f64: need_skip = !has_f64_support; break;
            default: break;
        }
        if (need_skip) {
            res->state = SKIPPED, res->reason = DATA_TYPE_NOT_SUPPORTED;
            return;
        }
    }
}
573 | |
574 | void skip_unimplemented_sum_po( |
575 | const attr_t &attr, res_t *res, dnnl_data_type_t dst_dt) { |
576 | const auto &po = attr.post_ops; |
577 | if (po.is_def()) return; |
578 | |
579 | const int first_sum_idx = po.find(attr_t::post_ops_t::SUM); |
580 | if (first_sum_idx == -1) return; |
581 | |
582 | const auto sum_dt = po.entry[first_sum_idx].sum.dt; |
583 | |
584 | for (int idx = 0; idx < po.len(); ++idx) { |
585 | const auto &e = po.entry[idx]; |
586 | if (e.is_sum_kind()) { |
587 | // Sum with zero-point is not supported on GPU |
588 | if (is_gpu() && e.sum.zero_point != 0) { |
589 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
590 | break; |
591 | } |
592 | // Each sum must have same data on CPU |
593 | if (is_cpu() && e.sum.dt != sum_dt) { |
594 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
595 | break; |
596 | } |
597 | // Sum must have data type with the same size like dst on both |
598 | if (dst_dt != dnnl_data_type_undef && sum_dt != dnnl_data_type_undef |
599 | && dnnl_data_type_size(dst_dt) |
600 | != dnnl_data_type_size(e.sum.dt)) { |
601 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
602 | return; |
603 | } |
604 | } |
605 | } |
606 | } |
607 | |
608 | void skip_unimplemented_arg_scale(const attr_t &attr, res_t *res) { |
609 | for (const auto &arg_s : attr.scales.scales) { |
610 | if (arg_s.second.policy != policy_t::COMMON) { |
611 | res->state = SKIPPED, res->reason = CASE_NOT_SUPPORTED; |
612 | return; |
613 | } |
614 | } |
615 | } |
616 | |
617 | void skip_invalid_inplace(res_t *res, dnnl_data_type_t sdt, |
618 | dnnl_data_type_t ddt, const std::string &stag, |
619 | const std::string &dtag) { |
620 | // Note: existing implementation of dnn_mem_t doesn't allow to track the |
621 | // fact that two different objects pointing on the same SYCL memory should |
622 | // not map/unmap both objects. |
623 | // This leads to the restriction that memory descriptors should coincide, |
624 | // thus, a single memory object would be used for in-place validation. |
625 | // General limitation of in-place mode is having same amount of memory on |
626 | // input and output. |
627 | if (sdt != ddt) { |
628 | res->state = SKIPPED, res->reason = INVALID_CASE; |
629 | return; |
630 | } |
631 | |
632 | if (dtag == tag::any) return; |
633 | if (stag != dtag) { |
634 | res->state = SKIPPED, res->reason = INVALID_CASE; |
635 | return; |
636 | } |
637 | } |
638 | |
639 | // Check ensures that attributes don't cause implementation fallback |
640 | int check_same_pd(const dnnl_primitive_desc_t &pd_no_attr, res_t *res) { |
641 | const auto pd_no_attr_name = query_impl_info(pd_no_attr); |
642 | if (res->impl_name == pd_no_attr_name) return OK; |
643 | |
644 | res->state = FAILED; |
645 | BENCHDNN_PRINT(0, |
646 | "ERROR: attributes caused impl fallback from [%s] to [%s]\n" , |
647 | pd_no_attr_name.c_str(), res->impl_name.c_str()); |
648 | return FAIL; |
649 | } |
650 | |
// Returns true when `engine` is a CPU engine.
bool is_cpu(const dnnl_engine_t &engine) {
    return query_engine_kind(engine) == dnnl_cpu;
}
654 | |
// Returns true when `engine` is a GPU engine.
bool is_gpu(const dnnl_engine_t &engine) {
    return query_engine_kind(engine) == dnnl_gpu;
}
658 | |
// Returns true when `engine` is backed by the DPC++ (SYCL) runtime for its
// respective engine kind.
bool is_sycl_engine(const dnnl_engine_t &engine) {
    if (is_cpu(engine)) return DNNL_CPU_RUNTIME == DNNL_RUNTIME_DPCPP;
    if (is_gpu(engine)) return DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP;
    return false;
}
664 | |
// Returns true when `engine` is a GPU engine backed by the OpenCL runtime.
bool is_opencl_engine(const dnnl_engine_t &engine) {
    if (is_gpu(engine)) return DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL;
    return false;
}
669 | |
// Returns true when `engine` is an NVIDIA GPU, detected by the PCI vendor ID
// reported by the SYCL device. Always false in non-SYCL builds.
bool is_nvidia_gpu(const dnnl_engine_t &engine) {
#ifdef DNNL_WITH_SYCL
    if (!is_gpu(engine)) return false;
    constexpr int nvidia_vendor_id = 0x10DE;
    // Wrap the C handle into the C++ API to use the SYCL interop.
    auto eng = dnnl::engine(engine, true);
    auto device = dnnl::sycl_interop::get_device(eng);
    const auto eng_vendor_id
            = device.get_info<::sycl::info::device::vendor_id>();
    return eng_vendor_id == nvidia_vendor_id;
#endif
    return false;
}
682 | |
// Returns true when `engine` is an AMD GPU, detected by the PCI vendor ID
// reported by the SYCL device. Always false in non-SYCL builds.
bool is_amd_gpu(const dnnl_engine_t &engine) {
#ifdef DNNL_WITH_SYCL
    if (!is_gpu(engine)) return false;
    constexpr int amd_vendor_id = 0x1002;
    // Wrap the C handle into the C++ API to use the SYCL interop.
    auto eng = dnnl::engine(engine, true);
    auto device = dnnl::sycl_interop::get_device(eng);
    const auto eng_vendor_id
            = device.get_info<::sycl::info::device::vendor_id>();
    return eng_vendor_id == amd_vendor_id;
#endif
    return false;
}
695 | |
// Returns true when the GPU `engine` supports f64. Non-GPU engines return
// false; NVIDIA/AMD devices are excluded unconditionally. Detection goes
// through the SYCL aspect/extension or the OpenCL extension string.
bool is_f64_supported(const dnnl_engine_t &engine) {
    if (!is_gpu(engine)) return false;
    if (is_nvidia_gpu(engine) || is_amd_gpu(engine)) return false;
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP
    if (is_sycl_engine(engine)) {
        auto eng = dnnl::engine(engine, true);
        auto dev = dnnl::sycl_interop::get_device(eng);
#ifdef DNNL_SYCL_INTEROP_USE_SYCL121
        return dev.has_extension("cl_khr_fp64");
#else
        return dev.has(::sycl::aspect::fp64);
#endif
    }
#endif
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
    if (is_opencl_engine(engine)) {
        auto eng = dnnl::engine(engine, true);
        cl_device_id dev = dnnl::ocl_interop::get_device(eng);
        // Query the size of the extension string first, then fetch it.
        size_t param_size = 0;
        cl_int err = clGetDeviceInfo(
                dev, CL_DEVICE_EXTENSIONS, 0, nullptr, &param_size);
        if (err != CL_SUCCESS) return false;

        std::string extension_string(param_size, '\0');
        err = clGetDeviceInfo(dev, CL_DEVICE_EXTENSIONS, param_size,
                &extension_string[0], &param_size);
        if (err != CL_SUCCESS) return false;

        return extension_string.find("cl_khr_fp64") != std::string::npos;
    }
#endif
    return false;
}
729 | |
#if defined(_WIN32) && !defined(__GNUC__)
#include "windows.h"

// Returns the total physical CPU RAM in bytes (Windows).
static size_t get_cpu_ram_size() {
    MEMORYSTATUSEX s {};
    s.dwLength = sizeof(s);
    GlobalMemoryStatusEx(&s);
    return s.ullTotalPhys;
}
#elif defined(__APPLE__) || defined(__FreeBSD__) || defined(__QNXNTO__)
#include <unistd.h>
#include <sys/sysctl.h>

// Returns the total physical CPU RAM in bytes (BSD-like systems).
static size_t get_cpu_ram_size() {
#ifdef __APPLE__
    int query_ram[] = {CTL_HW, HW_MEMSIZE};
#else
    int query_ram[] = {CTL_HW, HW_PHYSMEM};
#endif
    int query_ram_len = sizeof(query_ram) / sizeof(*query_ram);
    size_t totalram = 0;
    size_t length = sizeof(totalram);

    sysctl(query_ram, query_ram_len, &totalram, &length, NULL, 0);
    return totalram;
}
#else
#include <sys/sysinfo.h>

// Returns the total physical CPU RAM in bytes (Linux and others).
static size_t get_cpu_ram_size() {
    struct sysinfo s {};
    sysinfo(&s);
    // sysinfo(2): `totalram` is expressed in units of `mem_unit` bytes,
    // so scale it to get the size in bytes.
    return (size_t)s.totalram * s.mem_unit;
}
#endif
765 | |
766 | static size_t get_gpu_ram_size() { |
767 | // XXX: create a tmp engine to query what we need. |
768 | // It will be removed in the future as part of switching back |
769 | // to the global engine. |
770 | engine_t eng_tmp(engine_tgt_kind); |
771 | dnnl::engine eng(eng_tmp, true); |
772 | if (eng.get_kind() != dnnl::engine::kind::gpu) return 0; |
773 | |
774 | #if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL |
775 | cl_int status = CL_SUCCESS; |
776 | // Get single device attached to the engine. |
777 | engine_t engine_tgt(engine_tgt_kind); |
778 | cl_device_id ocl_device = dnnl::ocl_interop::get_device(eng); |
779 | |
780 | cl_ulong ram_size = 0; |
781 | status = clGetDeviceInfo(ocl_device, CL_DEVICE_GLOBAL_MEM_SIZE, |
782 | sizeof(cl_ulong), &ram_size, nullptr); |
783 | if (status == CL_SUCCESS) return (size_t)ram_size; |
784 | #elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_DPCPP |
785 | auto sycl_dev = dnnl::sycl_interop::get_device(eng); |
786 | return (size_t)sycl_dev.get_info<::sycl::info::device::global_mem_size>(); |
787 | #endif |
788 | return 0; |
789 | } |
790 | |
791 | struct check_mem_size_args_t { |
792 | check_mem_size_args_t(const_dnnl_primitive_desc_t pd, bool want_input, |
793 | bool add_ref_size = false) |
794 | : pd(pd) |
795 | , want_input(want_input) |
796 | , add_ref_size(add_ref_size) |
797 | , is_scratchpad(false) |
798 | , total_size_device(0) |
799 | , total_size_cpu(0) |
800 | , scratchpad_size(0) {} |
801 | |
802 | // Input args. |
803 | const_dnnl_primitive_desc_t pd; |
804 | bool want_input; |
805 | bool add_ref_size; |
806 | bool is_scratchpad; |
807 | |
808 | // Output args. |
809 | size_t total_size_device; |
810 | size_t total_size_cpu; |
811 | size_t scratchpad_size; |
812 | }; |
813 | |
// Validates that the accumulated memory requirements fit into a fraction of
// the device and CPU RAM. Sets res->state to SKIPPED (NOT_ENOUGH_RAM) when
// the problem is too big, or FAILED when the library requests a scratchpad
// that dominates the total CPU footprint. Returns FAIL only for FAILED.
static int check_total_size(
        const check_mem_size_args_t &check_mem_size_args, res_t *res) {
    // RAM capacities are queried once and cached across calls.
    static uint64_t cpu_device_capacity = get_cpu_ram_size();
    static uint64_t gpu_device_capacity = get_gpu_ram_size();

    const uint64_t device_max_capacity
            = is_cpu() ? cpu_device_capacity : gpu_device_capacity;
    const uint64_t cpu_max_capacity = cpu_device_capacity;

    // 0.75f is taken randomly and is subject to change in future.
    const double capacity_factor = 0.75;
    const double benchdnn_device_limit = capacity_factor * device_max_capacity;
    const double benchdnn_cpu_limit = capacity_factor * cpu_max_capacity;
    assert(benchdnn_device_limit > 0 && benchdnn_cpu_limit > 0);

    const bool fits_device_ram = is_gpu()
            ? (check_mem_size_args.total_size_device <= benchdnn_device_limit)
            : true;
    if (!fits_device_ram) {
        BENCHDNN_PRINT(
                2, "%s\n", "benchdnn: not enough device RAM for a problem.");
        res->state = SKIPPED;
        res->reason = NOT_ENOUGH_RAM;
    }

    auto GB = [](double bytes) { return bytes / powf(2, 30); };

    if (is_gpu()) {
        BENCHDNN_PRINT((!fits_device_ram ? 2 : 6),
                "Requested: %g GB, benchdnn device limit: %g GB, device RAM "
                "capacity: %g GB\n",
                GB(check_mem_size_args.total_size_device),
                GB(benchdnn_device_limit), GB(gpu_device_capacity));
    }

    // On CPU targets "device" memory is host memory as well.
    size_t total_size_cpu = check_mem_size_args.total_size_cpu;
    if (is_cpu()) total_size_cpu += check_mem_size_args.total_size_device;
    bool fits_cpu_ram = total_size_cpu <= benchdnn_cpu_limit;

    if (!fits_cpu_ram) {
        BENCHDNN_PRINT(
                2, "%s\n", "benchdnn: not enough CPU RAM for a problem.");
        // Try to catch a huge scratchpad size requested by the library.
        // Use following logic:
        //     scratch_size
        // ---------------------- <= 0.75 (pre-defined threshold).
        // io_size + scratch_size
        //
        // 0.75 value supposed to be experimental and might be adjusted.
        static constexpr float scratch_trh = 0.75f;
        if (check_mem_size_args.scratchpad_size
                > scratch_trh * total_size_cpu) {
            BENCHDNN_PRINT(2, "%s `%ld` %s `%ld`.\n",
                    "benchdnn: CPU scratchpad size",
                    (long)check_mem_size_args.scratchpad_size,
                    "exceeded a given threshold",
                    (long)(scratch_trh * total_size_cpu));
            res->state = FAILED;
        } else {
            res->state = SKIPPED;
        }
        res->reason = NOT_ENOUGH_RAM;
    }

    BENCHDNN_PRINT((!fits_cpu_ram ? 2 : 6),
            "Requested: %g GB, benchdnn CPU limit: %g GB, CPU RAM capacity: %g "
            "GB\n",
            GB(total_size_cpu), GB(benchdnn_cpu_limit),
            GB(cpu_device_capacity));

    return res->state == FAILED ? FAIL : OK;
}
886 | |
887 | static void add_md_size(const_dnnl_memory_desc_t md, |
888 | check_mem_size_args_t &check_mem_size_args) { |
889 | const auto mem_size = dnnl_memory_desc_get_size(md); |
890 | // Runtime mem size is not defined. |
891 | if (mem_size == 0 || mem_size == DNNL_RUNTIME_SIZE_VAL) return; |
892 | |
893 | check_mem_size_args.total_size_device += mem_size; // Original memory size. |
894 | if (!check_mem_size_args.add_ref_size) return; |
895 | |
896 | // Reference memories are always tag::abx fp32, hence need re-creating |
897 | // memory descriptor and take its size. |
898 | auto ref_md = dnn_mem_t::init_md( |
899 | query_md_ndims(md), query_md_dims(md), dnnl_f32, tag::abx); |
900 | const auto ref_md_size = dnnl_memory_desc_get_size(ref_md); |
901 | |
902 | // Correctness pass allocates additional tag::abx f32 memory. |
903 | bool compare_mem_factor = !check_mem_size_args.want_input |
904 | && check_mem_size_args.add_ref_size; |
905 | |
906 | // All memory is mapped once it is created and unmapped only before |
907 | // primitive execution. Device memory requires additional buffer for mapped |
908 | // memory. |
909 | // XXX: In DPC++ build oneDNN uses USM memory, which shouldn't require an |
910 | // additional buffer, so map factor should be equal to 0 for DPC++. |
911 | // However due to a driver issue oneDNN pretends that shared USM is not |
912 | // accessible on the host, hence map will allocate an extra memory. |
913 | check_mem_size_args.total_size_cpu += !is_cpu() * mem_size; // Map factor. |
914 | if (check_mem_size_args.is_scratchpad) { |
915 | check_mem_size_args.scratchpad_size += mem_size; |
916 | } else { |
917 | check_mem_size_args.total_size_cpu += ref_md_size; // Reference memory. |
918 | // Comparison memory. |
919 | check_mem_size_args.total_size_cpu += compare_mem_factor * ref_md_size; |
920 | } |
921 | } |
922 | |
923 | bool is_fwd_prop_kind(dnnl_prop_kind_t prop_kind) { |
924 | return prop_kind == dnnl_forward_training |
925 | || prop_kind == dnnl_forward_inference |
926 | || prop_kind == dnnl_prop_kind_undef; |
927 | } |
928 | |
929 | static void get_memory_bytes(check_mem_size_args_t &check_mem_size_args) { |
930 | auto const_pd = check_mem_size_args.pd; |
931 | const int n_idx = check_mem_size_args.want_input |
932 | ? query_n_inputs(const_pd) |
933 | : query_n_outputs(const_pd); |
934 | const auto prop_kind = query_prop_kind(const_pd); |
935 | const bool is_fwd = is_fwd_prop_kind(prop_kind); |
936 | |
937 | #define MD(name) dnnl_query_##name##_md |
938 | std::vector<dnnl_query_t> query_fwd_in_mds {MD(src), MD(weights)}; |
939 | std::vector<dnnl_query_t> query_fwd_out_mds {MD(dst), MD(workspace)}; |
940 | |
941 | std::vector<dnnl_query_t> query_bwd_in_mds { |
942 | MD(src), MD(weights), MD(dst), MD(diff_dst), MD(workspace)}; |
943 | std::vector<dnnl_query_t> query_bwd_out_mds { |
944 | MD(diff_src), MD(diff_weights)}; |
945 | #undef MD |
946 | |
947 | const auto &query_in_mds = is_fwd ? query_fwd_in_mds : query_bwd_in_mds; |
948 | const auto &query_out_mds = is_fwd ? query_fwd_out_mds : query_bwd_out_mds; |
949 | const auto &query_mds |
950 | = check_mem_size_args.want_input ? query_in_mds : query_out_mds; |
951 | |
952 | for_(const auto query : query_mds) |
953 | for (int idx = 0; idx < n_idx; ++idx) { |
954 | const auto &md = query_md(const_pd, query, idx); |
955 | add_md_size(md, check_mem_size_args); |
956 | } |
957 | } |
958 | |
959 | int check_mem_size(const_dnnl_memory_desc_t md, res_t *res) { |
960 | if (!mem_check) return OK; |
961 | |
962 | check_mem_size_args_t check_mem_size_args(nullptr, false, false); |
963 | check_mem_size_args.total_size_device = dnnl_memory_desc_get_size(md); |
964 | |
965 | return check_total_size(check_mem_size_args, res); |
966 | } |
967 | |
968 | int check_mem_size(const_dnnl_primitive_desc_t const_pd, res_t *res) { |
969 | if (!mem_check) return OK; |
970 | |
971 | // Get input sizes. |
972 | check_mem_size_args_t check_mem_size_args(const_pd, /* want_input = */ true, |
973 | /* add_ref_size = */ true); |
974 | get_memory_bytes(check_mem_size_args); |
975 | |
976 | // Get scratchpad size. Treat it as `want_input=true` to avoid comparison |
977 | // factor count. Since scratchpad modes are mutual excluded, it takes sizes |
978 | // of both modes since either of them will report 0 size. |
979 | check_mem_size_args.is_scratchpad = true; |
980 | const auto &scratchpad_md = query_md(const_pd, DNNL_ARG_SCRATCHPAD); |
981 | add_md_size(scratchpad_md, check_mem_size_args); |
982 | check_mem_size_args.is_scratchpad = false; |
983 | check_mem_size_args.total_size_device += query_mem_consumption(const_pd); |
984 | check_mem_size_args.scratchpad_size += query_mem_consumption(const_pd); |
985 | |
986 | // Get output sizes. |
987 | check_mem_size_args.want_input = false; |
988 | get_memory_bytes(check_mem_size_args); |
989 | |
990 | return check_total_size(check_mem_size_args, res); |
991 | } |
992 | |
993 | int (const_dnnl_primitive_desc_t const_pd, res_t *res) { |
994 | check_mem_size_args_t check_mem_in_size_args( |
995 | const_pd, /* want_input = */ true); |
996 | get_memory_bytes(check_mem_in_size_args); // Get input bytes. |
997 | check_mem_size_args_t check_mem_out_size_args( |
998 | const_pd, /* want_input = */ false); |
999 | get_memory_bytes(check_mem_out_size_args); // Get output bytes. |
1000 | |
1001 | // Update read bytes with dst bytes in case of sum post-op. |
1002 | auto const_attr_po = query_post_ops(const_pd); |
1003 | auto po_len = dnnl_post_ops_len(const_attr_po); |
1004 | for (int idx = 0; idx < po_len; ++idx) { |
1005 | const auto kind = dnnl_post_ops_get_kind(const_attr_po, idx); |
1006 | if (kind == dnnl_sum) { |
1007 | const auto &dst_md = query_md(const_pd, DNNL_ARG_DST); |
1008 | add_md_size(dst_md, check_mem_in_size_args); |
1009 | } |
1010 | } |
1011 | |
1012 | res->ibytes = check_mem_in_size_args.total_size_device; |
1013 | res->obytes = check_mem_out_size_args.total_size_device; |
1014 | |
1015 | return OK; |
1016 | } |
1017 | |
1018 | memory_kind_ext_t str2memory_kind(const char *str) { |
1019 | #define CASE(param) \ |
1020 | if (!strcasecmp(#param, str)) return memory_kind_ext_t::param |
1021 | |
1022 | CASE(usm); |
1023 | CASE(buffer); |
1024 | CASE(usm_device); |
1025 | CASE(usm_shared); |
1026 | |
1027 | #undef CASE |
1028 | |
1029 | assert(!"not expected" ); |
1030 | return memory_kind_ext_t::usm; |
1031 | } |
1032 | |
// Prints troubleshooting hints when CPU engine creation fails.
// Only relevant for SYCL CPU runtime where engine creation depends on the
// device filter, TBB, and an OpenCL CPU runtime being present; a no-op in
// all other builds.
static void maybe_print_cpu_engine_error_message() {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
    fprintf(stderr,
            "ERROR: can't create CPU engine. Possible reasons for this error:\n"
            "- Incorrect SYCL_DEVICE_FILTER. The filter must be either unset "
            "or include 'opencl:cpu' devices.\n"
            "- Missing TBB library which is required for OpenCL CPU runtime. "
            "Check that TBB library is available in the system.\n"
            "- Missing OpenCL CPU runtime or other issues with OpenCL CPU "
            "runtime. Check that output from `sycl-ls` or `clinfo -l` commands "
            "include any CPU devices.\n" );
#endif
}
1046 | |
1047 | engine_t::engine_t(dnnl_engine_kind_t engine_kind) : is_owner_(true) { |
1048 | enable_gpu_profiling(); |
1049 | size_t idx = engine_kind == dnnl_cpu ? 0 : engine_index; |
1050 | dnnl_status_t status = dnnl_engine_create(&engine_, engine_kind, idx); |
1051 | if (engine_kind == dnnl_cpu && status != dnnl_success) |
1052 | maybe_print_cpu_engine_error_message(); |
1053 | DNN_SAFE_V(status); |
1054 | } |
1055 | |
// Wraps an existing engine without taking ownership; the destructor will not
// destroy it (`is_owner_` is false).
engine_t::engine_t(dnnl_engine_t engine) : engine_(engine), is_owner_(false) {}
1057 | |
// Copy constructor. For a non-owning `other` it simply shares the handle;
// for an owning one it creates a fresh engine on the same device/context so
// both copies can be destroyed independently.
engine_t::engine_t(const engine_t &other) {
    is_owner_ = other.is_owner_;

    // Borrowed handle: share it, nothing to create or destroy.
    if (!is_owner_) {
        engine_ = other.engine_;
        return;
    }

    dnnl_engine_kind_t engine_kind;
    DNN_SAFE_V(dnnl_engine_get_kind(other.engine_, &engine_kind));

    if (engine_kind == dnnl_cpu) {
#if DNNL_CPU_RUNTIME == DNNL_RUNTIME_SYCL
        // Recreate on the same SYCL device/context as the source engine.
        void *dev;
        void *ctx;
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_sycl_interop_engine_create(&engine_, dev, ctx));
#else
        // Native CPU runtime: a plain CPU engine is always index 0.
        DNN_SAFE_V(dnnl_engine_create(&engine_, dnnl_cpu, 0));
#endif
    } else if (engine_kind == dnnl_gpu) {
#if DNNL_GPU_RUNTIME == DNNL_RUNTIME_OCL
        // Recreate on the same OpenCL device/context as the source engine.
        cl_device_id dev;
        cl_context ctx;
        DNN_SAFE_V(dnnl_ocl_interop_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_ocl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_ocl_interop_engine_create(&engine_, dev, ctx));
#elif DNNL_GPU_RUNTIME == DNNL_RUNTIME_SYCL
        // Recreate on the same SYCL device/context as the source engine.
        void *dev;
        void *ctx;
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_device(other.engine_, &dev));
        DNN_SAFE_V(dnnl_sycl_interop_engine_get_context(other.engine_, &ctx));
        DNN_SAFE_V(dnnl_sycl_interop_engine_create(&engine_, dev, ctx));
#endif
    } else {
        assert(!"unsupported engine kind" );
    }
}
1097 | |
// Destroys the engine only if this object created it; borrowed handles
// (non-owning copies) are left alive for their real owner.
engine_t::~engine_t() {
    if (is_owner_) DNN_SAFE_V(dnnl_engine_destroy(engine_));
}
1101 | |
// Creates a stream on `engine`. In threadpool builds, a CPU stream is bound
// to the threadpool passed via `interop_obj` (falling back to the testing
// threadpool when null); otherwise a default-flag stream is created.
stream_t::stream_t(dnnl_engine_t engine, void *interop_obj) {
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    if (is_cpu(engine)) {
        // `interop_obj` is an opaque pointer; reinterpret as threadpool iface.
        auto tp = static_cast<dnnl::threadpool_interop::threadpool_iface *>(
                interop_obj);
        if (tp == nullptr) tp = dnnl::testing::get_threadpool();
        SAFE_V(dnnl_threadpool_interop_stream_create(&stream_, engine, tp));
        return;
    }
#endif
    DNN_SAFE_V(dnnl_stream_create(&stream_, engine, dnnl_stream_default_flags));
}
1114 | |
// Streams are always owned by this wrapper; unconditionally destroy.
stream_t::~stream_t() {
    DNN_SAFE_V(dnnl_stream_destroy(stream_));
}
1118 | |
// Returns the weights scale factor the library expects for s8s8 reorders.
// CPU ISAs without native s8s8 support require pre-scaled weights; all other
// configurations use a neutral factor of 1.
float reorder_rescale_factor() {
#if DNNL_CPU_RUNTIME != DNNL_RUNTIME_NONE
    if (is_cpu(get_test_engine()))
        return dnnl::impl::cpu::platform::s8s8_weights_scale_factor();
#endif
    return 1.f;
}
1127 | |
1128 | dims_t md2dims(const dnnl_memory_desc_t &md) { |
1129 | auto ndims = query_md_ndims(md); |
1130 | dims_t dims(ndims, 0); |
1131 | for (int d = 0; d < ndims; ++d) |
1132 | dims[d] = query_md_dims(md)[d]; |
1133 | return dims; |
1134 | } |
1135 | |
1136 | dnnl_data_type_t deduce_cfg_data_type( |
1137 | dnnl_data_type_t in_dt, const attr_t &attr, data_kind_t dk) { |
1138 | dnnl_data_type_t dt_ = in_dt; |
1139 | |
1140 | if ((dk == SRC || dk == WEI) && dt_ == dnnl_f32) { |
1141 | // Update data type based on fpmath-mode attribute |
1142 | switch (attr.fpmath_mode) { |
1143 | case dnnl_fpmath_mode_strict: break; |
1144 | case dnnl_fpmath_mode_bf16: dt_ = dnnl_bf16; break; |
1145 | case dnnl_fpmath_mode_tf32: dt_ = dnnl_bf16; break; |
1146 | default: assert(!"unsupported_fpmath_mode" ); SAFE_V(CRIT); |
1147 | } |
1148 | } else if (dk == DST) { |
1149 | // Sum post-op defines the type of filling destination. |
1150 | const int sum_idx = attr.post_ops.find(attr_t::post_ops_t::SUM); |
1151 | if (sum_idx >= 0) { |
1152 | auto sum_dt = attr.post_ops.entry[sum_idx].sum.dt; |
1153 | if (sum_dt != dnnl_data_type_undef) dt_ = sum_dt; |
1154 | } |
1155 | } |
1156 | |
1157 | return dt_; |
1158 | } |
1159 | |