performance_profiling.cpp source code [oneDNN/examples/performance_profiling.cpp]

1	/*******************************************************************************
2	* Copyright 2019-2022 Intel Corporation
3	*
4	* Licensed under the Apache License, Version 2.0 (the "License");
5	* you may not use this file except in compliance with the License.
6	* You may obtain a copy of the License at
7	*
8	* http://www.apache.org/licenses/LICENSE-2.0
9	*
10	* Unless required by applicable law or agreed to in writing, software
11	* distributed under the License is distributed on an "AS IS" BASIS,
12	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13	* See the License for the specific language governing permissions and
14	* limitations under the License.
15	*******************************************************************************/
16
17	/// @example performance_profiling.cpp
18	/// @copybrief performance_profiling_cpp
19	/// > Annotated version: @ref performance_profiling_cpp
20
21	/// @page performance_profiling_cpp Performance Profiling Example
22	/// This example demonstrates the best practices for application performance
23	/// optimizations with oneDNN.
24	///
25	/// > Example code: @ref performance_profiling.cpp
26	///
27	/// This example uses [ONEDNN_VERBOSE](@ref dev_guide_verbose) trace output
28	/// to tune oneDNN code to align
29	/// with the [best practices](@ref dev_guide_inference).
30	///
31	/// It assumes knowledge of memory formats and their usage in
32	/// oneDNN. You can read more about this topic
33	/// [here](@ref memory_format_propagation_cpp).
34	///
35	/// Additionally, see the [article for recommended environment for
36	/// running benchmarks](@ref dev_guide_performance_settings).
37	///
38	/// The example has three different implementations of the mathematical
39	/// operation:
40	/// 1. *Naive implementation* executes 2D convolution followed by
41	/// ReLU on the data in NCHW format. This implementation
42	/// does not align with oneDNN best practices and results in
43	/// suboptimal performance.
44	/// 2. *Blocked format implementation* executes the same sequence of
45	/// operations on the blocked format optimized for convolution
46	/// performance. This implementation uses `format_tag=ANY` to create a
47	/// convolution memory descriptor to determine the data format optimal
48	/// for the convolution implementation. It then propagates the blocked
49	/// format to the non-intensive ReLU. This implementation results
50	/// in better overall performance than the naive implementation.
51	/// 3. *Fused implementation* executes convolution fused with ReLU on
52	/// blocked data format. This implementation uses
53	/// `format_tag=ANY` to create a convolution memory descriptor, and then
54	/// adds ReLU as a post-op to the convolution primitive. This version
55	/// implements all of the best practices for inference resulting in the
56	/// best overall performance.
57	///
58	/// @section performance_profiling_cpp_walkthrough Walkthrough
59	///
60	/// The program in \ref performance_profiling.cpp includes all three
61	/// implementations introduced above. You can select the specific implementation
62	/// using command line options.
63	///
64	/// After compilation, you can execute each implementation with:
65	/// ~~~sh
66	/// ./program.exe [cpu|gpu] [implementation]
67	/// ~~~
68	///
69	/// Before you run the program, set your `ONEDNN_VERBOSE` environment
70	/// variable to 1:
71	/// ~~~sh
72	/// export ONEDNN_VERBOSE=1
73	/// ~~~
74	///
75	/// The program starts by creating oneDNN memory objects in NCHW
76	/// format. These are called `user_` because they are meant to represent the
77	/// user's source data entering oneDNN with the NCHW format.
78	/// @page performance_profiling_cpp
79	/// @snippet performance_profiling.cpp Set dimensions
80	/// @page performance_profiling_cpp
81	/// @note Here the library allocates memory.
82	/// @page performance_profiling_cpp
83	/// @snippet performance_profiling.cpp Create memory objects
84	/// @page performance_profiling_cpp
85	/// @note You can change the batch size to easily increase/decrease the workload.
86	///
87	/// The following descriptions of each implementation will reference each other,
88	/// and are meant to be read in order.
89	///
90
91	#include <iostream>
92	#include <stdexcept>
93	#include <vector>
94
95	#include "oneapi/dnnl/dnnl.hpp"
96
97	#include "example_utils.hpp"
98
99	using namespace dnnl;
100
101	// [Prologue]
102
103	// Set Strides and Padding
104	const memory::dims strides = {`4`, `4`};
105	const memory::dims padding = {`0`, `0`};
106
107	// [Prologue]
108	//
109	// function to init data
110	void init_data(memory &m, float v) {
111	size_t size = m.get_desc().get_size() / sizeof(float);
112	std::vector<float> data(size, v);
113	write_to_dnnl_memory(data.data(), m);
114	}
115
116	// function to execute non-fused relu
117	void create_and_execute_relu(memory &data, engine &eng, stream &s) {
118	// relu operates on whatever data format is given to it
119
120	// create a primitive
121	auto relu_pd = eltwise_forward::primitive_desc(eng,
122	prop_kind::forward_inference, algorithm::eltwise_relu,
123	data.get_desc(), data.get_desc(), `0.f`, `0.f`);
124	auto relu = eltwise_forward(relu_pd);
125
126	// execute it (in-place)
127	relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
128	}
129
130	// [Create post_op attr with relu]
131	// function to create post-op attribute for fused relu
132	primitive_attr create_attr_with_relu_post_op() {
133	// create a post-op with relu
134	post_ops ops;
135	ops.append_eltwise(algorithm::eltwise_relu, `0.f`, `0.f`);
136
137	// create an attribute and set the corresponding post op
138	primitive_attr attr;
139	attr.set_post_ops(ops);
140
141	return attr;
142	}
143	// [Create post_op attr with relu]
144
145	// Implementation for naive convolution on nchw (data) and oihw (weights),
146	// followed by execution of non-fused relu
147	void conv_relu_naive(const memory &user_src, const memory &user_wei,
148	memory user_dst, engine &eng, stream &s) {
149	/// @section performance_profiling_cpp_implementation1 Naive Implementation
150	/// This implementation is launched with the following shell code:
151	/// ~~~sh
152	/// ./program.exe cpu naive
153	/// ~~~
154	/// The program will call the implementation defined in the function
155	/// `conv_relu_naive()`.
156	///
157	/// First it sets the dimensions and format for convolution memory
158	/// descriptors (`_md`) to match `user_` values--one `md` each for source,
159	/// destination, and weight data. Then it uses those `md` to create the
160	/// convolution primitive descriptor `conv_pd`, which tells oneDNN to use
161	/// plain format (NCHW) for the convolution.
162	/// @page performance_profiling_cpp
163	/// @snippet performance_profiling.cpp Create mem_desc
164	// [Create mem_desc]
165	// copy the dimensions and format from user's memory
166	auto conv_src_md = memory::desc(user_src.get_desc());
167	auto conv_wei_md = memory::desc(user_wei.get_desc());
168	auto conv_dst_md = memory::desc(user_dst.get_desc());
169	// [Create mem_desc]
170	/// @page performance_profiling_cpp
171	/// Next the program creates a convolution primitive descriptor `conv_pd`
172	/// and convolution primitive `conv`. These structs will inherit
173	/// NCHW format from `md` by way of the `conv_d`. Finally it creates
174	/// the convolution primitive `conv` and adds it to the stream `s`, and then
175	/// executes the `create_and_execute_relu(user_dst)` function.
176	/// @page performance_profiling_cpp
177	/// @snippet performance_profiling.cpp Create conv_prim_desc
178	// [Create conv_prim_desc]
179	// create a convolution primitive descriptor
180	auto conv_pd = convolution_forward::primitive_desc(eng,
181	prop_kind::forward_inference, algorithm::convolution_direct,
182	conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
183	// [Create conv_prim_desc]
184	/// @page performance_profiling_cpp
185	/// @snippet performance_profiling.cpp Create conv_primitive
186	// [Create conv_primitive]
187	// create convolution primitive
188	auto conv = convolution_forward(conv_pd);
189	// [Create conv_primitive]
190	/// @page performance_profiling_cpp
191	/// @snippet performance_profiling.cpp Add to stream
192	// [Add to stream]
193	// execute convolution by adding it to the stream s
194	conv.execute(s,
195	{{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
196	{DNNL_ARG_DST, user_dst}});
197	// [Add to stream]
198	/// @page performance_profiling_cpp
199	/// @snippet performance_profiling.cpp Create and execute relu
200	// [Create and execute relu]
201	// execute relu (on convolution's destination format, whatever it is)
202	create_and_execute_relu(user_dst, eng, s);
203	s.wait();
204	// [Create and execute relu]
205	/// @page performance_profiling_cpp
206	/// @note The function for creation and execution of ReLU primitive is
207	/// defined elsewhere to keep this example clean. It is an non-intensive
208	/// operation, so the `create_and_execute_relu()` function uses whatever
209	/// the input data format is at the time it is called.
210	///
211	/// Using NCHW data format may result in suboptimal performance for compute
212	/// intensive primitives, as shown in the following ONEDNN_VERBOSE output
213	/// by the convolution and relu execution
214	/// times of 38.3 and 2.9 milliseconds, respectively.
215	///
216	/// ONEDNN_VERBOSE output (see configuration notice\):*
217	/// ~~~sh
218	/// onednn_verbose,exec,cpu,convolution,gemm:jit,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:abcd:f0 bia_undef::undef::f0 dst_f32::blocked:abcd:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,38.314
219	/// onednn_verbose,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:abcd:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.87695
220	/// ~~~
221	/// In Blocked format implementation, we will incorporate the best
222	/// practice of letting oneDNN determine the optimal format
223	/// for convolution primitive.
224	}
225
226	// Implementation for convolution on blocked format for data and
227	// weights, followed by execution of non-fused relu
228	void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
229	engine &eng, stream &s) {
230	/// @page performance_profiling_cpp
231	/// @section performance_profiling_cpp_implementation2 Blocked format implementation
232	/// This implementation is launched with the following shell code:
233	/// ~~~sh
234	/// ./program.exe cpu blocked
235	/// ~~~
236	/// The program will call the implementation defined in the function
237	/// `conv_relu_blocked()`.
238	///
239	/// First it creates the md as in naive implementation. Next it changes
240	/// the dnnl::memory::format_tag for each md to `ANY`. Then it uses those
241	/// md to create the convolution primitive descriptor conv_pd, which tells
242	/// oneDNN to use whatever format it recommends for the convolution.
243	/// oneDNN will choose a friendly blocked format.
244	/// @page performance_profiling_cpp
245	/// @snippet performance_profiling.cpp Create mem_desc with tag=any
246	// [Create mem_desc with tag=any]
247	// copy the dimensions and data type from user's memory and set format tag
248	// to "any" to allow convolution to pick the best implementation
249	auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
250	user_src.get_desc().get_data_type(), memory::format_tag::any);
251	auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
252	user_wei.get_desc().get_data_type(), memory::format_tag::any);
253	auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
254	user_dst.get_desc().get_data_type(), memory::format_tag::any);
255	// [Create mem_desc with tag=any]
256
257	/// Next the program creates a convolution primitive descriptor conv_pd and
258	/// convolution primitive conv as in naive implementation.
259	/// However, in this implementation the structs will inherit blocked format
260	/// from md by way of the conv_d.
261	/// @page performance_profiling_cpp
262	/// @snippet performance_profiling.cpp Create conv_prim_desc implementation2
263	// [Create conv_prim_desc implementation2]
264	// create a convolution primitive descriptor and primitive
265	auto conv_pd = convolution_forward::primitive_desc(eng,
266	prop_kind::forward_inference, algorithm::convolution_direct,
267	conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
268	// [Create conv_prim_desc implementation2]
269	/// Since the resulting convolution primitive will expect
270	/// blocked source data, conditional reorders are inserted to convert
271	/// input data to blocked format if required.
272	/// The input data user_src is NCHW, so this conditional will be triggered:
273	///
274	/// @note The reoders are applied using oneDNN `reorder` primitive.
275	/// @page performance_profiling_cpp
276	/// @snippet performance_profiling.cpp Conditionally create and execute reorder prims
277	// [Conditionally create and execute reorder prims]
278	// prepare convolution source
279	memory conv_src = user_src;
280	if (conv_pd.src_desc() != user_src.get_desc()) {
281	conv_src = memory(conv_pd.src_desc(), eng);
282	auto r_pd = reorder::primitive_desc(user_src, conv_src);
283	reorder(r_pd).execute(s, user_src, conv_src);
284	}
285
286	// prepare convolution weights
287	memory conv_wei = user_wei;
288	if (conv_pd.weights_desc() != user_wei.get_desc()) {
289	conv_wei = memory(conv_pd.weights_desc(), eng);
290	auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
291	reorder(r_pd).execute(s, user_wei, conv_wei);
292	}
293
294	// prepare convolution destination
295	memory conv_dst = user_dst;
296	if (conv_pd.dst_desc() != user_dst.get_desc())
297	conv_dst = memory(conv_pd.dst_desc(), eng);
298	// [Conditionally create and execute reorder prims]
299	/// Finally it creates the convolution primitive `conv` and adds it to the
300	/// stream `s` with the reordered data (`conv_src`, `conv_wei`, `conv_dst1`)
301	/// as inputs and then executes the
302	/// `create_and_execute_relu(conv_dst)` function.
303	/// @page performance_profiling_cpp
304	/// @snippet performance_profiling.cpp Create conv_primitive implementation2
305	// [Create conv_primitive implementation2]
306	// create convolution primitive
307	auto conv = convolution_forward(conv_pd);
308	// [Create conv_primitive implementation2]
309	/// @page performance_profiling_cpp
310	/// @snippet performance_profiling.cpp Add to stream implementation2
311	// [Add to stream implementation2]
312	// execute convolution by adding it to the stream s
313	conv.execute(s,
314	{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
315	{DNNL_ARG_DST, conv_dst}});
316	// [Add to stream implementation2]
317	/// @page performance_profiling_cpp
318	/// @snippet performance_profiling.cpp Create and execute relu implementation2
319	// [Create and execute relu implementation2]
320	// execute relu (on convolution's destination format, whatever it is)
321	create_and_execute_relu(conv_dst, eng, s);
322	// [Create and execute relu implementation2]
323	if (conv_pd.dst_desc() != user_dst.get_desc()) {
324	auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
325	reorder(r_pd).execute(s, conv_dst, user_dst);
326	}
327	s.wait();
328	/// @page performance_profiling_cpp
329	/// Blocked memory format is recommended for oneDNN primitive
330	/// execution and provides better performance, as shown in the
331	/// ONEDNN_VERBOSE output by the convolution and relu execution times of
332	/// 18.3 and 2.7 milliseconds (down from 38.3 and 2.9 in
333	/// naive implementation), respectively.
334	/// In this implementation, there is an additional reorder operation that
335	/// executes before and after the the conv + relu. This small cost is worth
336	/// the gain from executing in blocked format. If fact, it becomes
337	/// negligible when chaining together multiple oneDNN operations in
338	/// succession. In these situations, you can do one reorder at the beginning
339	/// and one at the end of the chain, and only pay the reorder penalty at
340	/// those points in the execution.
341	///
342	/// ONEDNN_VERBOSE output (see configuration notice\):*
343	/// ~~~sh
344	/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0310059
345	/// onednn_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,18.3101
346	/// onednn_verbose,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:aBcd16b:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.66895
347	/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.80396
348	/// ~~~
349	/// This inference implementation is closer to best practices than
350	/// naive implementation* because it uses oneDNN recommended memory*
351	/// format. fused implementation* will futher optimize the performance by*
352	/// fusing convolution with ReLU using oneDNN
353	/// [post-ops](@ref dev_guide_attributes_post_ops).
354	// reorder data to the user's format if needed.
355	}
356
357	// Implementation for convolution on blocked format for data and
358	// weights and the relu operation fused via a post-op attribute added to the
359	// convolution prim_descriptor
360	void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
361	const engine &eng, stream &s) {
362	/// @section performance_profiling_cpp_implementation3 Fused Implementation
363	/// This implementation is launched with the following shell code:
364	/// ~~~sh
365	/// ./program.exe cpu fused
366	/// ~~~
367	/// The program will call the implementation defined in the function
368	/// `conv_relu_fused()`.
369	/// @page performance_profiling_cpp
370	///
371	/// First the memory descriptors and convolution primitive descriptor are
372	/// created as in naive implementation.
373	// copy the dimensions data type from user's memory and set format tag
374	// to any to allow convolution to pick the best implementation
375	auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
376	user_src.get_desc().get_data_type(), memory::format_tag::any);
377	auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
378	user_wei.get_desc().get_data_type(), memory::format_tag::any);
379	auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
380	user_dst.get_desc().get_data_type(), memory::format_tag::any);
381
382	/// Then in preparation for the convolution prim desctiptor, a ReLU post-op
383	/// is built and added to the primitive attribute `attr`:
384	/// @page performance_profiling_cpp
385	/// @snippet performance_profiling.cpp Create post_op attr with relu
386
387	// Next the convolution prim descriptor is created, which inherits the ReLU
388	/// post-op by way of the attributes `attr`:
389	/// @page performance_profiling_cpp
390	/// @snippet performance_profiling.cpp Create prim_desc with attr
391	// [Create prim_desc with attr]
392	// create an attribute for fused relu
393	auto attr = create_attr_with_relu_post_op();
394
395	// create a convolution primitive descriptor
396	auto conv_pd = convolution_forward::primitive_desc(eng,
397	prop_kind::forward_inference, algorithm::convolution_direct,
398	conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding,
399	attr);
400	// [Create prim_desc with attr]
401	/// Then conditional reorders are applied as in blocked format*
402	/// implementation to convert `user_` format NCHW to blocked. Finally, it*
403	/// creates the convolution primitive `conv` and adds it to the stream `s`
404	/// with the reordered data (`conv_src`, `conv_wei`, `conv_dst1`).
405	// prepare convolution source
406	memory conv_src = user_src;
407	if (conv_pd.src_desc() != user_src.get_desc()) {
408	conv_src = memory(conv_pd.src_desc(), eng);
409	auto r_pd = reorder::primitive_desc(user_src, conv_src);
410	reorder(r_pd).execute(s, user_src, conv_src);
411	}
412
413	// prepare convolution weights
414	memory conv_wei = user_wei;
415	if (conv_pd.weights_desc() != user_wei.get_desc()) {
416	conv_wei = memory(conv_pd.weights_desc(), eng);
417	auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
418	reorder(r_pd).execute(s, user_wei, conv_wei);
419	}
420
421	// prepare convolution destination
422	memory conv_dst = user_dst;
423	if (conv_pd.dst_desc() != user_dst.get_desc())
424	conv_dst = memory(conv_pd.dst_desc(), eng);
425	/// @page performance_profiling_cpp
426	/// @note There is no separate addition to the stream for the ReLU
427	/// operation because it has been added as a post-op to the `conv` primitive.
428	/// @page performance_profiling_cpp
429	/// @snippet performance_profiling.cpp Create conv_primitive implementation3
430	// [Create conv_primitive implementation3]
431	// create convolution primitive
432	auto conv = convolution_forward(conv_pd);
433	// [Create conv_primitive implementation3]
434	/// @page performance_profiling_cpp
435	/// @snippet performance_profiling.cpp Add to stream implementation3
436	// [Add to stream implementation3]
437	// execute convolution by adding it to the stream s
438	conv.execute(s,
439	{{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
440	{DNNL_ARG_DST, conv_dst}});
441	// [Add to stream implementation3]
442	// reorder data to user's format if needed
443	if (conv_pd.dst_desc() != user_dst.get_desc()) {
444	auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
445	reorder(r_pd).execute(s, conv_dst, user_dst);
446	}
447	s.wait();
448	/// @page performance_profiling_cpp
449	/// This implementation complies with best practices for f32 inference by
450	/// using the oneDNN recommended blocked format for convolution and
451	/// adding ReLU as a post-op to execute a fused version of conv + ReLU.
452	/// The consequence to following best practices can be seen in the execution
453	/// time of the fused primitive of 18.0 milliseconds.
454	///
455	/// ONEDNN_VERBOSE output (see configuration notice\):*
456	/// ~~~sh
457	/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0148926
458	/// onednn_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,post_ops:'eltwise_relu;';,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,17.968
459	/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.66797
460	/// ~~~
461	}
462
463	/// @page performance_profiling_cpp
464	/// @section performance_profiling_cpp_roundup Performance summary
465	///
466	/// | Implementation | Time, ms | Cumulative speedup |
467	/// | :-- | --: | --: |
468	/// | Naive | 41.2 | 1.0 |
469	/// | Blocked format | 21.0 | 2.0 |
470	/// | Fused | 18.0 | 2.3 |
471	///
472	///
473	/// @page performance_profiling_cpp
474	/// @section performance_profiling_cpp_config Configuration Notice
475	/// @note This example is meant to demonstrate oneDNN best practices.
476	/// @note It is not meant for benchmarking purposes. The platform is not fully
477	/// @note optimized, so the primitive execution times are only relevant in
478	/// @note relation to the other times in this example.
479	///
480	/// Runtime Settings:
481	/// * OMP_NUM_THREADS=14
482	/// * KMP_AFFINITY=granularity=fine,compact
483	///
484	/// Platform:
485	/// * CPU: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
486	/// * Thread(s) per core: 1
487	/// * Core(s) per socket: 28
488	/// * Socket(s): 2
489	/// * NUMA node(s): 2
490	/// * RAM (DDR4): 192 GB
491
492	void performance_profiling(engine::kind engine_kind, int argc, char **argv) {
493	// Initialize engine
494	engine eng(engine_kind, `0`);
495
496	// Initialize stream
497	stream s(eng);
498	// [Set dimensions]
499	// set dimensions for synthetic data and weights
500	const memory::dim BATCH = `128`;
501	const memory::dim IC = `3`, OC = `96`;
502	const memory::dim IH = `227`, KH = `11`, OH = `55`;
503	const memory::dim IW = `227`, KW = `11`, OW = `55`;
504	// [Set dimensions]
505
506	// [Create memory objects]
507	// create oneDNN memory objects for user's tensors (in nchw and oihw formats)
508	auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32,
509	memory::format_tag::nchw},
510	eng);
511	auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32,
512	memory::format_tag::oihw},
513	eng);
514	auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32,
515	memory::format_tag::nchw},
516	eng);
517	// [Create memory objects]
518
519	// fill source, destination, and weights with synthetic data
520	init_data(user_src, `1`);
521	init_data(user_dst, -`1`);
522	init_data(user_wei, `.5`);
523
524	// set implementation ("naive"\|\|"blocked"\|\|"fused") setting implementation
525	// to "validation" will run all implementations
526	std::string implementation;
527	if (argc <= `2`)
528	implementation = "validation";
529	else if (argc == `3`)
530	implementation = argv[`2`];
531
532	if (!(implementation == "validation" \|\| implementation == "naive"
533	\|\| implementation == "blocked" \|\| implementation == "fused")) {
534	std::cout << "The implementation can be one of:\n";
535	std::cout << " - naive: NCHW format without fusion\n";
536	std::cout << " - blocked: format propagation without fusion\n";
537	std::cout << " - fused: format propagation with fusion\n";
538	std::cout << " - validation: runs all implementations\n\n";
539	std::cout << "Validation will run if no parameters are specified.\n\n";
540
541	throw std::invalid_argument("Incorrect input arguments.");
542	}
543
544	if (implementation == "naive" \|\| implementation == "validation") {
545	std::cout << "Implementation: naive.\n";
546	// run conv + relu w/o fusing
547	conv_relu_naive(user_src, user_wei, user_dst, eng, s);
548	std::cout << "Conv + ReLU w/ nchw format completed.\n";
549	}
550
551	if (implementation == "blocked" \|\| implementation == "validation") {
552	std::cout << "Implementation: blocked.\n";
553	// run conv + relu w/o fusing
554	conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
555	std::cout << "Conv + ReLU w/ blocked format completed.\n";
556	}
557
558	if (implementation == "fused" \|\| implementation == "validation") {
559	std::cout << "Implementation: fused.\n";
560	// run conv + relu w/ fusing
561	conv_relu_fused(user_src, user_wei, user_dst, eng, s);
562	std::cout << "Conv + ReLU w/ fusing completed.\n";
563	}
564	}
565
566	int main(int argc, char **argv) {
567	engine::kind engine_kind = parse_engine_kind(argc, argv, `1`);
568	return handle_example_errors(
569	performance_profiling, engine_kind, argc, argv);
570	}
571

Browse the source code of oneDNN/examples/performance_profiling.cpp