/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example performance_profiling.cpp
/// @copybrief performance_profiling_cpp
/// > Annotated version: @ref performance_profiling_cpp

/// @page performance_profiling_cpp Performance Profiling Example
/// This example demonstrates the best practices for application performance
/// optimizations with oneDNN.
///
/// > Example code: @ref performance_profiling.cpp
///
/// This example uses [ONEDNN_VERBOSE](@ref dev_guide_verbose) trace output
/// to tune oneDNN code to align
/// with the [best practices](@ref dev_guide_inference).
///
/// It assumes knowledge of memory formats and their usage in
/// oneDNN. You can read more about this topic
/// [here](@ref memory_format_propagation_cpp).
///
/// Additionally, see the [article on the recommended environment for
/// running benchmarks](@ref dev_guide_performance_settings).
///
/// The example has three different implementations of the same mathematical
/// operation (2D convolution followed by ReLU):
/// 1. *Naive implementation* executes 2D convolution followed by
///    ReLU on the data in **NCHW** format. This implementation
///    does not align with oneDNN best practices and results in
///    suboptimal performance.
/// 2. *Blocked format implementation* executes the same operation
///    sequence on the **blocked format** optimized for convolution
///    performance. This implementation uses `format_tag=ANY` to create a
///    convolution memory descriptor to determine the data format optimal
///    for the convolution implementation. It then **propagates the blocked
///    format** to the non-intensive ReLU. This implementation results
///    in better overall performance than the naive implementation.
/// 3. *Fused implementation* executes convolution fused with ReLU on
///    blocked data format. This implementation uses
///    `format_tag=ANY` to create a convolution memory descriptor, and then
///    adds ReLU as a **post-op** to the convolution primitive. This version
///    implements all of the best practices for inference, resulting in the
///    best overall performance.
///
/// @section performance_profiling_cpp_walkthrough Walkthrough
///
/// The program in \ref performance_profiling.cpp includes all three
/// implementations introduced above. You can select the specific implementation
/// using command line options.
///
/// After compilation, you can execute each implementation with:
/// ~~~sh
/// ./program.exe [cpu|gpu] [implementation]
/// ~~~
///
/// Before you run the program, set your `ONEDNN_VERBOSE` environment
/// variable to 1:
/// ~~~sh
/// export ONEDNN_VERBOSE=1
/// ~~~
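///
/// As a side note (a minimal sketch, not part of this example): if setting an
/// environment variable is inconvenient, the verbose level can also be set
/// programmatically with `dnnl::set_verbose()`, assuming your oneDNN version
/// exposes that call, before any primitive executes:
/// ~~~cpp
/// #include "oneapi/dnnl/dnnl.hpp"
///
/// int main() {
///     dnnl::set_verbose(1); // intended to match ONEDNN_VERBOSE=1
///     // ... create and execute primitives as in this example ...
///     return 0;
/// }
/// ~~~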
///
/// The program starts by creating oneDNN memory objects in **NCHW**
/// format. These are called `user_` because they are meant to represent the
/// user's source data entering oneDNN with the NCHW format.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Set dimensions
/// @page performance_profiling_cpp
/// @note Here the library allocates memory.
/// @page performance_profiling_cpp
/// @snippet performance_profiling.cpp Create memory objects
/// @page performance_profiling_cpp
/// @note You can change the batch size to easily increase/decrease the workload.
///
/// The following descriptions of each implementation will reference each other,
/// and are meant to be read in order.
///

#include <iostream>
#include <stdexcept>
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

// [Prologue]

// set strides and padding
const memory::dims strides = {4, 4};
const memory::dims padding = {0, 0};

// [Prologue]
//
// function to init data
void init_data(memory &m, float v) {
    size_t size = m.get_desc().get_size() / sizeof(float);
    std::vector<float> data(size, v);
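    // write_to_dnnl_memory() (from example_utils.hpp) copies the buffer into
    // the memory object and also handles engines whose memory is not directly
    // host-addressable (for example, GPU buffers)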
    write_to_dnnl_memory(data.data(), m);
}

// function to execute non-fused relu
void create_and_execute_relu(memory &data, engine &eng, stream &s) {
    // relu operates on whatever data format is given to it

    // create a primitive
    auto relu_pd = eltwise_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::eltwise_relu,
            data.get_desc(), data.get_desc(), 0.f, 0.f);
    auto relu = eltwise_forward(relu_pd);

    // execute it (in-place)
    relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}});
}

// [Create post_op attr with relu]
// function to create post-op attribute for fused relu
primitive_attr create_attr_with_relu_post_op() {
    // create a post-op with relu
    post_ops ops;
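    // for eltwise_relu, alpha is the negative slope and beta is unused;
    // alpha = 0.f gives the standard ReLU, max(0, x)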
    ops.append_eltwise(algorithm::eltwise_relu, 0.f, 0.f);

    // create an attribute and set the corresponding post op
    primitive_attr attr;
    attr.set_post_ops(ops);

    return attr;
}
// [Create post_op attr with relu]
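// usage note: the returned attribute carries exactly one post-op, which a
// caller could verify with attr.get_post_ops().len() == 1 (get_post_ops() and
// post_ops::len() are part of the public C++ API)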

// Implementation for naive convolution on nchw (data) and oihw (weights),
// followed by execution of non-fused relu
void conv_relu_naive(const memory &user_src, const memory &user_wei,
        memory user_dst, engine &eng, stream &s) {
    /// @section performance_profiling_cpp_implementation1 Naive Implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu naive
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_naive()`.
    ///
    /// First it sets the dimensions and format for the convolution memory
    /// descriptors (`_md`) to match the `user_` values: one `md` each for
    /// source, destination, and weights. Then it uses those `md` to create the
    /// convolution primitive descriptor `conv_pd`, which tells oneDNN to use
    /// plain (NCHW) format for the convolution.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create mem_desc
    // [Create mem_desc]
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());
    // [Create mem_desc]
    /// @page performance_profiling_cpp
    /// Next the program creates a convolution primitive descriptor `conv_pd`,
    /// which inherits the NCHW format directly from the memory descriptors.
    /// Finally it creates the convolution primitive `conv`, adds it to the
    /// stream `s`, and then executes `create_and_execute_relu()` on
    /// `user_dst`.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_prim_desc
    // [Create conv_prim_desc]
    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
    // [Create conv_prim_desc]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive
    // [Create conv_primitive]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream
    // [Add to stream]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
                    {DNNL_ARG_DST, user_dst}});
    // [Add to stream]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create and execute relu
    // [Create and execute relu]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(user_dst, eng, s);
    s.wait();
    // [Create and execute relu]
    /// @page performance_profiling_cpp
    /// @note The function that creates and executes the ReLU primitive is
    /// defined elsewhere to keep this example clean. It is a non-intensive
    /// operation, so the `create_and_execute_relu()` function uses whatever
    /// the input data format is at the time it is called.
    ///
    /// Using the NCHW data format may result in suboptimal performance for
    /// compute-intensive primitives, as shown in the following ONEDNN_VERBOSE
    /// output (the last field of each line is the execution time in
    /// milliseconds) by the convolution and relu execution times of
    /// 38.3 and 2.9 milliseconds, respectively.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,exec,cpu,convolution,gemm:jit,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:abcd:f0 bia_undef::undef::f0 dst_f32::blocked:abcd:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,38.314
    /// onednn_verbose,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:abcd:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.87695
    /// ~~~
    /// In *Blocked format implementation*, we will incorporate the best
    /// practice of letting oneDNN determine the optimal format
    /// for the convolution primitive.
}

// Implementation for convolution on blocked format for data and
// weights, followed by execution of non-fused relu
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    /// @page performance_profiling_cpp
    /// @section performance_profiling_cpp_implementation2 Blocked format implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu blocked
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_blocked()`.
    ///
    /// First it creates the `md` as in *naive implementation*. Next it sets
    /// the dnnl::memory::format_tag for each `md` to `ANY`. Then it uses those
    /// `md` to create the convolution primitive descriptor `conv_pd`, which
    /// tells oneDNN to use whatever format it recommends for the convolution.
    /// oneDNN will choose a blocked format that is friendly to the convolution
    /// implementation.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create mem_desc with tag=any
    // [Create mem_desc with tag=any]
    // copy the dimensions and data type from user's memory and set format tag
    // to "any" to allow convolution to pick the best implementation
    auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
            user_src.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
            user_wei.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
            user_dst.get_desc().get_data_type(), memory::format_tag::any);
    // [Create mem_desc with tag=any]

    /// Next the program creates a convolution primitive descriptor `conv_pd`
    /// and convolution primitive `conv` as in *naive implementation*.
    /// However, in this implementation the primitive descriptor selects a
    /// blocked format, because the memory descriptors were created with
    /// `format_tag::any`.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_prim_desc implementation2
    // [Create conv_prim_desc implementation2]
    // create a convolution primitive descriptor and primitive
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
    // [Create conv_prim_desc implementation2]
    /// Since the resulting convolution primitive will expect
    /// blocked source data, conditional reorders are inserted to convert
    /// input data to the blocked format if required.
    /// The input data `user_src` is in NCHW format, so this conditional is
    /// triggered:
    ///
    /// @note The reorders are applied using the oneDNN `reorder` primitive.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Conditionally create and execute reorder prims
    // [Conditionally create and execute reorder prims]
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    // [Conditionally create and execute reorder prims]
    /// Finally it creates the convolution primitive `conv` and adds it to the
    /// stream `s` with the reordered data (`conv_src`, `conv_wei`, `conv_dst`)
    /// as inputs, and then executes the
    /// `create_and_execute_relu(conv_dst)` function.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive implementation2
    // [Create conv_primitive implementation2]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation2]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream implementation2
    // [Add to stream implementation2]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation2]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create and execute relu implementation2
    // [Create and execute relu implementation2]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(conv_dst, eng, s);
    // [Create and execute relu implementation2]
    // reorder data back to the user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
    /// @page performance_profiling_cpp
    /// Blocked memory format is recommended for oneDNN primitive
    /// execution and provides better performance, as shown in the
    /// ONEDNN_VERBOSE output by the convolution and relu execution times of
    /// 18.3 and 2.7 milliseconds (down from 38.3 and 2.9 in
    /// *naive implementation*), respectively.
    /// In this implementation, additional reorder operations execute
    /// before and after the conv + relu. This small cost is worth
    /// the gain from executing in blocked format. In fact, it becomes
    /// negligible when chaining together multiple oneDNN operations in
    /// succession. In these situations, you can do one reorder at the beginning
    /// and one at the end of the chain, and only pay the reorder penalty at
    /// those points in the execution, as in the sketch below.
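    ///
    /// A minimal sketch of that idea (hypothetical: `conv1_pd` and `conv2_pd`
    /// are two convolutions' primitive descriptors, with `conv2_pd` created
    /// using `conv1_pd.dst_desc()` as its source `md`; `wei1_blk`, `wei2_blk`,
    /// and `dst_blk` are pre-reordered blocked memory objects; none of these
    /// are part of this example):
    /// ~~~cpp
    /// // one reorder in: user NCHW -> blocked
    /// memory src_blk(conv1_pd.src_desc(), eng);
    /// reorder(reorder::primitive_desc(user_src, src_blk))
    ///         .execute(s, user_src, src_blk);
    ///
    /// // the whole chain runs on blocked data with no intermediate reorders
    /// memory mid_blk(conv1_pd.dst_desc(), eng);
    /// convolution_forward(conv1_pd).execute(s,
    ///         {{DNNL_ARG_SRC, src_blk}, {DNNL_ARG_WEIGHTS, wei1_blk},
    ///                 {DNNL_ARG_DST, mid_blk}});
    /// convolution_forward(conv2_pd).execute(s,
    ///         {{DNNL_ARG_SRC, mid_blk}, {DNNL_ARG_WEIGHTS, wei2_blk},
    ///                 {DNNL_ARG_DST, dst_blk}});
    ///
    /// // one reorder out: blocked -> user NCHW
    /// reorder(reorder::primitive_desc(dst_blk, user_dst))
    ///         .execute(s, dst_blk, user_dst);
    /// ~~~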
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0310059
    /// onednn_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,18.3101
    /// onednn_verbose,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:aBcd16b:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.66895
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.80396
    /// ~~~
    /// This inference implementation is closer to best practices than
    /// *naive implementation* because it uses the oneDNN-recommended memory
    /// format. *Fused implementation* will further optimize the performance by
    /// fusing convolution with ReLU using oneDNN
    /// [post-ops](@ref dev_guide_attributes_post_ops).
}

// Implementation for convolution on blocked format for data and
// weights, with the relu operation fused via a post-op attribute added to the
// convolution prim_descriptor
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
        const engine &eng, stream &s) {
    /// @section performance_profiling_cpp_implementation3 Fused Implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu fused
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_fused()`.
    /// @page performance_profiling_cpp
    ///
    /// First the memory descriptors and convolution primitive descriptor are
    /// created as in *blocked format implementation*.
    // copy the dimensions and data type from user's memory and set format tag
    // to any to allow convolution to pick the best implementation
    auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
            user_src.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
            user_wei.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
            user_dst.get_desc().get_data_type(), memory::format_tag::any);

    /// Then, in preparation for the convolution primitive descriptor, a ReLU
    /// post-op is built and added to the primitive attribute `attr`:
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create post_op attr with relu

    /// Next the convolution primitive descriptor is created, which inherits
    /// the ReLU post-op by way of the attribute `attr`:
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create prim_desc with attr
    // [Create prim_desc with attr]
    // create an attribute for fused relu
    auto attr = create_attr_with_relu_post_op();

    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding,
            attr);
    // [Create prim_desc with attr]
    /// Then conditional reorders are applied as in *blocked format
    /// implementation* to convert the `user_` NCHW format to blocked. Finally,
    /// it creates the convolution primitive `conv` and adds it to the stream
    /// `s` with the reordered data (`conv_src`, `conv_wei`, `conv_dst`).
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    /// @page performance_profiling_cpp
    /// @note There is no separate addition to the stream for the ReLU
    /// operation because it has been added as a post-op to the `conv` primitive.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive implementation3
    // [Create conv_primitive implementation3]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation3]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream implementation3
    // [Add to stream implementation3]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation3]
    // reorder data back to the user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
    /// @page performance_profiling_cpp
    /// This implementation complies with best practices for f32 inference by
    /// using the oneDNN recommended blocked format for convolution and
    /// adding ReLU as a post-op to execute a fused version of conv + ReLU.
    /// The consequence of following best practices can be seen in the
    /// execution time of the fused primitive: 18.0 milliseconds.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0148926
    /// onednn_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,post_ops:'eltwise_relu;';,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,17.968
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.66797
    /// ~~~
}

/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_roundup Performance summary
///
/// | Implementation | Time, ms | Cumulative speedup |
/// | :-- | --: | --: |
/// | Naive | 41.2 | 1.0 |
/// | Blocked format | 21.0 | 2.0 |
/// | Fused | 18.0 | 2.3 |
///
/// ** **
/// @page performance_profiling_cpp
/// @section performance_profiling_cpp_config Configuration Notice
/// @note This example is meant to demonstrate oneDNN best practices.
/// It is not meant for benchmarking purposes. The platform is not fully
/// optimized, so the primitive execution times are only relevant in
/// relation to the other times in this example.
///
/// Runtime Settings:
/// * OMP_NUM_THREADS=14
/// * KMP_AFFINITY=granularity=fine,compact
///
/// Platform:
/// * CPU: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz
/// * Thread(s) per core: 1
/// * Core(s) per socket: 28
/// * Socket(s): 2
/// * NUMA node(s): 2
/// * RAM (DDR4): 192 GB

void performance_profiling(engine::kind engine_kind, int argc, char **argv) {
    // Initialize engine
    engine eng(engine_kind, 0);

    // Initialize stream
    stream s(eng);
    // [Set dimensions]
    // set dimensions for synthetic data and weights
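    // (these shapes correspond to the first convolution layer of AlexNet)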
    const memory::dim BATCH = 128;
    const memory::dim IC = 3, OC = 96;
    const memory::dim IH = 227, KH = 11, OH = 55;
    const memory::dim IW = 227, KW = 11, OW = 55;
    // [Set dimensions]

    // [Create memory objects]
    // create oneDNN memory objects for user's tensors (in nchw and oihw formats)
    auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                   memory::format_tag::oihw},
            eng);
    auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32,
                                   memory::format_tag::nchw},
            eng);
    // [Create memory objects]

    // fill source, destination, and weights with synthetic data
    init_data(user_src, 1);
    init_data(user_dst, -1);
    init_data(user_wei, .5);

    // select the implementation ("naive", "blocked", or "fused"); the default,
    // "validation", runs all three implementations
    std::string implementation;
    if (argc <= 2)
        implementation = "validation";
    else if (argc == 3)
        implementation = argv[2];

    if (!(implementation == "validation" || implementation == "naive"
                || implementation == "blocked" || implementation == "fused")) {
        std::cout << "The implementation can be one of:\n";
        std::cout << " - naive: NCHW format without fusion\n";
        std::cout << " - blocked: format propagation without fusion\n";
        std::cout << " - fused: format propagation with fusion\n";
        std::cout << " - validation: runs all implementations\n\n";
        std::cout << "Validation will run if no parameters are specified.\n\n";

        throw std::invalid_argument("Incorrect input arguments.");
    }

    if (implementation == "naive" || implementation == "validation") {
        std::cout << "Implementation: naive.\n";
        // run conv + relu w/o fusing
        conv_relu_naive(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ nchw format completed.\n";
    }

    if (implementation == "blocked" || implementation == "validation") {
        std::cout << "Implementation: blocked.\n";
        // run conv + relu w/o fusing
        conv_relu_blocked(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ blocked format completed.\n";
    }

    if (implementation == "fused" || implementation == "validation") {
        std::cout << "Implementation: fused.\n";
        // run conv + relu w/ fusing
        conv_relu_fused(user_src, user_wei, user_dst, eng, s);
        std::cout << "Conv + ReLU w/ fusing completed.\n";
    }
}

int main(int argc, char **argv) {
    engine::kind engine_kind = parse_engine_kind(argc, argv, 1);
    return handle_example_errors(
            performance_profiling, engine_kind, argc, argv);
}