1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example performance_profiling.cpp |
18 | /// @copybrief performance_profiling_cpp |
19 | /// > Annotated version: @ref performance_profiling_cpp |
20 | |
21 | /// @page performance_profiling_cpp Performance Profiling Example |
22 | /// This example demonstrates the best practices for application performance |
23 | /// optimizations with oneDNN. |
24 | /// |
25 | /// > Example code: @ref performance_profiling.cpp |
26 | /// |
27 | /// This example uses [ONEDNN_VERBOSE](@ref dev_guide_verbose) trace output |
28 | /// to tune oneDNN code to align |
29 | /// with the [best practices](@ref dev_guide_inference). |
30 | /// |
31 | /// It assumes knowledge of memory formats and their usage in |
32 | /// oneDNN. You can read more about this topic |
33 | /// [here](@ref memory_format_propagation_cpp). |
34 | /// |
35 | /// Additionally, see the [article for recommended environment for |
36 | /// running benchmarks](@ref dev_guide_performance_settings). |
37 | /// |
38 | /// The example has three different implementations of the mathematical |
39 | /// operation: |
40 | /// 1. *Naive implementation* executes 2D convolution followed by |
41 | /// ReLU on the data in **NCHW** format. This implementation |
42 | /// does not align with oneDNN best practices and results in |
43 | /// suboptimal performance. |
44 | /// 2. *Blocked format implementation* executes the same operations |
45 | /// sequence on the **blocked format** optimized for convolution |
46 | /// performance. This implementation uses `format_tag=ANY` to create a |
47 | /// convolution memory descriptor to determine the data format optimal |
48 | /// for the convolution implementation. It then **propagates the blocked |
49 | /// format** to the non-intensive ReLU. This implementation results |
50 | /// in better overall performance than the naive implementation. |
51 | /// 3. *Fused implementation* executes convolution fused with ReLU on |
52 | /// blocked data format. This implementation uses |
53 | /// `format_tag=ANY` to create a convolution memory descriptor, and then |
54 | /// adds ReLU as a **post-op** to the convolution primitive. This version |
55 | /// implements all of the best practices for inference resulting in the |
56 | /// best overall performance. |
57 | /// |
58 | /// @section performance_profiling_cpp_walkthrough Walkthrough |
59 | /// |
60 | /// The program in \ref performance_profiling.cpp includes all three |
61 | /// implementations introduced above. You can select the specific implementation |
62 | /// using command line options. |
63 | /// |
64 | /// After compilation, you can execute each implementation with: |
65 | /// ~~~sh |
66 | /// ./program.exe [cpu|gpu] [implementation] |
67 | /// ~~~ |
68 | /// |
69 | /// Before you run the program, set your `ONEDNN_VERBOSE` environment |
70 | /// variable to 1: |
71 | /// ~~~sh |
72 | /// export ONEDNN_VERBOSE=1 |
73 | /// ~~~ |
74 | /// |
75 | /// The program starts by creating oneDNN memory objects in **NCHW** |
76 | /// format. These are called `user_` because they are meant to represent the |
77 | /// user's source data entering oneDNN with the NCHW format. |
78 | /// @page performance_profiling_cpp |
79 | /// @snippet performance_profiling.cpp Set dimensions |
80 | /// @page performance_profiling_cpp |
81 | /// @note Here the library allocates memory. |
82 | /// @page performance_profiling_cpp |
83 | /// @snippet performance_profiling.cpp Create memory objects |
84 | /// @page performance_profiling_cpp |
85 | /// @note You can change the batch size to easily increase/decrease the workload. |
86 | /// |
87 | /// The following descriptions of each implementation will reference each other, |
88 | /// and are meant to be read in order. |
89 | /// |
90 | |
91 | #include <iostream> |
92 | #include <stdexcept> |
93 | #include <vector> |
94 | |
95 | #include "oneapi/dnnl/dnnl.hpp" |
96 | |
97 | #include "example_utils.hpp" |
98 | |
99 | using namespace dnnl; |
100 | |
101 | // [Prologue] |
102 | |
103 | // Set Strides and Padding |
104 | const memory::dims strides = {4, 4}; |
105 | const memory::dims padding = {0, 0}; |
106 | |
107 | // [Prologue] |
108 | // |
109 | // function to init data |
110 | void init_data(memory &m, float v) { |
111 | size_t size = m.get_desc().get_size() / sizeof(float); |
112 | std::vector<float> data(size, v); |
113 | write_to_dnnl_memory(data.data(), m); |
114 | } |
115 | |
116 | // function to execute non-fused relu |
117 | void create_and_execute_relu(memory &data, engine &eng, stream &s) { |
118 | // relu operates on whatever data format is given to it |
119 | |
120 | // create a primitive |
121 | auto relu_pd = eltwise_forward::primitive_desc(eng, |
122 | prop_kind::forward_inference, algorithm::eltwise_relu, |
123 | data.get_desc(), data.get_desc(), 0.f, 0.f); |
124 | auto relu = eltwise_forward(relu_pd); |
125 | |
126 | // execute it (in-place) |
127 | relu.execute(s, {{DNNL_ARG_SRC, data}, {DNNL_ARG_DST, data}}); |
128 | } |
129 | |
130 | // [Create post_op attr with relu] |
131 | // function to create post-op attribute for fused relu |
132 | primitive_attr create_attr_with_relu_post_op() { |
133 | // create a post-op with relu |
134 | post_ops ops; |
135 | ops.append_eltwise(algorithm::eltwise_relu, 0.f, 0.f); |
136 | |
137 | // create an attribute and set the corresponding post op |
138 | primitive_attr attr; |
139 | attr.set_post_ops(ops); |
140 | |
141 | return attr; |
142 | } |
143 | // [Create post_op attr with relu] |
144 | |
// Implementation for naive convolution on nchw (data) and oihw (weights),
// followed by execution of non-fused relu.
//
// user_src, user_wei: source and weights memory objects; their descriptors
//                     are used unchanged for the convolution.
// user_dst:           destination memory object; the convolution writes into
//                     it and relu is then executed on it in-place.
//                     NOTE(review): taken by value -- presumably dnnl::memory
//                     is a handle, so the caller's buffer is updated; confirm.
// eng:                engine used to create the primitives.
// s:                  stream used to execute the primitives; the function
//                     waits on it before returning.
void conv_relu_naive(const memory &user_src, const memory &user_wei,
        memory user_dst, engine &eng, stream &s) {
    /// @section performance_profiling_cpp_implementation1 Naive Implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu naive
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_naive()`.
    ///
    /// First it sets the dimensions and format for convolution memory
    /// descriptors (`_md`) to match `user_` values--one `md` each for source,
    /// destination, and weight data. Then it uses those `md` to create the
    /// convolution primitive descriptor `conv_pd`, which tells oneDNN to use
    /// plain format (NCHW) for the convolution.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create mem_desc
    // [Create mem_desc]
    // copy the dimensions and format from user's memory
    auto conv_src_md = memory::desc(user_src.get_desc());
    auto conv_wei_md = memory::desc(user_wei.get_desc());
    auto conv_dst_md = memory::desc(user_dst.get_desc());
    // [Create mem_desc]
    /// @page performance_profiling_cpp
    /// Next the program creates a convolution primitive descriptor `conv_pd`
    /// and convolution primitive `conv`. The primitive descriptor takes the
    /// NCHW format from the `md`, and the primitive is created from
    /// `conv_pd`. Finally it creates
    /// the convolution primitive `conv` and adds it to the stream `s`, and then
    /// executes the `create_and_execute_relu(user_dst)` function.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_prim_desc
    // [Create conv_prim_desc]
    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
    // [Create conv_prim_desc]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive
    // [Create conv_primitive]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream
    // [Add to stream]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, user_src}, {DNNL_ARG_WEIGHTS, user_wei},
                    {DNNL_ARG_DST, user_dst}});
    // [Add to stream]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create and execute relu
    // [Create and execute relu]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(user_dst, eng, s);
    s.wait();
    // [Create and execute relu]
    /// @page performance_profiling_cpp
    /// @note The function for creation and execution of ReLU primitive is
    /// defined elsewhere to keep this example clean. It is a non-intensive
    /// operation, so the `create_and_execute_relu()` function uses whatever
    /// the input data format is at the time it is called.
    ///
    /// Using NCHW data format may result in suboptimal performance for compute
    /// intensive primitives, as shown in the following ONEDNN_VERBOSE output
    /// by the convolution and relu execution
    /// times of 38.3 and 2.9 milliseconds, respectively.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,exec,cpu,convolution,gemm:jit,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:abcd:f0 bia_undef::undef::f0 dst_f32::blocked:abcd:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,38.314
    /// onednn_verbose,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:abcd:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.87695
    /// ~~~
    /// In *Blocked format implementation*, we will incorporate the best
    /// practice of letting oneDNN determine the optimal format
    /// for convolution primitive.
}
225 | |
// Implementation for convolution on blocked format for data and
// weights, followed by execution of non-fused relu.
//
// user_src, user_wei: source and weights memory objects in the user's format;
//                     reordered into the format chosen by the convolution
//                     primitive descriptor when the descriptors differ.
// user_dst:           destination memory object in the user's format; the
//                     result is reordered back into it when the convolution
//                     destination descriptor differs.
// eng:                engine used to create primitives and temporary memory.
// s:                  stream used to execute the primitives; the function
//                     waits on it before returning.
void conv_relu_blocked(memory user_src, memory user_wei, memory user_dst,
        engine &eng, stream &s) {
    /// @page performance_profiling_cpp
    /// @section performance_profiling_cpp_implementation2 Blocked format implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu blocked
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_blocked()`.
    ///
    /// First it creates the md as in **naive implementation**. Next it changes
    /// the dnnl::memory::format_tag for each md to `ANY`. Then it uses those
    /// md to create the convolution primitive descriptor conv_pd, which tells
    /// oneDNN to use whatever format it recommends for the convolution.
    /// oneDNN will choose a friendly blocked format.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create mem_desc with tag=any
    // [Create mem_desc with tag=any]
    // copy the dimensions and data type from user's memory and set format tag
    // to "any" to allow convolution to pick the best implementation
    auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
            user_src.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
            user_wei.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
            user_dst.get_desc().get_data_type(), memory::format_tag::any);
    // [Create mem_desc with tag=any]

    /// Next the program creates a convolution primitive descriptor conv_pd and
    /// convolution primitive conv as in naive implementation.
    /// However, in this implementation the primitive descriptor is created
    /// from md with format tag `ANY`, so the format is chosen by conv_pd.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_prim_desc implementation2
    // [Create conv_prim_desc implementation2]
    // create a convolution primitive descriptor and primitive
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding);
    // [Create conv_prim_desc implementation2]
    /// Since the resulting convolution primitive will expect
    /// blocked source data, conditional reorders are inserted to convert
    /// input data to blocked format if required.
    /// The input data user_src is NCHW, so this conditional will be triggered:
    ///
    /// @note The reorders are applied using the oneDNN `reorder` primitive.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Conditionally create and execute reorder prims
    // [Conditionally create and execute reorder prims]
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    // [Conditionally create and execute reorder prims]
    /// Finally it creates the convolution primitive `conv` and adds it to the
    /// stream `s` with the reordered data (`conv_src`, `conv_wei`, `conv_dst`)
    /// as inputs and then executes the
    /// `create_and_execute_relu(conv_dst)` function.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive implementation2
    // [Create conv_primitive implementation2]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation2]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream implementation2
    // [Add to stream implementation2]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation2]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create and execute relu implementation2
    // [Create and execute relu implementation2]
    // execute relu (on convolution's destination format, whatever it is)
    create_and_execute_relu(conv_dst, eng, s);
    // [Create and execute relu implementation2]
    // reorder data to the user's format if needed.
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
    /// @page performance_profiling_cpp
    /// Blocked memory format is recommended for oneDNN primitive
    /// execution and provides better performance, as shown in the
    /// ONEDNN_VERBOSE output by the convolution and relu execution times of
    /// 18.3 and 2.7 milliseconds (down from 38.3 and 2.9 in
    /// *naive implementation*), respectively.
    /// In this implementation, there is an additional reorder operation that
    /// executes before and after the conv + relu. This small cost is worth
    /// the gain from executing in blocked format. In fact, it becomes
    /// negligible when chaining together multiple oneDNN operations in
    /// succession. In these situations, you can do one reorder at the beginning
    /// and one at the end of the chain, and only pay the reorder penalty at
    /// those points in the execution.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0310059
    /// onednn_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,18.3101
    /// onednn_verbose,exec,cpu,eltwise,jit:avx512_common,forward_inference,data_f32::blocked:aBcd16b:f0 diff_undef::undef::f0,,alg:eltwise_relu alpha:0 beta:0,128x96x55x55,2.66895
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.80396
    /// ~~~
    /// This inference implementation is closer to best practices than
    /// *naive implementation* because it uses oneDNN recommended memory
    /// format. *fused implementation* will further optimize the performance by
    /// fusing convolution with ReLU using oneDNN
    /// [post-ops](@ref dev_guide_attributes_post_ops).
}
356 | |
// Implementation for convolution on blocked format for data and
// weights and the relu operation fused via a post-op attribute added to the
// convolution prim_descriptor.
//
// user_src, user_wei: source and weights memory objects in the user's format;
//                     reordered into the format chosen by the convolution
//                     primitive descriptor when the descriptors differ.
// user_dst:           destination memory object in the user's format; the
//                     result is reordered back into it when the convolution
//                     destination descriptor differs.
// eng:                engine used to create primitives and temporary memory.
// s:                  stream used to execute the primitives; the function
//                     waits on it before returning.
void conv_relu_fused(memory user_src, memory user_wei, memory user_dst,
        const engine &eng, stream &s) {
    /// @section performance_profiling_cpp_implementation3 Fused Implementation
    /// This implementation is launched with the following shell code:
    /// ~~~sh
    /// ./program.exe cpu fused
    /// ~~~
    /// The program will call the implementation defined in the function
    /// `conv_relu_fused()`.
    /// @page performance_profiling_cpp
    ///
    /// First the memory descriptors are created with `format_tag=ANY` as in
    /// *blocked format implementation*.
    // copy the dimensions and data type from user's memory and set format tag
    // to any to allow convolution to pick the best implementation
    auto conv_src_md = memory::desc(user_src.get_desc().get_dims(),
            user_src.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_wei_md = memory::desc(user_wei.get_desc().get_dims(),
            user_wei.get_desc().get_data_type(), memory::format_tag::any);
    auto conv_dst_md = memory::desc(user_dst.get_desc().get_dims(),
            user_dst.get_desc().get_data_type(), memory::format_tag::any);

    /// Then in preparation for the convolution prim descriptor, a ReLU post-op
    /// is built and added to the primitive attribute `attr`:
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create post_op attr with relu

    /// Next the convolution prim descriptor is created, which inherits the ReLU
    /// post-op by way of the attributes `attr`:
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create prim_desc with attr
    // [Create prim_desc with attr]
    // create an attribute for fused relu
    auto attr = create_attr_with_relu_post_op();

    // create a convolution primitive descriptor
    auto conv_pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv_src_md, conv_wei_md, conv_dst_md, strides, padding, padding,
            attr);
    // [Create prim_desc with attr]
    /// Then conditional reorders are applied as in *blocked format
    /// implementation* to convert `user_` format NCHW to blocked. Finally, it
    /// creates the convolution primitive `conv` and adds it to the stream `s`
    /// with the reordered data (`conv_src`, `conv_wei`, `conv_dst`).
    // prepare convolution source
    memory conv_src = user_src;
    if (conv_pd.src_desc() != user_src.get_desc()) {
        conv_src = memory(conv_pd.src_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_src, conv_src);
        reorder(r_pd).execute(s, user_src, conv_src);
    }

    // prepare convolution weights
    memory conv_wei = user_wei;
    if (conv_pd.weights_desc() != user_wei.get_desc()) {
        conv_wei = memory(conv_pd.weights_desc(), eng);
        auto r_pd = reorder::primitive_desc(user_wei, conv_wei);
        reorder(r_pd).execute(s, user_wei, conv_wei);
    }

    // prepare convolution destination
    memory conv_dst = user_dst;
    if (conv_pd.dst_desc() != user_dst.get_desc())
        conv_dst = memory(conv_pd.dst_desc(), eng);
    /// @page performance_profiling_cpp
    /// @note There is no separate addition to the stream for the ReLU
    /// operation because it has been added as a post-op to the `conv` primitive.
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Create conv_primitive implementation3
    // [Create conv_primitive implementation3]
    // create convolution primitive
    auto conv = convolution_forward(conv_pd);
    // [Create conv_primitive implementation3]
    /// @page performance_profiling_cpp
    /// @snippet performance_profiling.cpp Add to stream implementation3
    // [Add to stream implementation3]
    // execute convolution by adding it to the stream s
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src}, {DNNL_ARG_WEIGHTS, conv_wei},
                    {DNNL_ARG_DST, conv_dst}});
    // [Add to stream implementation3]
    // reorder data to user's format if needed
    if (conv_pd.dst_desc() != user_dst.get_desc()) {
        auto r_pd = reorder::primitive_desc(conv_dst, user_dst);
        reorder(r_pd).execute(s, conv_dst, user_dst);
    }
    s.wait();
    /// @page performance_profiling_cpp
    /// This implementation complies with best practices for f32 inference by
    /// using the oneDNN recommended blocked format for convolution and
    /// adding ReLU as a post-op to execute a fused version of conv + ReLU.
    /// The benefit of following best practices can be seen in the execution
    /// time of the fused primitive of 18.0 milliseconds.
    ///
    /// *ONEDNN_VERBOSE output (see configuration notice\*):*
    /// ~~~sh
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:abcd:f0 dst_f32::blocked:Acdb16a:f0,,,96x3x11x11,0.0148926
    /// onednn_verbose,exec,cpu,convolution,jit:avx512_common,forward_inference,src_f32::blocked:abcd:f0 wei_f32::blocked:Acdb16a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd16b:f0,post_ops:'eltwise_relu;';,alg:convolution_direct,mb128_ic3oc96_ih227oh55kh11sh4dh0ph0_iw227ow55kw11sw4dw0pw0,17.968
    /// onednn_verbose,exec,cpu,reorder,jit:uni,undef,src_f32::blocked:aBcd16b:f0 dst_f32::blocked:abcd:f0,,,128x96x55x55,4.66797
    /// ~~~
}
462 | |
463 | /// @page performance_profiling_cpp |
464 | /// @section performance_profiling_cpp_roundup Performance summary |
465 | /// |
466 | /// | Implementation | Time, ms | Cumulative speedup | |
467 | /// | :-- | --: | --: | |
468 | /// | Naive | 41.2 | 1.0 | |
469 | /// | Blocked format | 21.0 | 2.0 | |
470 | /// | Fused | 18.0 | 2.3 | |
471 | /// |
472 | /// ** ** |
473 | /// @page performance_profiling_cpp |
474 | /// @section performance_profiling_cpp_config Configuration Notice |
/// @note This example is meant to demonstrate oneDNN best practices.
/// It is not meant for benchmarking purposes. The platform is not fully
/// optimized, so the primitive execution times are only relevant in
/// relation to the other times in this example.
479 | /// |
480 | /// Runtime Settings: |
481 | /// * OMP_NUM_THREADS=14 |
482 | /// * KMP_AFFINITY=granularity=fine,compact |
483 | /// |
484 | /// Platform: |
485 | /// * CPU: Intel(R) Xeon(R) Platinum 8180 CPU @ 2.50GHz |
486 | /// * Thread(s) per core: 1 |
487 | /// * Core(s) per socket: 28 |
488 | /// * Socket(s): 2 |
489 | /// * NUMA node(s): 2 |
490 | /// * RAM (DDR4): 192 GB |
491 | |
492 | void performance_profiling(engine::kind engine_kind, int argc, char **argv) { |
493 | // Initialize engine |
494 | engine eng(engine_kind, 0); |
495 | |
496 | // Initialize stream |
497 | stream s(eng); |
498 | // [Set dimensions] |
499 | // set dimensions for synthetic data and weights |
500 | const memory::dim BATCH = 128; |
501 | const memory::dim IC = 3, OC = 96; |
502 | const memory::dim IH = 227, KH = 11, OH = 55; |
503 | const memory::dim IW = 227, KW = 11, OW = 55; |
504 | // [Set dimensions] |
505 | |
506 | // [Create memory objects] |
507 | // create oneDNN memory objects for user's tensors (in nchw and oihw formats) |
508 | auto user_src = memory({{BATCH, IC, IH, IW}, memory::data_type::f32, |
509 | memory::format_tag::nchw}, |
510 | eng); |
511 | auto user_wei = memory({{OC, IC, KH, KW}, memory::data_type::f32, |
512 | memory::format_tag::oihw}, |
513 | eng); |
514 | auto user_dst = memory({{BATCH, OC, OH, OW}, memory::data_type::f32, |
515 | memory::format_tag::nchw}, |
516 | eng); |
517 | // [Create memory objects] |
518 | |
519 | // fill source, destination, and weights with synthetic data |
520 | init_data(user_src, 1); |
521 | init_data(user_dst, -1); |
522 | init_data(user_wei, .5); |
523 | |
524 | // set implementation ("naive"||"blocked"||"fused") setting implementation |
525 | // to "validation" will run all implementations |
526 | std::string implementation; |
527 | if (argc <= 2) |
528 | implementation = "validation" ; |
529 | else if (argc == 3) |
530 | implementation = argv[2]; |
531 | |
532 | if (!(implementation == "validation" || implementation == "naive" |
533 | || implementation == "blocked" || implementation == "fused" )) { |
534 | std::cout << "The implementation can be one of:\n" ; |
535 | std::cout << " - naive: NCHW format without fusion\n" ; |
536 | std::cout << " - blocked: format propagation without fusion\n" ; |
537 | std::cout << " - fused: format propagation with fusion\n" ; |
538 | std::cout << " - validation: runs all implementations\n\n" ; |
539 | std::cout << "Validation will run if no parameters are specified.\n\n" ; |
540 | |
541 | throw std::invalid_argument("Incorrect input arguments." ); |
542 | } |
543 | |
544 | if (implementation == "naive" || implementation == "validation" ) { |
545 | std::cout << "Implementation: naive.\n" ; |
546 | // run conv + relu w/o fusing |
547 | conv_relu_naive(user_src, user_wei, user_dst, eng, s); |
548 | std::cout << "Conv + ReLU w/ nchw format completed.\n" ; |
549 | } |
550 | |
551 | if (implementation == "blocked" || implementation == "validation" ) { |
552 | std::cout << "Implementation: blocked.\n" ; |
553 | // run conv + relu w/o fusing |
554 | conv_relu_blocked(user_src, user_wei, user_dst, eng, s); |
555 | std::cout << "Conv + ReLU w/ blocked format completed.\n" ; |
556 | } |
557 | |
558 | if (implementation == "fused" || implementation == "validation" ) { |
559 | std::cout << "Implementation: fused.\n" ; |
560 | // run conv + relu w/ fusing |
561 | conv_relu_fused(user_src, user_wei, user_dst, eng, s); |
562 | std::cout << "Conv + ReLU w/ fusing completed.\n" ; |
563 | } |
564 | } |
565 | |
566 | int main(int argc, char **argv) { |
567 | engine::kind engine_kind = parse_engine_kind(argc, argv, 1); |
568 | return handle_example_errors( |
569 | performance_profiling, engine_kind, argc, argv); |
570 | } |
571 | |