/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example memory_format_propagation.cpp
/// @copybrief memory_format_propagation_cpp
/// > Annotated version: @ref memory_format_propagation_cpp

#include <iostream>
#include <sstream>
#include <string>

/// @page memory_format_propagation_cpp Memory Format Propagation
/// This example demonstrates memory format propagation, which is critical
/// for the performance of deep learning applications.
///
/// > Example code: @ref memory_format_propagation.cpp
///
/// Memory format propagation is one of the central notions that needs to be
/// well understood to use oneDNN correctly.
///
/// Convolution and inner product primitives choose the memory format when
/// you create them with the placeholder memory format
/// #dnnl::memory::format_tag::any for input or output. The memory format is
/// chosen based on factors such as the hardware and the convolution
/// parameters. Using the placeholder memory format is the recommended
/// practice for convolutions, since they are the most compute-intensive
/// operations in most topologies where they are present.
///
/// On forward propagation, other primitives, such as Elementwise, LRN, and
/// batch normalization, should use the same memory format as the preceding
/// layer, thus propagating the memory format through multiple oneDNN
/// primitives. This avoids unnecessary reorders, which may be expensive and
/// should be avoided unless a compute-intensive primitive requires a
/// different format. For performance reasons, backward computations of such
/// primitives require a memory format consistent with that of the
/// corresponding forward computations. Hence, when initializing these
/// primitives for backward computations you should use the
/// #dnnl::memory::format_tag::any memory format tag as well.
///
/// Below is a short summary of when to use and when not to use the memory
/// format #dnnl::memory::format_tag::any during primitive descriptor
/// initialization:
///
/// | Primitive Kinds | Forward Propagation | Backward Propagation | No Propagation |
/// | :-- | :-- | :-- | :-- |
/// | Compute intensive: (De-)convolution, Inner product, RNN | Use #dnnl::memory::format_tag::any | Use #dnnl::memory::format_tag::any | N/A |
/// | Compute intensive (no propagation): Matrix Multiplication | N/A | N/A | Use #dnnl::memory::format_tag::any |
/// | Memory-bandwidth limited: Pooling, Layer and Batch Normalization, Local Response Normalization, Elementwise, Shuffle, Softmax | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | Use #dnnl::memory::format_tag::any for gradient tensors, and actual memory formats for data tensors | N/A |
/// | Memory-bandwidth limited: Reorder, Concat, Sum, Binary | N/A | N/A | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs |
///
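/// As an illustration of these rules on forward propagation, here is a
/// minimal sketch (not part of this example's executable code) in which a
/// compute-intensive convolution gets placeholder memory descriptors, while a
/// hypothetical memory-bandwidth limited ReLU that follows it simply reuses
/// whatever format the convolution chose for its destination. The eltwise
/// constructor signature below is assumed to be the oneDNN v3.x one; the
/// dimension variables and the engine `eng` are as defined later in this
/// example.
///
/// ~~~cpp
/// // Compute-intensive primitive: let the library pick the formats.
/// auto c_src_md = memory::desc({N, IC, H, W}, memory::data_type::f32,
///         memory::format_tag::any);
/// auto c_wei_md = memory::desc({OC, IC, KH, KW}, memory::data_type::f32,
///         memory::format_tag::any);
/// auto c_dst_md = memory::desc({N, OC, H, W}, memory::data_type::f32,
///         memory::format_tag::any);
/// auto c_pd = convolution_forward::primitive_desc(eng,
///         prop_kind::forward_inference, algorithm::convolution_auto,
///         c_src_md, c_wei_md, c_dst_md, {1, 1}, {1, 1}, {1, 1});
///
/// // Memory-bandwidth limited primitive: propagate the format the
/// // convolution chose instead of forcing a plain one.
/// auto relu_pd = eltwise_forward::primitive_desc(eng,
///         prop_kind::forward_inference, algorithm::eltwise_relu,
///         c_pd.dst_desc(), c_pd.dst_desc(), /*alpha=*/0.f);
/// ~~~
///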
/// Additional format synchronization is required between forward and backward
/// computations when running training workloads. This topic is covered in
/// [Training-Specific Aspects](@ref dev_guide_inference_and_training_aspects_training).
///
/// For a better understanding of the architecture and design of oneDNN, as
/// well as of the concepts used in the library, please refer to @ref
/// dev_guide_understanding_memory_formats.
///
/// @section memory_format_propagation_intro Introduction to the tutorial
///
/// This C++ API example demonstrates how to use optimized memory formats
/// supported by oneDNN:
/// - How to configure primitives to use optimized memory formats.
/// - How to determine whether data needs to be reordered from/to optimized
///   memory formats.
///
/// This tutorial assumes that the reader has already reviewed the
/// @ref getting_started_cpp tutorial.
///
/// The example is built around a CNN block consisting of a convolution
/// followed by a pooling and comprises the following steps:
/// 1. Create a pooling primitive descriptor based on the memory format chosen
///    by the convolution primitive.
/// 2. Create memory descriptors for input and output data in the NCHW memory
///    format.
/// 3. Determine if input and output data need to be reordered from/to the
///    optimized memory format.
/// 4. Create memory objects, create the necessary primitives, and execute
///    them.
///
/// These steps are implemented in the @ref memory_format_propagation_tutorial
/// function, which in turn is called from `main()`, which is also responsible
/// for error handling.

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

/// @page memory_format_propagation_cpp
/// @section memory_format_propagation_tutorial memory_format_propagation() function
///
void memory_format_propagation_tutorial(engine::kind engine_kind) {
    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub1 Initialization
    ///
    /// We start by creating an engine and a stream that we will use when
    /// creating primitive descriptors and executing primitives.
    ///
    /// @snippet memory_format_propagation.cpp Initialize engine and stream
    // [Initialize engine and stream]
    engine eng(engine_kind, 0);
    stream s(eng);
    // [Initialize engine and stream]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub2 Create convolution and pooling primitives
    ///
    /// To specify that a primitive should pick an optimized format for the
    /// specified computation parameters, we create memory descriptors with
    /// the memory format set to @ref dnnl::memory::format_tag::any.
    ///
    /// This approach works only for a limited set of primitives: convolutions
    /// and inner products. Additionally, @ref dnnl::memory::format_tag::any
    /// can be specified for destination memory descriptors, which implies
    /// that the destination will have the same memory format as the source.
    ///
    /// @snippet memory_format_propagation.cpp Create placeholder memory descriptors
    // [Create placeholder memory descriptors]
    // Tensor and kernel dimensions. We use the same 3x3 kernel with padding=1
    // for both convolution and pooling primitives, which means that the
    // activation tensor shapes do not change.
    const int N = 1, H = 14, W = 14, IC = 128, OC = 256, KH = 3, KW = 3;
    auto conv_src_md = memory::desc({N, IC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_weights_md = memory::desc(
            {OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_dst_md = memory::desc({N, OC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    const auto &pool_dst_md = conv_dst_md; // shape does not change
    // [Create placeholder memory descriptors]

    /// @page memory_format_propagation_cpp
    ///
    /// Next, we pass the memory descriptors to the primitive descriptor
    /// constructors.
    ///
    /// @snippet memory_format_propagation.cpp Create convolution and pooling primitive descriptors
    // [Create convolution and pooling primitive descriptors]
    auto conv_pd = convolution_forward::primitive_desc(
            eng, prop_kind::forward_inference, algorithm::convolution_auto,
            conv_src_md, conv_weights_md,
            conv_dst_md, // shape information
            {1, 1}, // strides
            {1, 1}, {1, 1} // left and right padding
    );

    auto pool_pd = pooling_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::pooling_max,
            conv_pd.dst_desc(), // use the format chosen by the convolution
            pool_dst_md, // shape information
            {1, 1}, {KH, KW}, // strides and kernel
            {0, 0}, // dilation
            {1, 1}, {1, 1} // left and right padding
    );
    // [Create convolution and pooling primitive descriptors]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub3 Create source and destination memory objects
    ///
    /// We assume that the 'user' source and destination memory format is
    /// NCHW. Since there is no result validation in this tutorial, we do not
    /// bother filling the data with meaningful values and let oneDNN
    /// allocate the memory.
    ///
    /// @snippet memory_format_propagation.cpp Create source and destination memory objects
    // [Create source and destination memory objects]
    auto src_mem = memory(
            {{N, IC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    auto weights_mem = memory(
            {{OC, IC, KH, KW}, memory::data_type::f32, memory::format_tag::oihw},
            eng);
    auto dst_mem = memory(
            {{N, OC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    // [Create source and destination memory objects]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub4 Determine if source and destination need to be reordered
    ///
    /// The idiomatic way to check if a reorder is necessary between the
    /// memory format expected by a primitive (the convolution in our case)
    /// and the available memory format is to compare the corresponding
    /// memory descriptors.
    ///
    /// @snippet memory_format_propagation.cpp Determine if source needs to be reordered
    // [Determine if source needs to be reordered]
    bool need_reorder_src = conv_pd.src_desc() != src_mem.get_desc();
    // [Determine if source needs to be reordered]

    /// @page memory_format_propagation_cpp
    ///
    /// @warning It is by design that it is not possible to just compare
    /// memory format tags. The reason is that memory format tags only provide
    /// a partial description of how data is laid out in memory and do not,
    /// for example, describe memory objects obtained via the sub-memory
    /// constructor.
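    ///
    /// For instance, the following sketch (illustrative only, not part of
    /// this example) creates a descriptor via
    /// @ref dnnl::memory::desc::submemory_desc that describes plain
    /// NCHW-ordered data and yet compares unequal to a freshly created NCHW
    /// descriptor of the same shape, because it keeps the parent buffer's
    /// strides:
    ///
    /// ~~~cpp
    /// // A 1x128x14x14 view into a larger 1x128x28x28 nchw buffer.
    /// auto parent_md = memory::desc({1, 128, 28, 28},
    ///         memory::data_type::f32, memory::format_tag::nchw);
    /// auto view_md = parent_md.submemory_desc({1, 128, 14, 14}, {0, 0, 0, 0});
    ///
    /// // A standalone nchw descriptor of the same shape.
    /// auto plain_md = memory::desc({1, 128, 14, 14},
    ///         memory::data_type::f32, memory::format_tag::nchw);
    ///
    /// // Both look like "nchw", but the view keeps the parent's strides, so
    /// // the full memory descriptors compare unequal and a reorder would be
    /// // detected.
    /// bool same_layout = (view_md == plain_md); // false
    /// ~~~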
    ///
    /// We repeat the process for the weights and destination memory
    /// descriptors as well.
    ///
    /// @snippet memory_format_propagation.cpp Determine if weights and destination need to be reordered
    // [Determine if weights and destination need to be reordered]
    bool need_reorder_weights
            = conv_pd.weights_desc() != weights_mem.get_desc();
    bool need_reorder_dst = conv_pd.dst_desc() != dst_mem.get_desc();
    // [Determine if weights and destination need to be reordered]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub45 Allocate intermediate buffers if necessary
    ///
    /// Based on the flags computed above, we can now decide whether we need
    /// extra intermediate buffers to hold the source and weights data for the
    /// convolution and the output of the pooling.
    ///
    /// Memory objects for the intermediate buffers are created based on the
    /// memory descriptors obtained from the primitive descriptors to ensure
    /// consistency.
    ///
    /// @snippet memory_format_propagation.cpp Allocate intermediate buffers if necessary
    // [Allocate intermediate buffers if necessary]
    auto conv_src_mem
            = need_reorder_src ? memory(conv_pd.src_desc(), eng) : src_mem;
    auto conv_weights_mem = need_reorder_weights
            ? memory(conv_pd.weights_desc(), eng)
            : weights_mem;
    auto conv_dst_mem = memory(conv_pd.dst_desc(), eng);
    auto pool_dst_mem
            = need_reorder_dst ? memory(pool_pd.dst_desc(), eng) : dst_mem;
    // [Allocate intermediate buffers if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub5 Perform reorders for source data if necessary
    ///
    /// Now we get to the part where we actually start executing things. We
    /// check if reorders are necessary based on the flags computed before and
    /// create and execute them immediately.
    ///
    /// @note We call @ref dnnl::stream::wait() before the reorder primitives
    /// go out of scope and are destroyed to accommodate potentially
    /// asynchronous execution.
    ///
    /// @snippet memory_format_propagation.cpp Perform reorders for source data if necessary
    // [Perform reorders for source data if necessary]
    if (need_reorder_src) {
        auto reorder_src = reorder(src_mem, conv_src_mem);
        reorder_src.execute(
                s, {{DNNL_ARG_FROM, src_mem}, {DNNL_ARG_TO, conv_src_mem}});
        s.wait(); // wait for the reorder to complete
    }

    if (need_reorder_weights) {
        auto reorder_weights = reorder(weights_mem, conv_weights_mem);
        reorder_weights.execute(s,
                {{DNNL_ARG_FROM, weights_mem},
                        {DNNL_ARG_TO, conv_weights_mem}});
        s.wait(); // wait for the reorder to complete
    }
    // [Perform reorders for source data if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub6 Create and execute convolution and pooling primitives
    ///
    /// After the reorders, we are now ready to compute the convolution and
    /// the pooling.
    ///
    /// @snippet memory_format_propagation.cpp Create and execute convolution and pooling primitives
    // [Create and execute convolution and pooling primitives]
    auto conv_scratchpad_mem = memory(conv_pd.scratchpad_desc(), eng);
    auto conv = convolution_forward(conv_pd);
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src_mem},
                    {DNNL_ARG_WEIGHTS, conv_weights_mem},
                    {DNNL_ARG_DST, conv_dst_mem}});
    auto pool_scratchpad_mem = memory(pool_pd.scratchpad_desc(), eng);
    auto pool = pooling_forward(pool_pd);
    pool.execute(
            s, {{DNNL_ARG_SRC, conv_dst_mem}, {DNNL_ARG_DST, pool_dst_mem}});
    s.wait();
    // [Create and execute convolution and pooling primitives]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub7 Reorder destination data if necessary
    ///
    /// The only potentially remaining operation is a reorder from the pooling
    /// destination memory object to the user's one. Similarly to the reorders
    /// for the source and weights memory objects, it is performed depending
    /// on the value of the previously computed flag.
    ///
    /// @snippet memory_format_propagation.cpp Reorder destination data if necessary
    // [Reorder destination data if necessary]
    if (need_reorder_dst) {
        auto reorder_dst = reorder(pool_dst_mem, dst_mem);
        reorder_dst.execute(
                s, {{DNNL_ARG_FROM, pool_dst_mem}, {DNNL_ARG_TO, dst_mem}});
        s.wait();
    }
    // [Reorder destination data if necessary]
}

int main(int argc, char **argv) {
    return handle_example_errors(
            memory_format_propagation_tutorial, parse_engine_kind(argc, argv));
}

/// @page memory_format_propagation_cpp
/// @subsection memory_format_propagation_results Results
///
/// Upon compiling and running the example, the output should be just:
///
/// ~~~sh
/// Example passed.
/// ~~~
///
/// It may be interesting to check what really happens during the run. We can
/// use the `ONEDNN_VERBOSE` environment variable for that (see also @ref
/// dev_guide_verbose). Here's an example output:
///
/// ~~~sh
/// $ ONEDNN_VERBOSE=1 ./memory-format-propagation-cpp
/// onednn_verbose,info,oneDNN <ver> (Git Hash <hash>)
/// onednn_verbose,info,cpu,runtime:OpenMP
/// onednn_verbose,info,cpu,isa:Intel AVX2
/// onednn_verbose,info,gpu,runtime:none
/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,
///     src_f32::blocked:abcd:f0 dst_f32::blocked:aBcd8b:f0,,,1x128x14x14,0.326904
/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,
///     src_f32::blocked:abcd:f0 dst_f32::blocked:ABcd8b8a:f0,,,256x128x3x3,0.244141
/// onednn_verbose,exec,cpu,convolution,jit:avx2,forward_inference,
///     src_f32::blocked:aBcd8b:f0 wei_f32::blocked:ABcd8b8a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd8b:f0,,
///     alg:convolution_direct,mb1_ic128oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1,1.20312
/// onednn_verbose,exec,cpu,pooling,jit:avx,forward_inference,
///     src_f32::blocked:aBcd8b:f0 dst_f32::blocked:aBcd8b:f0 ws_undef::undef::f0,,
///     alg:pooling_max,mb1ic256_ih14oh14kh3sh1ph1_iw14ow14kw3sw1pw1,0.187012
/// onednn_verbose,exec,cpu,reorder,jit:uni,undef,
///     src_f32::blocked:aBcd8b:f0 dst_f32::blocked:abcd:f0,,,1x256x14x14,0.0419922
/// Example passed on CPU.
/// ~~~
///
/// From this output we can deduce that:
/// * The convolution primitive picked the optimized
///   @ref dnnl::memory::format_tag::aBcd8b memory format for activations. In
///   this format the channels dimension (denoted by the letter B since it is
///   the second dimension; see also @ref dev_guide_conventions) is blocked by
///   a factor of 8. Because this memory format differs from the NCHW format
///   the tutorial uses, the source and destination had to be reordered to and
///   from this optimized memory layout.
/// * The convolution primitive picked the optimized
///   @ref dnnl::memory::format_tag::ABcd8b8a memory format for weights
///   (output (A) and input (B) channel dimensions blocked by 8), to which we
///   also had to reorder the initial weights, since they are in the OIHW
///   memory format.