1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example memory_format_propagation.cpp |
18 | /// @copybrief memory_format_propagation_cpp |
19 | /// > Annotated version: @ref memory_format_propagation_cpp |
20 | |
21 | #include <iostream> |
22 | #include <sstream> |
23 | #include <string> |
24 | |
25 | /// @page memory_format_propagation_cpp Memory Format Propagation |
26 | /// This example demonstrates memory format propagation, which is critical for |
27 | /// deep learning applications performance. |
28 | /// |
29 | /// > Example code: @ref memory_format_propagation.cpp |
30 | /// |
31 | /// Memory format propagation is one of the central notions that needs to be |
32 | /// well-understood to use oneDNN correctly. |
33 | /// |
34 | /// Convolution and inner product primitives choose the memory format when you |
35 | /// create them with the placeholder memory format |
36 | /// #dnnl::memory::format_tag::any for input or output. The memory format |
37 | /// chosen is based on different circumstances such as hardware and |
38 | /// convolutional parameters. Using the placeholder memory format is the |
39 | /// recommended practice for convolutions, since they are the most |
40 | /// compute-intensive operations in most topologies where they are present. |
41 | /// |
/// Other primitives, such as Elementwise, LRN, batch normalization and others,
/// on forward propagation should use the same memory format as the preceding
/// layer thus propagating the memory format through multiple oneDNN primitives.
/// This avoids unnecessary reorders which may be expensive and should be
/// avoided unless a compute-intensive primitive requires a different format.
/// For performance reasons, backward computations of such primitives require
/// a memory format consistent with the corresponding forward computations.
/// Hence, when initializing these primitives for backward computations you
/// should use the #dnnl::memory::format_tag::any memory format tag as well.
51 | /// |
52 | /// Below is the short summary when to use and not to use memory format |
53 | /// #dnnl::memory::format_tag::any during operation description initialization: |
54 | /// |
55 | /// | Primitive Kinds | Forward Propagation | Backward Propagation | No Propagation | |
56 | /// | :-- | :-- | :-- | :-- | |
57 | /// | Compute intensive: (De-)convolution, Inner product, RNN | Use #dnnl::memory::format_tag::any | Use #dnnl::memory::format_tag::any | N/A | |
58 | /// | Compute intensive (no propagation): Matrix Multiplication | N/A | N/A | Use #dnnl::memory::format_tag::any | |
59 | /// | Memory-bandwidth limited: Pooling, Layer and Batch Normalization, Local Response Normalization, Elementwise, Shuffle, Softmax | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | Use #dnnl::memory::format_tag::any for gradient tensors, and actual memory formats for data tensors | N/A | |
60 | /// | Memory-bandwidth limited: Reorder, Concat, Sum, Binary | N/A | N/A | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | |
61 | /// |
62 | /// Additional format synchronization is required between forward and backward |
63 | /// computations when running training workloads. This topic is covered in |
64 | /// [Training-Specific Aspects](@ref dev_guide_inference_and_training_aspects_training). |
65 | /// |
66 | /// For better understanding of the architecture and design of oneDNN |
67 | /// as well as the concepts used in the library, please refer to @ref |
68 | /// dev_guide_understanding_memory_formats. |
69 | /// |
70 | /// @section memory_format_propagation_intro Introduction to the tutorial |
71 | /// |
72 | /// This C++ API example demonstrates how to use optimized memory formats |
73 | /// supported by oneDNN: |
74 | /// - How to configure primitives to use optimized memory formats. |
75 | /// - How to determine whether data needs to be reordered from/to optimized |
76 | /// memory formats. |
77 | /// |
78 | /// This tutorial assumes that the reader has already reviewed the |
79 | /// @ref getting_started_cpp tutorial. |
80 | /// |
81 | /// The example is built around a CNN consisting of a convolution followed by |
82 | /// a pooling and consists of the following steps: |
83 | /// 1. Create a pooling primitive descriptor based on the memory format chosen |
84 | /// by the convolution primitive. |
85 | /// 2. Create memory descriptors for input and output data in the NCHW memory |
86 | /// format. |
87 | /// 3. Determine if input and output data needs to be reordered from/to the |
88 | /// optimized memory format. |
89 | /// 4. Create memory objects; and necessary primitives and execute them. |
90 | /// |
91 | /// These steps are implemented in the @ref memory_format_propagation_tutorial |
92 | /// which in turn is called from `main()` which is also responsible for error |
93 | /// handling. |
94 | |
95 | #include "oneapi/dnnl/dnnl.hpp" |
96 | |
97 | #include "example_utils.hpp" |
98 | |
99 | using namespace dnnl; |
100 | |
101 | /// @page memory_format_propagation_cpp |
102 | /// @section memory_format_propagation_tutorial memory_format_propagation() function |
103 | /// |
void memory_format_propagation_tutorial(engine::kind engine_kind) {
    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub1 Initialization
    ///
    /// We start by creating an engine and a stream that we will use when
    /// creating primitive descriptors and executing primitives.
    ///
    /// @snippet memory_format_propagation.cpp Initialize engine and stream
    // [Initialize engine and stream]
    engine eng(engine_kind, 0); // 0 selects the first device of this kind
    stream s(eng);
    // [Initialize engine and stream]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub2 Create convolution and pooling primitives
    ///
    /// To specify that a primitive should pick an optimized format for the
    /// specified computation parameters, we create memory descriptors with
    /// memory format set to @ref dnnl::memory::format_tag::any.
    ///
    /// This approach works only for a limited set of primitives: convolutions
    /// and inner products. Additionally, @ref dnnl::memory::format_tag::any
    /// can be specified for destination memory descriptors which implies that
    /// destination will have the same memory format as the source.
    ///
    /// @snippet memory_format_propagation.cpp Create placeholder memory descriptors
    // [Create placeholder memory descriptors]
    // Tensor and kernel dimensions. We use the same 3x3 kernel with padding=1
    // for both convolution and pooling primitives, which means that the
    // activation tensor shapes do not change.
    const int N = 1, H = 14, W = 14, IC = 128, OC = 256, KH = 3, KW = 3;
    auto conv_src_md = memory::desc({N, IC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_weights_md = memory::desc(
            {OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_dst_md = memory::desc({N, OC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    const auto &pool_dst_md = conv_dst_md; // shape does not change
    // [Create placeholder memory descriptors]

    /// @page memory_format_propagation_cpp
    ///
    /// Next, we pass the memory descriptors to primitive descriptors
    /// constructors.
    ///
    /// @snippet memory_format_propagation.cpp Create convolution and pooling primitive descriptors
    // [Create convolution and pooling primitive descriptors]
    auto conv_pd = convolution_forward::primitive_desc(
            eng, prop_kind::forward_inference, algorithm::convolution_auto,
            conv_src_md, conv_weights_md,
            conv_dst_md, // shape information
            {1, 1}, // strides
            {1, 1}, {1, 1} // left and right padding
    );

    // The pooling source descriptor is conv_pd.dst_desc(), so the memory
    // format chosen by the convolution propagates into the pooling primitive.
    auto pool_pd
            = pooling_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::pooling_max, conv_pd.dst_desc(),
                    pool_dst_md, // shape information
                    {1, 1}, {KH, KW}, // strides and kernel
                    {0, 0}, // dilation (none)
                    {1, 1}, {1, 1} // left and right padding
            );
    // [Create convolution and pooling primitive descriptors]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub3 Create source and destination memory objects
    ///
    /// We assume that the 'user' source and destination memory format is
    /// NCHW. Since there is no result validation in this tutorial, we do not
    /// bother with filling the data with some values and let oneDNN
    /// allocate the memory.
    ///
    /// @snippet memory_format_propagation.cpp Create source and destination memory objects
    // [Create source and destination memory objects]
    auto src_mem = memory(
            {{N, IC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    auto weights_mem = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                      memory::format_tag::oihw},
            eng);
    auto dst_mem = memory(
            {{N, OC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    // [Create source and destination memory objects]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub4 Determine if source and destination need to be reordered
    ///
    /// The idiomatic way to check if a reorder is necessary between the memory
    /// format expected by a primitive (the convolution in our case) and the
    /// available memory format is to compare the corresponding memory
    /// descriptors.
    ///
    /// @snippet memory_format_propagation.cpp Determine if source needs to be reordered
    // [Determine if source needs to be reordered]
    bool need_reorder_src = conv_pd.src_desc() != src_mem.get_desc();
    // [Determine if source needs to be reordered]

    /// @page memory_format_propagation_cpp
    ///
    /// @warning It is by design that it is not possible to just compare
    /// memory tags. The reason behind this is that memory format tags only
    /// provide a partial description of how data is laid out in memory and do
    /// not, for example, describe memory objects obtained via sub-memory
    /// constructor.
    ///
    /// We repeat the process for the weights and destination memory format
    /// descriptors as well.
    ///
    /// @snippet memory_format_propagation.cpp Determine if weights and destination need to be reordered
    // [Determine if weights and destination need to be reordered]
    bool need_reorder_weights
            = conv_pd.weights_desc() != weights_mem.get_desc();
    bool need_reorder_dst = conv_pd.dst_desc() != dst_mem.get_desc();
    // [Determine if weights and destination need to be reordered]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub45 Allocate intermediate buffers if necessary
    ///
    /// Based on the flags computed before, we can now decide if we need extra
    /// intermediate buffers to hold the source and weights data for the
    /// convolution and the output of the pooling.
    ///
    /// Memory objects for the intermediate buffers are created based on the
    /// memory descriptors obtained from the primitive descriptors to ensure
    /// consistency.
    ///
    /// @snippet memory_format_propagation.cpp Allocate intermediate buffers if necessary
    // [Allocate intermediate buffers if necessary]
    // If no reorder is needed, the user's memory object is used directly;
    // otherwise a buffer with the primitive's preferred format is allocated.
    auto conv_src_mem
            = need_reorder_src ? memory(conv_pd.src_desc(), eng) : src_mem;
    auto conv_weights_mem = need_reorder_weights
            ? memory(conv_pd.weights_desc(), eng)
            : weights_mem;
    auto conv_dst_mem = memory(conv_pd.dst_desc(), eng);
    auto pool_dst_mem
            = need_reorder_dst ? memory(pool_pd.dst_desc(), eng) : dst_mem;
    // [Allocate intermediate buffers if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub5 Perform reorders for source data if necessary
    ///
    /// Now we get to the part where we actually start executing things. We
    /// check if reorders are necessary based on the flags computed before and
    /// create and execute them immediately.
    ///
    /// @note We call @ref dnnl::stream::wait() before reorder primitives
    /// get out of scope and destroyed to accommodate for potentially
    /// asynchronous execution.
    ///
    /// @snippet memory_format_propagation.cpp Perform reorders for source data if necessary
    // [Perform reorders for source data if necessary]
    if (need_reorder_src) {
        auto reorder_src = reorder(src_mem, conv_src_mem);
        reorder_src.execute(
                s, {{DNNL_ARG_FROM, src_mem}, {DNNL_ARG_TO, conv_src_mem}});
        s.wait(); // wait for the reorder to complete
    }

    if (need_reorder_weights) {
        auto reorder_weights = reorder(weights_mem, conv_weights_mem);
        reorder_weights.execute(s,
                {{DNNL_ARG_FROM, weights_mem},
                        {DNNL_ARG_TO, conv_weights_mem}});
        s.wait(); // wait for the reorder to complete
    }
    // [Perform reorders for source data if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub6 Create and execute convolution and pooling primitives
    ///
    /// After the reorders, we are now ready to compute convolution and
    /// pooling.
    ///
    /// @snippet memory_format_propagation.cpp Create and execute convolution and pooling primitives
    // NOTE(review): the scratchpad memory objects below are created but never
    // passed to execute(); with the default (library-managed) scratchpad
    // attribute they look unnecessary -- confirm whether they can be removed.
    // [Create and execute convolution and pooling primitives]
    auto conv_scratchpad_mem = memory(conv_pd.scratchpad_desc(), eng);
    auto conv = convolution_forward(conv_pd);
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src_mem}, {DNNL_ARG_WEIGHTS, conv_weights_mem},
                    {DNNL_ARG_DST, conv_dst_mem}});
    auto pool_scratchpad_mem = memory(pool_pd.scratchpad_desc(), eng);
    auto pool = pooling_forward(pool_pd);
    // The convolution destination feeds the pooling directly: pool_pd was
    // created with conv_pd.dst_desc() as its source, so no reorder is needed.
    pool.execute(
            s, {{DNNL_ARG_SRC, conv_dst_mem}, {DNNL_ARG_DST, pool_dst_mem}});
    s.wait();
    // [Create and execute convolution and pooling primitives]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub7 Reorder destination data if necessary
    ///
    /// The only potentially remaining operation is a reorder from the pooling
    /// destination memory object to the user's one. Similarly to the
    /// reorders for the source and weights memory objects, it is performed
    /// depending on the value of the previously computed flag.
    ///
    /// @snippet memory_format_propagation.cpp Reorder destination data if necessary
    // [Reorder destination data if necessary]
    if (need_reorder_dst) {
        auto reorder_dst = reorder(pool_dst_mem, dst_mem);
        reorder_dst.execute(
                s, {{DNNL_ARG_FROM, pool_dst_mem}, {DNNL_ARG_TO, dst_mem}});
        s.wait();
    }
    // [Reorder destination data if necessary]
}
315 | |
316 | int main(int argc, char **argv) { |
317 | return handle_example_errors( |
318 | memory_format_propagation_tutorial, parse_engine_kind(argc, argv)); |
319 | } |
320 | |
321 | /// @page memory_format_propagation_cpp |
322 | /// @subsection memory_format_propagation_results Results |
323 | /// |
/// Upon compiling and running the example, the output should be just:
325 | /// |
326 | /// ~~~sh |
327 | /// Example passed. |
328 | /// ~~~ |
329 | /// |
330 | /// It may be interesting to check what really happens during the run. We can |
331 | /// use `ONEDNN_VERBOSE` environment variable for that (see also @ref |
332 | /// dev_guide_verbose). Here's an example output: |
333 | /// |
334 | /// ~~~sh |
335 | /// $ ONEDNN_VERBOSE=1 ./memory-format-propagation-cpp |
336 | /// onednn_verbose,info,oneDNN <ver> (Git Hash <hash>) |
337 | /// onednn_verbose,info,cpu,runtime:OpenMP |
338 | /// onednn_verbose,info,cpu,isa:Intel AVX2 |
339 | /// onednn_verbose,info,gpu,runtime:none |
340 | /// onednn_verbose,exec,cpu,reorder,jit:uni,undef, |
341 | /// src_f32::blocked:abcd:f0 dst_f32::blocked:aBcd8b:f0,,,1x128x14x14,0.326904 |
342 | /// onednn_verbose,exec,cpu,reorder,jit:uni,undef, |
343 | /// src_f32::blocked:abcd:f0 dst_f32::blocked:ABcd8b8a:f0,,,256x128x3x3,0.244141 |
344 | /// onednn_verbose,exec,cpu,convolution,jit:avx2,forward_inference, |
345 | /// src_f32::blocked:aBcd8b:f0 wei_f32::blocked:ABcd8b8a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd8b:f0,, |
346 | /// alg:convolution_direct,mb1_ic128oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1,1.20312 |
347 | /// onednn_verbose,exec,cpu,pooling,jit:avx,forward_inference, |
348 | /// src_f32::blocked:aBcd8b:f0 dst_f32::blocked:aBcd8b:f0 ws_undef::undef::f0,, |
349 | /// alg:pooling_max,mb1ic256_ih14oh14kh3sh1ph1_iw14ow14kw3sw1pw1,0.187012 |
350 | /// onednn_verbose,exec,cpu,reorder,jit:uni,undef, |
351 | /// src_f32::blocked:aBcd8b:f0 dst_f32::blocked:abcd:f0,,,1x256x14x14,0.0419922 |
352 | /// Example passed on CPU. |
353 | /// ~~~ |
354 | /// |
355 | /// From this output we can deduce that: |
356 | /// * The convolution primitive picked up @ref |
357 | /// dnnl::memory::format_tag::aBcd8b optimized memory format for |
358 | /// activations. In this format the channels dimension (denoted by letter B |
359 | /// since it is the second dimension; see also @ref dev_guide_conventions) |
///   is blocked by a factor of 8. Because this memory format is different
///   from the NCHW format the tutorial uses, the source and destination had
362 | /// to be reordered to and from this optimized memory layout. |
363 | /// * The convolution primitive picked up @ref |
364 | /// dnnl::memory::format_tag::ABcd8b8a optimized memory format (output (A) |
365 | /// and input (B) channel dimensions blocked by 8) which we also had to |
366 | /// reorder the initial weights to since they are in the OIHW memory format. |
367 | |