1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example memory_format_propagation.cpp |
18 | /// @copybrief memory_format_propagation_cpp |
19 | /// > Annotated version: @ref memory_format_propagation_cpp |
20 | |
21 | #include <iostream> |
22 | #include <sstream> |
23 | #include <string> |
24 | |
25 | /// @page memory_format_propagation_cpp Memory Format Propagation |
26 | /// This example demonstrates memory format propagation, which is critical for |
27 | /// deep learning applications performance. |
28 | /// |
29 | /// > Example code: @ref memory_format_propagation.cpp |
30 | /// |
31 | /// Memory format propagation is one of the central notions that needs to be |
32 | /// well-understood to use oneDNN correctly. |
33 | /// |
34 | /// Convolution and inner product primitives choose the memory format when you |
35 | /// create them with the placeholder memory format |
36 | /// #dnnl::memory::format_tag::any for input or output. The memory format |
37 | /// chosen is based on different circumstances such as hardware and |
38 | /// convolutional parameters. Using the placeholder memory format is the |
39 | /// recommended practice for convolutions, since they are the most |
40 | /// compute-intensive operations in most topologies where they are present. |
41 | /// |
/// Other primitives, such as Elementwise, LRN, batch normalization and others,
/// on forward propagation should use the same memory format as the preceding
/// layer thus propagating the memory format through multiple oneDNN primitives.
/// This avoids unnecessary reorders which may be expensive and should be
/// avoided unless a compute-intensive primitive requires a different format.
/// For performance reasons, backward computations of such primitives require
/// a memory format consistent with the corresponding forward computations.
/// Hence, when initializing these primitives for backward computations you
/// should use the #dnnl::memory::format_tag::any memory format tag as well.
51 | /// |
52 | /// Below is the short summary when to use and not to use memory format |
53 | /// #dnnl::memory::format_tag::any during operation description initialization: |
54 | /// |
55 | /// | Primitive Kinds | Forward Propagation | Backward Propagation | No Propagation | |
56 | /// | :-- | :-- | :-- | :-- | |
57 | /// | Compute intensive: (De-)convolution, Inner product, RNN | Use #dnnl::memory::format_tag::any | Use #dnnl::memory::format_tag::any | N/A | |
58 | /// | Compute intensive (no propagation): Matrix Multiplication | N/A | N/A | Use #dnnl::memory::format_tag::any | |
59 | /// | Memory-bandwidth limited: Pooling, Layer and Batch Normalization, Local Response Normalization, Elementwise, Shuffle, Softmax | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | Use #dnnl::memory::format_tag::any for gradient tensors, and actual memory formats for data tensors | N/A | |
60 | /// | Memory-bandwidth limited: Reorder, Concat, Sum, Binary | N/A | N/A | Use memory format from preceding layer for inputs, and #dnnl::memory::format_tag::any for outputs | |
61 | /// |
62 | /// Additional format synchronization is required between forward and backward |
63 | /// computations when running training workloads. This topic is covered in |
64 | /// [Training-Specific Aspects](@ref dev_guide_inference_and_training_aspects_training). |
65 | /// |
66 | /// For better understanding of the architecture and design of oneDNN |
67 | /// as well as the concepts used in the library, please refer to @ref |
68 | /// dev_guide_understanding_memory_formats. |
69 | /// |
70 | /// @section memory_format_propagation_intro Introduction to the tutorial |
71 | /// |
72 | /// This C++ API example demonstrates how to use optimized memory formats |
73 | /// supported by oneDNN: |
74 | /// - How to configure primitives to use optimized memory formats. |
75 | /// - How to determine whether data needs to be reordered from/to optimized |
76 | /// memory formats. |
77 | /// |
78 | /// This tutorial assumes that the reader has already reviewed the |
79 | /// @ref getting_started_cpp tutorial. |
80 | /// |
81 | /// The example is built around a CNN consisting of a convolution followed by |
82 | /// a pooling and consists of the following steps: |
83 | /// 1. Create a pooling primitive descriptor based on the memory format chosen |
84 | /// by the convolution primitive. |
85 | /// 2. Create memory descriptors for input and output data in the NCHW memory |
86 | /// format. |
87 | /// 3. Determine if input and output data needs to be reordered from/to the |
88 | /// optimized memory format. |
89 | /// 4. Create memory objects; and necessary primitives and execute them. |
90 | /// |
91 | /// These steps are implemented in the @ref memory_format_propagation_tutorial |
92 | /// which in turn is called from `main()` which is also responsible for error |
93 | /// handling. |
94 | |
95 | #include "oneapi/dnnl/dnnl.hpp" |
96 | |
97 | #include "example_utils.hpp" |
98 | |
99 | using namespace dnnl; |
100 | |
101 | /// @page memory_format_propagation_cpp |
102 | /// @section memory_format_propagation_tutorial memory_format_propagation() function |
103 | /// |
void memory_format_propagation_tutorial(engine::kind engine_kind) {
    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub1 Initialization
    ///
    /// We start by creating an engine and a stream that we will use when
    /// creating primitive descriptors and executing primitives.
    ///
    /// @snippet memory_format_propagation.cpp Initialize engine and stream
    // [Initialize engine and stream]
    engine eng(engine_kind, 0); // 0 selects the first device of this kind
    stream s(eng);
    // [Initialize engine and stream]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub2 Create convolution and pooling primitives
    ///
    /// To specify that a primitive should pick an optimized format for the
    /// specified computation parameters, we create memory descriptors with
    /// memory format set to @ref dnnl::memory::format_tag::any.
    ///
    /// This approach works only for a limited set of primitives: convolutions
    /// and inner products. Additionally, @ref dnnl::memory::format_tag::any
    /// can be specified for destination memory descriptors which implies that
    /// destination will have the same memory format as the source.
    ///
    /// @snippet memory_format_propagation.cpp Create placeholder memory descriptors
    // [Create placeholder memory descriptors]
    // Tensor and kernel dimensions. We use the same 3x3 kernel with padding=1
    // for both convolution and pooling primitives, which means that the
    // activation tensor shapes do not change.
    const int N = 1, H = 14, W = 14, IC = 128, OC = 256, KH = 3, KW = 3;
    auto conv_src_md = memory::desc({N, IC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_weights_md = memory::desc(
            {OC, IC, KH, KW}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    auto conv_dst_md = memory::desc({N, OC, H, W}, memory::data_type::f32,
            memory::format_tag::any // let convolution choose memory format
    );
    const auto &pool_dst_md = conv_dst_md; // shape does not change
    // [Create placeholder memory descriptors]

    /// @page memory_format_propagation_cpp
    ///
    /// Next, we pass the memory descriptors to primitive descriptors
    /// constructors.
    ///
    /// @snippet memory_format_propagation.cpp Create convolution and pooling primitive descriptors
    // [Create convolution and pooling primitive descriptors]
    auto conv_pd = convolution_forward::primitive_desc(
            eng, prop_kind::forward_inference, algorithm::convolution_auto,
            conv_src_md, conv_weights_md,
            conv_dst_md, // shape information
            {1, 1}, // strides
            {1, 1}, {1, 1} // left and right padding
    );

    // The pooling source descriptor is conv_pd.dst_desc(), so the memory
    // format chosen by the convolution propagates into the pooling primitive.
    auto pool_pd
            = pooling_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::pooling_max, conv_pd.dst_desc(),
                    pool_dst_md, // shape information
                    {1, 1}, {KH, KW}, // strides and kernel
                    {0, 0}, // dilation (none)
                    {1, 1}, {1, 1} // left and right padding
            );
    // [Create convolution and pooling primitive descriptors]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub3 Create source and destination memory objects
    ///
    /// We assume that the 'user' source and destination memory format is
    /// NCHW. Since there is no result validation in this tutorial, we do not
    /// bother with filling the data with some values and let oneDNN
    /// allocate the memory.
    ///
    /// @snippet memory_format_propagation.cpp Create source and destination memory objects
    // [Create source and destination memory objects]
    auto src_mem = memory(
            {{N, IC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    auto weights_mem = memory({{OC, IC, KH, KW}, memory::data_type::f32,
                                      memory::format_tag::oihw},
            eng);
    auto dst_mem = memory(
            {{N, OC, H, W}, memory::data_type::f32, memory::format_tag::nchw},
            eng);
    // [Create source and destination memory objects]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub4 Determine if source and destination need to be reordered
    ///
    /// The idiomatic way to check if a reorder is necessary between the memory
    /// format expected by a primitive (the convolution in our case) and the
    /// available memory format is to compare the corresponding memory
    /// descriptors.
    ///
    /// @snippet memory_format_propagation.cpp Determine if source needs to be reordered
    // [Determine if source needs to be reordered]
    bool need_reorder_src = conv_pd.src_desc() != src_mem.get_desc();
    // [Determine if source needs to be reordered]

    /// @page memory_format_propagation_cpp
    ///
    /// @warning It is by design that it is not possible to just compare
    /// memory tags. The reason behind this is that memory format tags only
    /// provide a partial description of how data is laid out in memory and do
    /// not, for example, describe memory objects obtained via sub-memory
    /// constructor.
    ///
    /// We repeat the process for the weights and destination memory format
    /// descriptors as well.
    ///
    /// @snippet memory_format_propagation.cpp Determine if weights and destination need to be reordered
    // [Determine if weights and destination need to be reordered]
    bool need_reorder_weights
            = conv_pd.weights_desc() != weights_mem.get_desc();
    bool need_reorder_dst = conv_pd.dst_desc() != dst_mem.get_desc();
    // [Determine if weights and destination need to be reordered]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub45 Allocate intermediate buffers if necessary
    ///
    /// Based on the flags computed before, we can now decide if we need extra
    /// intermediate buffers to hold the source and weights data for the
    /// convolution and the output of the pooling.
    ///
    /// Memory objects for the intermediate buffers are created based on the
    /// memory descriptors obtained from the primitive descriptors to ensure
    /// consistency.
    ///
    /// @snippet memory_format_propagation.cpp Allocate intermediate buffers if necessary
    // [Allocate intermediate buffers if necessary]
    // If no reorder is needed, the user's memory object is used directly;
    // otherwise a buffer with the primitive's preferred format is allocated.
    auto conv_src_mem
            = need_reorder_src ? memory(conv_pd.src_desc(), eng) : src_mem;
    auto conv_weights_mem = need_reorder_weights
            ? memory(conv_pd.weights_desc(), eng)
            : weights_mem;
    auto conv_dst_mem = memory(conv_pd.dst_desc(), eng);
    auto pool_dst_mem
            = need_reorder_dst ? memory(pool_pd.dst_desc(), eng) : dst_mem;
    // [Allocate intermediate buffers if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub5 Perform reorders for source data if necessary
    ///
    /// Now we get to the part where we actually start executing things. We
    /// check if reorders are necessary based on the flags computed before and
    /// create and execute them immediately.
    ///
    /// @note We call @ref dnnl::stream::wait() before reorder primitives
    /// get out of scope and destroyed to accommodate for potentially
    /// asynchronous execution.
    ///
    /// @snippet memory_format_propagation.cpp Perform reorders for source data if necessary
    // [Perform reorders for source data if necessary]
    if (need_reorder_src) {
        auto reorder_src = reorder(src_mem, conv_src_mem);
        reorder_src.execute(
                s, {{DNNL_ARG_FROM, src_mem}, {DNNL_ARG_TO, conv_src_mem}});
        s.wait(); // wait for the reorder to complete
    }

    if (need_reorder_weights) {
        auto reorder_weights = reorder(weights_mem, conv_weights_mem);
        reorder_weights.execute(s,
                {{DNNL_ARG_FROM, weights_mem},
                        {DNNL_ARG_TO, conv_weights_mem}});
        s.wait(); // wait for the reorder to complete
    }
    // [Perform reorders for source data if necessary]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub6 Create and execute convolution and pooling primitives
    ///
    /// After the reorders, we are now ready to compute convolution and
    /// pooling.
    ///
    /// @snippet memory_format_propagation.cpp Create and execute convolution and pooling primitives
    // NOTE(review): the scratchpad memory objects below are created but never
    // passed to execute(); with the default (library-managed) scratchpad
    // attribute they look unnecessary -- confirm whether they can be removed.
    // [Create and execute convolution and pooling primitives]
    auto conv_scratchpad_mem = memory(conv_pd.scratchpad_desc(), eng);
    auto conv = convolution_forward(conv_pd);
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src_mem}, {DNNL_ARG_WEIGHTS, conv_weights_mem},
                    {DNNL_ARG_DST, conv_dst_mem}});
    auto pool_scratchpad_mem = memory(pool_pd.scratchpad_desc(), eng);
    auto pool = pooling_forward(pool_pd);
    // The convolution destination feeds the pooling directly: pool_pd was
    // created with conv_pd.dst_desc() as its source, so no reorder is needed.
    pool.execute(
            s, {{DNNL_ARG_SRC, conv_dst_mem}, {DNNL_ARG_DST, pool_dst_mem}});
    s.wait();
    // [Create and execute convolution and pooling primitives]

    /// @page memory_format_propagation_cpp
    /// @subsection memory_format_propagation_sub7 Reorder destination data if necessary
    ///
    /// The only potentially remaining operation is a reorder from the pooling
    /// destination memory object to the user's one. Similarly to the
    /// reorders for the source and weights memory objects, it is performed
    /// depending on the value of the previously computed flag.
    ///
    /// @snippet memory_format_propagation.cpp Reorder destination data if necessary
    // [Reorder destination data if necessary]
    if (need_reorder_dst) {
        auto reorder_dst = reorder(pool_dst_mem, dst_mem);
        reorder_dst.execute(
                s, {{DNNL_ARG_FROM, pool_dst_mem}, {DNNL_ARG_TO, dst_mem}});
        s.wait();
    }
    // [Reorder destination data if necessary]
}
315 | |
316 | int main(int argc, char **argv) { |
317 | return handle_example_errors( |
318 | memory_format_propagation_tutorial, parse_engine_kind(argc, argv)); |
319 | } |
320 | |
321 | /// @page memory_format_propagation_cpp |
322 | /// @subsection memory_format_propagation_results Results |
323 | /// |
/// Upon compiling and running the example, the output should be just:
325 | /// |
326 | /// ~~~sh |
327 | /// Example passed. |
328 | /// ~~~ |
329 | /// |
330 | /// It may be interesting to check what really happens during the run. We can |
331 | /// use `ONEDNN_VERBOSE` environment variable for that (see also @ref |
332 | /// dev_guide_verbose). Here's an example output: |
333 | /// |
334 | /// ~~~sh |
335 | /// $ ONEDNN_VERBOSE=1 ./memory-format-propagation-cpp |
336 | /// onednn_verbose,info,oneDNN <ver> (Git Hash <hash>) |
337 | /// onednn_verbose,info,cpu,runtime:OpenMP |
338 | /// onednn_verbose,info,cpu,isa:Intel AVX2 |
339 | /// onednn_verbose,info,gpu,runtime:none |
340 | /// onednn_verbose,exec,cpu,reorder,jit:uni,undef, |
341 | /// src_f32::blocked:abcd:f0 dst_f32::blocked:aBcd8b:f0,,,1x128x14x14,0.326904 |
342 | /// onednn_verbose,exec,cpu,reorder,jit:uni,undef, |
343 | /// src_f32::blocked:abcd:f0 dst_f32::blocked:ABcd8b8a:f0,,,256x128x3x3,0.244141 |
344 | /// onednn_verbose,exec,cpu,convolution,jit:avx2,forward_inference, |
345 | /// src_f32::blocked:aBcd8b:f0 wei_f32::blocked:ABcd8b8a:f0 bia_undef::undef::f0 dst_f32::blocked:aBcd8b:f0,, |
346 | /// alg:convolution_direct,mb1_ic128oc256_ih14oh14kh3sh1dh0ph1_iw14ow14kw3sw1dw0pw1,1.20312 |
347 | /// onednn_verbose,exec,cpu,pooling,jit:avx,forward_inference, |
348 | /// src_f32::blocked:aBcd8b:f0 dst_f32::blocked:aBcd8b:f0 ws_undef::undef::f0,, |
349 | /// alg:pooling_max,mb1ic256_ih14oh14kh3sh1ph1_iw14ow14kw3sw1pw1,0.187012 |
350 | /// onednn_verbose,exec,cpu,reorder,jit:uni,undef, |
351 | /// src_f32::blocked:aBcd8b:f0 dst_f32::blocked:abcd:f0,,,1x256x14x14,0.0419922 |
352 | /// Example passed on CPU. |
353 | /// ~~~ |
354 | /// |
355 | /// From this output we can deduce that: |
356 | /// * The convolution primitive picked up @ref |
357 | /// dnnl::memory::format_tag::aBcd8b optimized memory format for |
358 | /// activations. In this format the channels dimension (denoted by letter B |
359 | /// since it is the second dimension; see also @ref dev_guide_conventions) |
///   is blocked by a factor of 8. Because this memory format is different
///   from the NCHW format the tutorial uses, the source and destination had
362 | /// to be reordered to and from this optimized memory layout. |
363 | /// * The convolution primitive picked up @ref |
364 | /// dnnl::memory::format_tag::ABcd8b8a optimized memory format (output (A) |
365 | /// and input (B) channel dimensions blocked by 8) which we also had to |
366 | /// reorder the initial weights to since they are in the OIHW memory format. |
367 | |