1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/// @example getting_started.cpp
18/// @copybrief getting_started_cpp
19/// > Annotated version: @ref getting_started_cpp
20
#include <cmath>
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"
#include "oneapi/dnnl/dnnl_debug.h"

#include "example_utils.hpp"
30
31using namespace dnnl;
32// [Prologue]
33
34/// @page getting_started_cpp Getting started
35///
36/// This C++ API example demonstrates the basics of the oneDNN programming model.
37///
38/// > Example code: @ref getting_started.cpp
39///
40/// This C++ API example demonstrates the basics of the oneDNN programming model:
41/// - How to create oneDNN memory objects.
42/// - How to get data from the user's buffer into a oneDNN memory object.
43/// - How a tensor's logical dimensions and memory object formats relate.
44/// - How to create oneDNN primitives.
45/// - How to execute the primitives.
46///
47/// The example uses the ReLU operation and comprises the following steps:
48/// 1. Creating @ref getting_started_cpp_sub1 to execute a primitive.
49/// 2. Performing @ref getting_started_cpp_sub2.
50/// 3. @ref getting_started_cpp_sub3 (using different flavors).
51/// 4. @ref getting_started_cpp_sub4.
52/// 5. @ref getting_started_cpp_sub5.
53/// 6. @ref getting_started_cpp_sub6 (checking that the resulting image does
54/// not contain negative values).
55///
56/// These steps are implemented in the @ref getting_started_cpp_tutorial, which
57/// in turn is called from @ref getting_started_cpp_main (which is also
58/// responsible for error handling).
59///
60/// @section getting_started_cpp_headers Public headers
61///
62/// To start using oneDNN we must first include the @ref dnnl.hpp
63/// header file in the program. We also include @ref dnnl_debug.h in
64/// example_utils.hpp, which contains some debugging facilities like returning
65/// a string representation for common oneDNN C types.
66
67// [Prologue]
68
69/// @page getting_started_cpp
70/// @section getting_started_cpp_tutorial getting_started_tutorial() function
71///
void getting_started_tutorial(engine::kind engine_kind) {
    /// @page getting_started_cpp
    /// @subsection getting_started_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device they are attached
    /// to and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create an engine, we should specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind.
    ///
    /// @snippet getting_started.cpp Initialize engine
    // [Initialize engine]
    engine eng(engine_kind, 0);
    // [Initialize engine]

    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// The creation is pretty straightforward:
    /// @snippet getting_started.cpp Initialize stream
    // [Initialize stream]
    stream engine_stream(eng);
    // [Initialize stream]

    /// In the simple cases, when a program works with one device only (e.g.
    /// only on CPU), an engine and a stream can be created once and used
    /// throughout the program. Some frameworks create singleton objects that
    /// hold oneDNN engine and stream and use them throughout the code.

    /// @subsection getting_started_cpp_sub2 Data preparation (code outside of oneDNN)
    ///
    /// Now that the preparation work is done, let's create some data to work
    /// with. We will create a 4D tensor in NHWC format, which is quite
    /// popular in many frameworks.
    ///
    /// Note that even though we work with one image only, the image tensor
    /// is still 4D. The extra dimension (here N) corresponds to the
    /// batch, and, in case of a single image, is equal to 1. It is pretty
    /// typical to have the batch dimension even when working with a single
    /// image.
    ///
    /// In oneDNN, all CNN primitives assume that tensors have the batch
    /// dimension, which is always the first logical dimension (see also @ref
    /// dev_guide_conventions).
    ///
    /// @snippet getting_started.cpp Create user's data
    // [Create user's data]
    const int N = 1, H = 13, W = 13, C = 3;

    // Compute physical strides for each dimension
    const int stride_N = H * W * C;
    const int stride_H = W * C;
    const int stride_W = C;
    const int stride_C = 1;

    // An auxiliary function that maps logical index to the physical offset
    auto offset = [=](int n, int h, int w, int c) {
        return n * stride_N + h * stride_H + w * stride_W + c * stride_C;
    };

    // The image size
    const int image_size = N * H * W * C;

    // Allocate a buffer for the image
    std::vector<float> image(image_size);

    // Initialize the image with some values (a cosine wave over the flat
    // offset, negated so that the image contains negative entries for ReLU
    // to clamp)
    for (int n = 0; n < N; ++n)
        for (int h = 0; h < H; ++h)
            for (int w = 0; w < W; ++w)
                for (int c = 0; c < C; ++c) {
                    int off = offset(
                            n, h, w, c); // Get the physical offset of a pixel
                    image[off] = -std::cos(off / 10.f);
                }
    // [Create user's data]
    /// @subsection getting_started_cpp_sub3 Wrapping data into a oneDNN memory object
    ///
    /// Now, having the image ready, let's wrap it in a @ref dnnl::memory
    /// object to be able to pass the data to oneDNN primitives.
    ///
    /// Creating @ref dnnl::memory comprises two steps:
    /// 1. Initializing the @ref dnnl::memory::desc struct (also referred to
    ///    as a memory descriptor), which only describes the tensor data and
    ///    doesn't contain the data itself. Memory descriptors are used to
    ///    create @ref dnnl::memory objects and to initialize primitive
    ///    descriptors (shown later in the example).
    /// 2. Creating the @ref dnnl::memory object itself (also referred to as
    ///    a memory object), based on the memory descriptor initialized in
    ///    step 1, an engine, and, optionally, a handle to data. The
    ///    memory object is used when a primitive is executed.
    ///
    /// Thanks to the
    /// [list initialization](https://en.cppreference.com/w/cpp/language/list_initialization)
    /// introduced in C++11, it is possible to combine these two steps whenever
    /// a memory descriptor is not used anywhere else but in creating a @ref
    /// dnnl::memory object.
    ///
    /// However, for the sake of demonstration, we will show both steps
    /// explicitly.

    /// @subsubsection getting_started_cpp_sub31 Memory descriptor
    ///
    /// To initialize the @ref dnnl::memory::desc, we need to pass:
    /// 1. The tensor's dimensions, **the semantic order** of which is
    ///    defined by **the primitive** that will use this memory
    ///    (descriptor).
    ///
    ///    @warning
    ///        Memory descriptors and objects are not aware of any meaning of
    ///        the data they describe or contain.
    /// 2. The data type for the tensor (@ref dnnl::memory::data_type).
    /// 3. The memory format tag (@ref dnnl::memory::format_tag) that
    ///    describes how the data is going to be laid out in the device's
    ///    memory. The memory format is required for the primitive to
    ///    correctly handle the data.
    ///
    /// The code:
    /// @snippet getting_started.cpp Init src_md
    // [Init src_md]
    auto src_md = memory::desc(
            {N, C, H, W}, // logical dims, the order is defined by a primitive
            memory::data_type::f32, // tensor's data type
            memory::format_tag::nhwc // memory format, NHWC in this case
    );
    // [Init src_md]

    /// The first thing to notice here is that we pass dimensions as `{N, C,
    /// H, W}` while it might seem more natural to pass `{N, H, W, C}`, which
    /// better corresponds to the user's code. This is because oneDNN
    /// CNN primitives like ReLU always expect tensors in the following form:
    ///
    /// | Spatial dim | Tensor dimensions
    /// | :--         | :--
    /// | 0D          | \f$N \times C\f$
    /// | 1D          | \f$N \times C \times W\f$
    /// | 2D          | \f$N \times C \times H \times W\f$
    /// | 3D          | \f$N \times C \times D \times H \times W\f$
    ///
    /// where:
    /// - \f$N\f$ is a batch dimension (discussed above),
    /// - \f$C\f$ is channel (aka feature maps) dimension, and
    /// - \f$D\f$, \f$H\f$, and \f$W\f$ are spatial dimensions.
    ///
    /// Now that the logical order of dimension is defined, we need to specify
    /// the memory format (the third parameter), which describes how logical
    /// indices map to the offset in memory. This is the place where the user's
    /// format NHWC comes into play. oneDNN has different @ref
    /// dnnl::memory::format_tag values that cover the most popular memory
    /// formats like NCHW, NHWC, CHWN, and some others.
    ///
    /// The memory descriptor for the image is called `src_md`. The `src` part
    /// comes from the fact that the image will be a source for the ReLU
    /// primitive (that is, we formulate memory names from the primitive
    /// perspective; hence we will use `dst` to name the output memory). The
    /// `md` is an initialism for Memory Descriptor.

    /// @paragraph getting_started_cpp_sub311 Alternative way to create a memory descriptor
    ///
    /// Before we continue with memory creation, let us show the alternative
    /// way to create the same memory descriptor: instead of using the
    /// @ref dnnl::memory::format_tag, we can directly specify the strides
    /// of each tensor dimension:
    /// @snippet getting_started.cpp Init alt_src_md
    // [Init alt_src_md]
    auto alt_src_md = memory::desc(
            {N, C, H, W}, // logical dims, the order is defined by a primitive
            memory::data_type::f32, // tensor's data type
            {stride_N, stride_C, stride_H, stride_W} // the strides
    );

    // Sanity check: the memory descriptors should be the same
    if (src_md != alt_src_md)
        throw std::logic_error("Memory descriptor initialization mismatch.");
    // [Init alt_src_md]

    /// Just as before, the tensor's dimensions come in the `N, C, H, W` order
    /// as required by CNN primitives. To define the physical memory format,
    /// the strides are passed as the third parameter. Note that the order of
    /// the strides corresponds to the order of the tensor's dimensions.
    ///
    /// @warning
    ///     Using the wrong order might lead to incorrect results or even a
    ///     crash.

    /// @subsubsection getting_started_cpp_sub32 Creating a memory object
    ///
    /// Having a memory descriptor and an engine prepared, let's create
    /// input and output memory objects for a ReLU primitive.
    /// @snippet getting_started.cpp Create memory objects
    // [Create memory objects]
    // src_mem contains a copy of image after write_to_dnnl_memory function
    auto src_mem = memory(src_md, eng);
    write_to_dnnl_memory(image.data(), src_mem);

    // For dst_mem the library allocates buffer
    auto dst_mem = memory(src_md, eng);
    // [Create memory objects]

    /// For both memory objects we use the constructor that instructs the
    /// library to allocate the underlying buffer, and then copy the user's
    /// data into `src_mem` with the `write_to_dnnl_memory()` helper from
    /// example_utils.hpp, which works for any engine kind.
    ///
    /// Alternatively, for data residing in host memory, the
    /// @ref dnnl::memory::memory(const dnnl::memory::desc &, const dnnl::engine &, void *)
    /// constructor that takes a buffer pointer as its last argument could be
    /// used to adopt the user's buffer directly.
    ///
    /// The key differences between a library-allocated buffer and a
    /// user-provided one are:
    /// 1. The library will own the memory for `dst_mem` and will deallocate
    ///    it when `dst_mem` is destroyed. That means the memory buffer can
    ///    be used only while `dst_mem` is alive.
    /// 2. Library-allocated buffers have good alignment, which typically
    ///    results in better performance.
    ///
    /// @note
    ///     Memory allocated outside of the library and passed to oneDNN
    ///     should have good alignment for better performance.
    ///
    /// In the subsequent section we will show how to get the buffer (pointer)
    /// from the `dst_mem` memory object.
    /// @subsection getting_started_cpp_sub4 Creating a ReLU primitive
    ///
    /// Let's now create a ReLU primitive.
    ///
    /// The library implements ReLU primitive as a particular algorithm of a
    /// more general @ref dev_guide_eltwise primitive, which applies a specified
    /// function to each and every element of the source tensor.
    ///
    /// Just as in the case of @ref dnnl::memory, a user should always go
    /// through (at least) two creation steps (which however, can be sometimes
    /// combined thanks to C++11):
    /// 1. Create an operation primitive descriptor (here @ref
    ///    dnnl::eltwise_forward::primitive_desc) that defines operation
    ///    parameters and is a **lightweight** descriptor of the actual
    ///    algorithm that **implements** the given operation.
    ///    The user can query different characteristics of the chosen
    ///    implementation such as memory consumptions and some others that will
    ///    be covered in the next topic (@ref memory_format_propagation_cpp).
    /// 2. Create a primitive (here @ref dnnl::eltwise_forward) that can be
    ///    executed on memory objects to compute the operation.
    ///
    /// oneDNN separates these two steps to enable the user to inspect details
    /// of a primitive implementation prior to creating the primitive. This may
    /// be expensive, because, for example, oneDNN generates the optimized
    /// computational code on the fly.
    ///
    /// @note
    ///     Primitive creation might be a very expensive operation, so consider
    ///     creating primitive objects once and executing them multiple times.
    ///
    /// The code:
    /// @snippet getting_started.cpp Create a ReLU primitive
    // [Create a ReLU primitive]
    // ReLU primitive descriptor, which corresponds to a particular
    // implementation in the library
    auto relu_pd = eltwise_forward::primitive_desc(
            eng, // an engine the primitive will be created for
            prop_kind::forward_inference, algorithm::eltwise_relu,
            src_md, // source memory descriptor for an operation to work on
            src_md, // destination memory descriptor for an operation to work on
            0.f, // alpha parameter means negative slope in case of ReLU
            0.f // beta parameter is ignored in case of ReLU
    );

    // ReLU primitive
    auto relu = eltwise_forward(relu_pd); // !!! this can take quite some time
    // [Create a ReLU primitive]

    /// A note about variable names. Similar to the `_md` suffix used for
    /// memory descriptors, we use `_pd` for the primitive descriptors, and
    /// no suffix for primitives themselves.
    ///
    /// It is worth mentioning that we specified the exact tensor and its
    /// memory format when we were initializing the `relu_pd`. That means
    /// `relu` primitive would perform computations with memory objects that
    /// correspond to this description. This is the one and only one way of
    /// creating non-compute-intensive primitives like @ref dev_guide_eltwise,
    /// @ref dev_guide_batch_normalization, and others.
    ///
    /// Compute-intensive primitives (like @ref dev_guide_convolution) have an
    /// ability to define the appropriate memory format on their own. This is
    /// one of the key features of the library and will be discussed in detail
    /// in the next topic: @ref memory_format_propagation_cpp.

    /// @subsection getting_started_cpp_sub5 Executing the ReLU primitive
    ///
    /// Finally, let's execute the primitive and wait for its completion.
    ///
    /// The input and output memory objects are passed to the `execute()`
    /// method using a <tag, memory> map. Each tag specifies what kind of
    /// tensor each memory object represents. All @ref dev_guide_eltwise
    /// primitives require the map to have two elements: a source memory
    /// object (input) and a destination memory (output).
    ///
    /// A primitive is executed in a stream (the first parameter of the
    /// `execute()` method). Depending on a stream kind, an execution might be
    /// blocking or non-blocking. This means that we need to call @ref
    /// dnnl::stream::wait before accessing the results.
    ///
    /// @snippet getting_started.cpp Execute ReLU primitive
    // [Execute ReLU primitive]
    // Execute ReLU (out-of-place)
    relu.execute(engine_stream, // The execution stream
            {
                    // A map with all inputs and outputs
                    {DNNL_ARG_SRC, src_mem}, // Source tag and memory obj
                    {DNNL_ARG_DST, dst_mem}, // Destination tag and memory obj
            });

    // Wait for the stream to complete the execution
    engine_stream.wait();
    // [Execute ReLU primitive]

    /// The @ref dev_guide_eltwise is one of the primitives that support
    /// in-place operations, meaning that the source and destination memory can
    /// be the same. To perform in-place transformation, the user must pass the
    /// same memory object for both the `DNNL_ARG_SRC` and
    /// `DNNL_ARG_DST` tags:
    /// @snippet getting_started.cpp Execute ReLU primitive in-place
    // [Execute ReLU primitive in-place]
    // Execute ReLU (in-place)
    // relu.execute(engine_stream, {
    //          {DNNL_ARG_SRC, src_mem},
    //          {DNNL_ARG_DST, src_mem},
    //         });
    // [Execute ReLU primitive in-place]

    /// @page getting_started_cpp
    /// @subsection getting_started_cpp_sub6 Obtaining the result and validation
    ///
    /// Now that we have the computed result, let's validate that it is
    /// actually correct. The result is stored in the `dst_mem` memory object.
    /// So we need to obtain the C++ pointer to a buffer with data via @ref
    /// dnnl::memory::get_data_handle() and cast it to the proper data type
    /// as shown below.
    ///
    /// @warning
    ///     The @ref dnnl::memory::get_data_handle() returns a raw handle
    ///     to the buffer, the type of which is engine specific. For the CPU
    ///     engine the buffer is always a pointer to `void`, which can safely
    ///     be used. However, for engines other than CPU the handle might be
    ///     runtime-specific type, such as `cl_mem` in case of GPU/OpenCL.
    ///
    /// @snippet getting_started.cpp Check the results
    // [Check the results]
    // Obtain a buffer for the `dst_mem` and cast it to `float *`.
    // This is safe since we created `dst_mem` as f32 tensor with known
    // memory format.
    std::vector<float> relu_image(image_size);
    read_from_dnnl_memory(relu_image.data(), dst_mem);
    /*
    // Check the results
    for (int n = 0; n < N; ++n)
        for (int h = 0; h < H; ++h)
            for (int w = 0; w < W; ++w)
                for (int c = 0; c < C; ++c) {
                    int off = offset(
                            n, h, w, c); // get the physical offset of a pixel
                    float expected = image[off] < 0
                            ? 0.f
                            : image[off]; // expected value
                    if (relu_image[off] != expected) {
                        std::cout << "At index(" << n << ", " << c << ", " << h
                                  << ", " << w << ") expect " << expected
                                  << " but got " << relu_image[off]
                                  << std::endl;
                        throw std::logic_error("Accuracy check failed.");
                    }
                }
    // [Check the results]
    */
}
452
453/// @page getting_started_cpp
454///
455/// @section getting_started_cpp_main main() function
456///
457/// We now just call everything we prepared earlier.
458///
459/// Because we are using the oneDNN C++ API, we use exceptions to handle errors
460/// (see @ref dev_guide_c_and_cpp_apis).
461/// The oneDNN C++ API throws exceptions of type @ref dnnl::error,
462/// which contains the error status (of type @ref dnnl_status_t) and a
463/// human-readable error message accessible through regular `what()` method.
464/// @page getting_started_cpp
465/// @snippet getting_started.cpp Main
466// [Main]
467int main(int argc, char **argv) {
468 int exit_code = 0;
469
470 engine::kind engine_kind = parse_engine_kind(argc, argv);
471 try {
472 getting_started_tutorial(engine_kind);
473 } catch (dnnl::error &e) {
474 std::cout << "oneDNN error caught: " << std::endl
475 << "\tStatus: " << dnnl_status2str(e.status) << std::endl
476 << "\tMessage: " << e.what() << std::endl;
477 exit_code = 1;
478 } catch (std::string &e) {
479 std::cout << "Error in the example: " << e << "." << std::endl;
480 exit_code = 2;
481 }
482
483 std::cout << "Example " << (exit_code ? "failed" : "passed") << " on "
484 << engine_kind2str_upper(engine_kind) << "." << std::endl;
485 return exit_code;
486}
487// [Main]
488
489/// @page getting_started_cpp
490///
491/// <b></b>
492///
/// Upon compiling and running the example, the output should be just:
494///
495/// ~~~
496/// Example passed.
497/// ~~~
498///
499/// Users are encouraged to experiment with the code to familiarize themselves
500/// with the concepts. In particular, one of the changes that might be of
501/// interest is to spoil some of the library calls to check how error handling
502/// happens. For instance, if we replace
503///
504/// ~~~cpp
505/// relu.execute(engine_stream, {
506/// {DNNL_ARG_SRC, src_mem},
507/// {DNNL_ARG_DST, dst_mem},
508/// });
509/// ~~~
510///
511/// with
512///
513/// ~~~cpp
514/// relu.execute(engine_stream, {
515/// {DNNL_ARG_SRC, src_mem},
516/// // {DNNL_ARG_DST, dst_mem}, // Oops, forgot about this one
517/// });
518/// ~~~
519///
520/// we should get the following output:
521///
522/// ~~~
523/// oneDNN error caught:
524/// Status: invalid_arguments
525/// Message: could not execute a primitive
526/// Example failed.
527/// ~~~
528