1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/// @example cross_engine_reorder.cpp
18/// @copybrief cross_engine_reorder_cpp
19/// > Annotated version: @ref cross_engine_reorder_cpp
20
21/// @page cross_engine_reorder_cpp Reorder between CPU and GPU engines
22/// This C++ API example demonstrates programming flow when reordering memory
23/// between CPU and GPU engines.
24///
25/// > Example code: @ref cross_engine_reorder.cpp
26///
27/// @section cross_engine_reorder_cpp_headers Public headers
28///
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. We also include example_utils.hpp, which
/// contains utility functions such as transferring data between user buffers
/// and oneDNN memory objects.
33///
34/// All C++ API types and functions reside in the `dnnl` namespace.
35/// For simplicity of the example we import this namespace.
36/// @page cross_engine_reorder_cpp
37
38#include <iostream>
39#include <stdexcept>
40#include <vector>
41
42/// @snippet cross_engine_reorder.cpp Prologue
43// [Prologue]
44#include "example_utils.hpp"
45#include "oneapi/dnnl/dnnl.hpp"
46
47#include "example_utils.hpp"
48
49using namespace dnnl;
50
51using namespace std;
52// [Prologue]
53
54void fill(memory &mem, const memory::dims &adims) {
55 std::vector<float> array(product(adims));
56 for (size_t e = 0; e < array.size(); ++e) {
57 array[e] = e % 7 ? 1.0f : -1.0f;
58 }
59 write_to_dnnl_memory(array.data(), mem);
60}
61
62int find_negative(memory &mem, const memory::dims &adims) {
63 int negs = 0;
64 size_t nelems = product(adims);
65 std::vector<float> array(nelems);
66 read_from_dnnl_memory(array.data(), mem);
67
68 for (size_t e = 0; e < nelems; ++e)
69 negs += array[e] < 0.0f;
70 return negs;
71}
72
73/// @page cross_engine_reorder_cpp
74/// @section cross_engine_reorder_cpp_tutorial cross_engine_reorder_tutorial() function
75///
void cross_engine_reorder_tutorial() {
    /// @page cross_engine_reorder_cpp
    /// @subsection cross_engine_reorder_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device they are attached
    /// to, and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create engines, we must specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind. There is only one CPU
    /// engine and one GPU engine, so the index for both engines must be 0.
    ///
    /// @snippet cross_engine_reorder.cpp Initialize engine
    // [Initialize engine]
    auto cpu_engine = engine(validate_engine_kind(engine::kind::cpu), 0);
    auto gpu_engine = engine(validate_engine_kind(engine::kind::gpu), 0);
    // [Initialize engine]

    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// In this example, a GPU stream is created.
    ///
    /// @snippet cross_engine_reorder.cpp Initialize stream
    // [Initialize stream]
    auto stream_gpu = stream(gpu_engine, stream::flags::in_order);
    // [Initialize stream]

    /// @subsection cross_engine_reorder_cpp_sub2 Wrapping data into oneDNN GPU memory object
    /// Fill the data in CPU memory first, and then move data from CPU to GPU
    /// memory by reorder.
    /// @snippet cross_engine_reorder.cpp reorder cpu2gpu
    // [reorder cpu2gpu]
    const auto tz = memory::dims {2, 16, 1, 1};
    auto m_cpu
            = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw},
                    cpu_engine);
    auto m_gpu
            = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw},
                    gpu_engine);
    fill(m_cpu, tz);
    // The reorder primitive takes both engines from the memory objects
    // themselves: here source is CPU memory and destination is GPU memory.
    auto r1 = reorder(m_cpu, m_gpu);
    // [reorder cpu2gpu]

    /// @subsection cross_engine_reorder_cpp_sub3 Creating a ReLU primitive
    ///
    /// Let's now create a ReLU primitive for GPU.
    ///
    /// The library implements the ReLU primitive as a particular algorithm of a
    /// more general @ref dev_guide_eltwise primitive, which applies a specified
    /// function to each element of the source tensor.
    ///
    /// Just as in the case of @ref dnnl::memory, a user should always go
    /// through (at least) two creation steps (which, however, can sometimes
    /// be combined thanks to C++11):
    /// 1. Create an operation primitive descriptor (here @ref
    ///    dnnl::eltwise_forward::primitive_desc) that defines the operation
    ///    parameters including a GPU memory descriptor, and GPU engine.
    ///    Primitive descriptor is a **lightweight** descriptor of the actual
    ///    algorithm that **implements** the given operation.
    /// 2. Create a primitive (here @ref dnnl::eltwise_forward) that can be
    ///    executed on GPU memory objects to compute the operation by a GPU
    ///    engine.
    ///
    /// @note
    ///    Primitive creation might be a very expensive operation, so consider
    ///    creating primitive objects once and executing them multiple times.
    ///
    /// The code:
    /// @snippet cross_engine_reorder.cpp Create a ReLU primitive
    // [Create a ReLU primitive]
    // ReLU primitive descriptor, which corresponds to a particular
    // implementation in the library. Specify engine type for the ReLU
    // primitive. Use a GPU engine here.
    auto relu_pd = eltwise_forward::primitive_desc(gpu_engine,
            prop_kind::forward, algorithm::eltwise_relu, m_gpu.get_desc(),
            m_gpu.get_desc(), 0.0f);
    // ReLU primitive
    auto relu = eltwise_forward(relu_pd);
    // [Create a ReLU primitive]

    /// @subsection cross_engine_reorder_cpp_sub4 Getting results from a oneDNN GPU memory object
    /// After the ReLU operation, users need to get data from GPU to CPU memory
    /// by reorder.
    /// @snippet cross_engine_reorder.cpp reorder gpu2cpu
    // [reorder gpu2cpu]
    // Same memory objects as r1, but in the opposite direction: GPU to CPU.
    auto r2 = reorder(m_gpu, m_cpu);
    // [reorder gpu2cpu]

    /// @subsection cross_engine_reorder_cpp_sub5 Executing all primitives
    ///
    /// Finally, let's execute all primitives and wait for their completion
    /// via the following sequence:
    ///
    /// Reorder(CPU,GPU) -> ReLU -> Reorder(GPU,CPU).
    ///
    /// 1. After execution of the first Reorder, ReLU has source data in GPU.
    ///
    /// 2. The input and output memory objects are passed to the ReLU
    ///    `execute()` method using a <tag, memory> map. Each tag specifies what
    ///    kind of tensor each memory object represents. All @ref dev_guide_eltwise
    ///    primitives require the map to have two elements: a source memory
    ///    object (input) and a destination memory (output). For executing
    ///    on GPU engine, both source and destination memory object must use
    ///    GPU memory.
    ///
    /// 3. After the execution of the ReLU on GPU, the second Reorder moves
    ///    the results from GPU to CPU.
    ///
    /// @note
    ///    All primitives are executed in the SAME GPU stream (the first
    ///    parameter of the `execute()` method).
    ///
    /// Execution is asynchronous on GPU. This means that we need to call @ref
    /// dnnl::stream::wait before accessing the results.
    ///
    /// @snippet cross_engine_reorder.cpp Execute primitives
    // [Execute primitives]
    // wrap source data from CPU to GPU
    r1.execute(stream_gpu, m_cpu, m_gpu);
    // Execute ReLU on a GPU stream
    relu.execute(stream_gpu, {{DNNL_ARG_SRC, m_gpu}, {DNNL_ARG_DST, m_gpu}});
    // Get result data from GPU to CPU
    r2.execute(stream_gpu, m_gpu, m_cpu);

    stream_gpu.wait();
    // [Execute primitives]

    /// @page cross_engine_reorder_cpp
    /// @subsection cross_engine_reorder_cpp_sub6 Validate the result
    ///
    /// Now that we have computed the result in CPU memory, let's validate
    /// that it is actually correct.
    ///
    /// @snippet cross_engine_reorder.cpp Check the results
    // [Check the results]
    if (find_negative(m_cpu, tz) != 0)
        throw std::logic_error(
                "Unexpected output, find a negative value after the ReLU "
                "execution.");
    // [Check the results]
}
224
225int main(int argc, char **argv) {
226 return handle_example_errors({engine::kind::cpu, engine::kind::gpu},
227 cross_engine_reorder_tutorial);
228}
229
230/// @page cross_engine_reorder_cpp
231///
232/// <b></b>
233///
234/// Upon compiling and running the example, the output should be just:
235///
236/// ~~~
237/// Example passed.
238/// ~~~
239///
240