1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cross_engine_reorder.cpp |
18 | /// @copybrief cross_engine_reorder_cpp |
19 | /// > Annotated version: @ref cross_engine_reorder_cpp |
20 | |
21 | /// @page cross_engine_reorder_cpp Reorder between CPU and GPU engines |
/// This C++ API example demonstrates the programming flow for reordering
/// memory between CPU and GPU engines.
24 | /// |
25 | /// > Example code: @ref cross_engine_reorder.cpp |
26 | /// |
27 | /// @section cross_engine_reorder_cpp_headers Public headers |
28 | /// |
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. The example also includes
/// `example_utils.hpp`, which provides helper functions used below, such as
/// `write_to_dnnl_memory()` and `read_from_dnnl_memory()` for moving data
/// between user buffers and oneDNN memory objects.
33 | /// |
34 | /// All C++ API types and functions reside in the `dnnl` namespace. |
35 | /// For simplicity of the example we import this namespace. |
36 | /// @page cross_engine_reorder_cpp |
37 | |
38 | #include <iostream> |
39 | #include <stdexcept> |
40 | #include <vector> |
41 | |
42 | /// @snippet cross_engine_reorder.cpp Prologue |
43 | // [Prologue] |
44 | #include "example_utils.hpp" |
45 | #include "oneapi/dnnl/dnnl.hpp" |
46 | |
47 | #include "example_utils.hpp" |
48 | |
49 | using namespace dnnl; |
50 | |
51 | using namespace std; |
52 | // [Prologue] |
53 | |
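// Fill a oneDNN memory object with a test pattern: every element whose
// index is a multiple of 7 is set to -1.0f, the rest to 1.0f.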
54 | void fill(memory &mem, const memory::dims &adims) { |
55 | std::vector<float> array(product(adims)); |
56 | for (size_t e = 0; e < array.size(); ++e) { |
57 | array[e] = e % 7 ? 1.0f : -1.0f; |
58 | } |
59 | write_to_dnnl_memory(array.data(), mem); |
60 | } |
61 | |
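// Read a oneDNN memory object back into a host buffer and count the
// negative elements it contains.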
62 | int find_negative(memory &mem, const memory::dims &adims) { |
63 | int negs = 0; |
64 | size_t nelems = product(adims); |
65 | std::vector<float> array(nelems); |
66 | read_from_dnnl_memory(array.data(), mem); |
67 | |
68 | for (size_t e = 0; e < nelems; ++e) |
69 | negs += array[e] < 0.0f; |
70 | return negs; |
71 | } |
72 | |
73 | /// @page cross_engine_reorder_cpp |
74 | /// @section cross_engine_reorder_cpp_tutorial cross_engine_reorder_tutorial() function |
75 | /// |
76 | void cross_engine_reorder_tutorial() { |
77 | /// @page cross_engine_reorder_cpp |
78 | /// @subsection cross_engine_reorder_cpp_sub1 Engine and stream |
79 | /// |
80 | /// All oneDNN primitives and memory objects are attached to a |
81 | /// particular @ref dnnl::engine, which is an abstraction of a |
82 | /// computational device (see also @ref dev_guide_basic_concepts). The |
83 | /// primitives are created and optimized for the device they are attached |
84 | /// to, and the memory objects refer to memory residing on the |
85 | /// corresponding device. In particular, that means neither memory objects |
86 | /// nor primitives that were created for one engine can be used on |
87 | /// another. |
88 | /// |
/// To create engines, we must specify the @ref dnnl::engine::kind
/// and the index of the device of the given kind. There is only one CPU
/// engine, so its index is always 0. This example uses the first GPU
/// device, so the GPU engine index is also 0.
92 | /// |
93 | /// @snippet cross_engine_reorder.cpp Initialize engine |
94 | // [Initialize engine] |
95 | auto cpu_engine = engine(validate_engine_kind(engine::kind::cpu), 0); |
96 | auto gpu_engine = engine(validate_engine_kind(engine::kind::gpu), 0); |
97 | // [Initialize engine] |
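
/// If it is not known in advance whether a device of a given kind is
/// present, the number of available devices can be queried with
/// @ref dnnl::engine::get_count. A minimal sketch (not part of this
/// example's flow):
///
/// ~~~
/// if (engine::get_count(engine::kind::gpu) == 0)
///     throw std::runtime_error("No GPU devices available.");
/// ~~~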
98 | |
99 | /// In addition to an engine, all primitives require a @ref dnnl::stream |
100 | /// for the execution. The stream encapsulates an execution context and is |
101 | /// tied to a particular engine. |
102 | /// |
103 | /// In this example, a GPU stream is created. |
104 | /// |
105 | /// @snippet cross_engine_reorder.cpp Initialize stream |
106 | // [Initialize stream] |
107 | auto stream_gpu = stream(gpu_engine, stream::flags::in_order); |
108 | // [Initialize stream] |
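
/// The flag above requests in-order execution explicitly. If no flags are
/// passed, @ref dnnl::stream::flags::default_flags is used, so the stream
/// could also be created as in this short sketch:
///
/// ~~~
/// auto stream_gpu_default = stream(gpu_engine);
/// ~~~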
109 | |
110 | /// @subsection cross_engine_reorder_cpp_sub2 Wrapping data into oneDNN GPU memory object |
/// Fill the data in CPU memory first, and then move it to GPU memory
/// with a reorder primitive.
113 | /// @snippet cross_engine_reorder.cpp reorder cpu2gpu |
114 | // [reorder cpu2gpu] |
115 | const auto tz = memory::dims {2, 16, 1, 1}; |
116 | auto m_cpu |
117 | = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw}, |
118 | cpu_engine); |
119 | auto m_gpu |
120 | = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw}, |
121 | gpu_engine); |
122 | fill(m_cpu, tz); |
123 | auto r1 = reorder(m_cpu, m_gpu); |
124 | // [reorder cpu2gpu] |
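
/// The `reorder(m_cpu, m_gpu)` convenience constructor above deduces the
/// engines from the memory objects. An equivalent, more explicit sketch
/// goes through a reorder primitive descriptor, which is also the place to
/// attach primitive attributes if they are needed:
///
/// ~~~
/// auto r1_pd = reorder::primitive_desc(
///         cpu_engine, m_cpu.get_desc(), gpu_engine, m_gpu.get_desc());
/// auto r1_explicit = reorder(r1_pd);
/// ~~~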
125 | |
126 | /// @subsection cross_engine_reorder_cpp_sub3 Creating a ReLU primitive |
127 | /// |
128 | /// Let's now create a ReLU primitive for GPU. |
129 | /// |
130 | /// The library implements the ReLU primitive as a particular algorithm of a |
131 | /// more general @ref dev_guide_eltwise primitive, which applies a specified |
132 | /// function to each element of the source tensor. |
133 | /// |
/// Just as in the case of @ref dnnl::memory, a user should always go
/// through (at least) two creation steps (which, however, can sometimes
/// be combined thanks to C++11):
/// 1. Create an operation primitive descriptor (here @ref
///    dnnl::eltwise_forward::primitive_desc) that defines the operation
///    parameters, including the GPU memory descriptors and the GPU engine.
///    A primitive descriptor is a **lightweight** descriptor of the actual
///    algorithm that **implements** the given operation.
/// 2. Create a primitive (here @ref dnnl::eltwise_forward) that can be
///    executed on GPU memory objects to compute the operation on a GPU
///    engine.
145 | /// |
/// @note
147 | /// Primitive creation might be a very expensive operation, so consider |
148 | /// creating primitive objects once and executing them multiple times. |
149 | /// |
150 | /// The code: |
151 | /// @snippet cross_engine_reorder.cpp Create a ReLU primitive |
152 | // [Create a ReLU primitive] |
153 | // ReLU primitive descriptor, which corresponds to a particular |
154 | // implementation in the library. Specify engine type for the ReLU |
155 | // primitive. Use a GPU engine here. |
156 | auto relu_pd = eltwise_forward::primitive_desc(gpu_engine, |
157 | prop_kind::forward, algorithm::eltwise_relu, m_gpu.get_desc(), |
158 | m_gpu.get_desc(), 0.0f); |
159 | // ReLU primitive |
160 | auto relu = eltwise_forward(relu_pd); |
161 | // [Create a ReLU primitive] |
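
/// The primitive descriptor can also be queried after creation, for
/// example to see which implementation the library selected. A small
/// sketch using @ref dnnl::primitive_desc_base::impl_info_str:
///
/// ~~~
/// std::cout << "ReLU implementation: " << relu_pd.impl_info_str()
///           << std::endl;
/// ~~~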
162 | |
163 | /// @subsection cross_engine_reorder_cpp_sub4 Getting results from a oneDNN GPU memory object |
/// After the ReLU operation, the user needs to move the data from GPU
/// memory back to CPU memory with another reorder.
166 | /// @snippet cross_engine_reorder.cpp reorder gpu2cpu |
167 | // [reorder gpu2cpu] |
168 | auto r2 = reorder(m_gpu, m_cpu); |
169 | // [reorder gpu2cpu] |
170 | |
171 | /// @subsection cross_engine_reorder_cpp_sub5 Executing all primitives |
172 | /// |
173 | /// Finally, let's execute all primitives and wait for their completion |
174 | /// via the following sequence: |
175 | /// |
176 | /// Reorder(CPU,GPU) -> ReLU -> Reorder(GPU,CPU). |
177 | /// |
/// 1. After execution of the first Reorder, ReLU has its source data in
///    GPU memory.
179 | /// |
/// 2. The input and output memory objects are passed to the ReLU
///    `execute()` method using a <tag, memory> map. Each tag specifies what
///    kind of tensor each memory object represents. All @ref dev_guide_eltwise
///    primitives require the map to have two elements: a source memory
///    object (input) and a destination memory object (output). When
///    executing on a GPU engine, both the source and destination memory
///    objects must use GPU memory.
187 | /// |
188 | /// 3. After the execution of the ReLU on GPU, the second Reorder moves |
189 | /// the results from GPU to CPU. |
190 | /// |
191 | /// @note |
192 | /// All primitives are executed in the SAME GPU stream (the first |
193 | /// parameter of the `execute()` method). |
194 | /// |
195 | /// Execution is asynchronous on GPU. This means that we need to call @ref |
196 | /// dnnl::stream::wait before accessing the results. |
197 | /// |
198 | /// @snippet cross_engine_reorder.cpp Execute primitives |
199 | // [Execute primitives] |
    // Move the source data from CPU to GPU memory by reorder
201 | r1.execute(stream_gpu, m_cpu, m_gpu); |
202 | // Execute ReLU on a GPU stream |
203 | relu.execute(stream_gpu, {{DNNL_ARG_SRC, m_gpu}, {DNNL_ARG_DST, m_gpu}}); |
204 | // Get result data from GPU to CPU |
205 | r2.execute(stream_gpu, m_gpu, m_cpu); |
206 | |
207 | stream_gpu.wait(); |
208 | // [Execute primitives] |
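
/// The positional `execute(stream, src, dst)` overload used for the
/// reorders above is shorthand for the generic map-based form; a sketch of
/// the equivalent call for the first reorder:
///
/// ~~~
/// r1.execute(stream_gpu, {{DNNL_ARG_FROM, m_cpu}, {DNNL_ARG_TO, m_gpu}});
/// ~~~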
209 | |
210 | /// @page cross_engine_reorder_cpp |
211 | /// @subsection cross_engine_reorder_cpp_sub6 Validate the result |
212 | /// |
/// Now that we have the computed result in CPU memory, let's validate
/// that it is actually correct.
215 | /// |
216 | /// @snippet cross_engine_reorder.cpp Check the results |
217 | // [Check the results] |
    if (find_negative(m_cpu, tz) != 0)
        throw std::logic_error(
                "Unexpected output, found a negative value after the ReLU "
                "execution.");
222 | // [Check the results] |
223 | } |
224 | |
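// handle_example_errors() (from example_utils.hpp) verifies that the
// required engine kinds are available and converts exceptions thrown by
// the tutorial into an exit status and a human-readable message.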
225 | int main(int argc, char **argv) { |
226 | return handle_example_errors({engine::kind::cpu, engine::kind::gpu}, |
227 | cross_engine_reorder_tutorial); |
228 | } |
229 | |
230 | /// @page cross_engine_reorder_cpp |
231 | /// |
232 | /// <b></b> |
233 | /// |
234 | /// Upon compiling and running the example, the output should be just: |
235 | /// |
236 | /// ~~~ |
237 | /// Example passed. |
238 | /// ~~~ |
239 | /// |
240 | |