1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/// @example cross_engine_reorder.cpp
18/// @copybrief cross_engine_reorder_cpp
19/// > Annotated version: @ref cross_engine_reorder_cpp
20
21/// @page cross_engine_reorder_cpp Reorder between CPU and GPU engines
22/// This C++ API example demonstrates programming flow when reordering memory
23/// between CPU and GPU engines.
24///
25/// > Example code: @ref cross_engine_reorder.cpp
26///
27/// @section cross_engine_reorder_cpp_headers Public headers
28///
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. We also include example_utils.hpp, which
/// contains utility functions such as transferring data between user buffers
/// and oneDNN memory objects.
33///
34/// All C++ API types and functions reside in the `dnnl` namespace.
35/// For simplicity of the example we import this namespace.
36/// @page cross_engine_reorder_cpp
37
38#include <iostream>
39#include <stdexcept>
40#include <vector>
41
42/// @snippet cross_engine_reorder.cpp Prologue
43// [Prologue]
44#include "example_utils.hpp"
45#include "oneapi/dnnl/dnnl.hpp"
46
47#include "example_utils.hpp"
48
49using namespace dnnl;
50
51using namespace std;
52// [Prologue]
53
54void fill(memory &mem, const memory::dims &adims) {
55 std::vector<float> array(product(adims));
56 for (size_t e = 0; e < array.size(); ++e) {
57 array[e] = e % 7 ? 1.0f : -1.0f;
58 }
59 write_to_dnnl_memory(array.data(), mem);
60}
61
62int find_negative(memory &mem, const memory::dims &adims) {
63 int negs = 0;
64 size_t nelems = product(adims);
65 std::vector<float> array(nelems);
66 read_from_dnnl_memory(array.data(), mem);
67
68 for (size_t e = 0; e < nelems; ++e)
69 negs += array[e] < 0.0f;
70 return negs;
71}
72
73/// @page cross_engine_reorder_cpp
74/// @section cross_engine_reorder_cpp_tutorial cross_engine_reorder_tutorial() function
75///
void cross_engine_reorder_tutorial() {
    /// @page cross_engine_reorder_cpp
    /// @subsection cross_engine_reorder_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device they are attached
    /// to, and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create engines, we must specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind. There is only one CPU
    /// engine and one GPU engine, so the index for both engines must be 0.
    ///
    /// @snippet cross_engine_reorder.cpp Initialize engine
    // [Initialize engine]
    auto cpu_engine = engine(validate_engine_kind(engine::kind::cpu), 0);
    auto gpu_engine = engine(validate_engine_kind(engine::kind::gpu), 0);
    // [Initialize engine]

    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// In this example, a GPU stream is created.
    ///
    /// @snippet cross_engine_reorder.cpp Initialize stream
    // [Initialize stream]
    auto stream_gpu = stream(gpu_engine, stream::flags::in_order);
    // [Initialize stream]

    /// @subsection cross_engine_reorder_cpp_sub2 Wrapping data into oneDNN GPU memory object
    /// Fill the data in CPU memory first, and then move data from CPU to GPU
    /// memory by reorder.
    /// @snippet cross_engine_reorder.cpp reorder cpu2gpu
    // [reorder cpu2gpu]
    const auto tz = memory::dims {2, 16, 1, 1};
    auto m_cpu
            = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw},
                    cpu_engine);
    auto m_gpu
            = memory({{tz}, memory::data_type::f32, memory::format_tag::nchw},
                    gpu_engine);
    fill(m_cpu, tz);
    // The reorder primitive takes both engines from the memory objects
    // themselves: here source is CPU memory and destination is GPU memory.
    auto r1 = reorder(m_cpu, m_gpu);
    // [reorder cpu2gpu]

    /// @subsection cross_engine_reorder_cpp_sub3 Creating a ReLU primitive
    ///
    /// Let's now create a ReLU primitive for GPU.
    ///
    /// The library implements the ReLU primitive as a particular algorithm of a
    /// more general @ref dev_guide_eltwise primitive, which applies a specified
    /// function to each element of the source tensor.
    ///
    /// Just as in the case of @ref dnnl::memory, a user should always go
    /// through (at least) two creation steps (which, however, can sometimes
    /// be combined thanks to C++11):
    /// 1. Create an operation primitive descriptor (here @ref
    ///    dnnl::eltwise_forward::primitive_desc) that defines the operation
    ///    parameters including a GPU memory descriptor, and GPU engine.
    ///    Primitive descriptor is a **lightweight** descriptor of the actual
    ///    algorithm that **implements** the given operation.
    /// 2. Create a primitive (here @ref dnnl::eltwise_forward) that can be
    ///    executed on GPU memory objects to compute the operation by a GPU
    ///    engine.
    ///
    /// @note
    ///    Primitive creation might be a very expensive operation, so consider
    ///    creating primitive objects once and executing them multiple times.
    ///
    /// The code:
    /// @snippet cross_engine_reorder.cpp Create a ReLU primitive
    // [Create a ReLU primitive]
    // ReLU primitive descriptor, which corresponds to a particular
    // implementation in the library. Specify engine type for the ReLU
    // primitive. Use a GPU engine here.
    auto relu_pd = eltwise_forward::primitive_desc(gpu_engine,
            prop_kind::forward, algorithm::eltwise_relu, m_gpu.get_desc(),
            m_gpu.get_desc(), 0.0f);
    // ReLU primitive
    auto relu = eltwise_forward(relu_pd);
    // [Create a ReLU primitive]

    /// @subsection cross_engine_reorder_cpp_sub4 Getting results from a oneDNN GPU memory object
    /// After the ReLU operation, users need to get data from GPU to CPU memory
    /// by reorder.
    /// @snippet cross_engine_reorder.cpp reorder gpu2cpu
    // [reorder gpu2cpu]
    // Same memory objects as r1, but in the opposite direction: GPU to CPU.
    auto r2 = reorder(m_gpu, m_cpu);
    // [reorder gpu2cpu]

    /// @subsection cross_engine_reorder_cpp_sub5 Executing all primitives
    ///
    /// Finally, let's execute all primitives and wait for their completion
    /// via the following sequence:
    ///
    /// Reorder(CPU,GPU) -> ReLU -> Reorder(GPU,CPU).
    ///
    /// 1. After execution of the first Reorder, ReLU has source data in GPU.
    ///
    /// 2. The input and output memory objects are passed to the ReLU
    ///    `execute()` method using a <tag, memory> map. Each tag specifies what
    ///    kind of tensor each memory object represents. All @ref dev_guide_eltwise
    ///    primitives require the map to have two elements: a source memory
    ///    object (input) and a destination memory (output). For executing
    ///    on GPU engine, both source and destination memory object must use
    ///    GPU memory.
    ///
    /// 3. After the execution of the ReLU on GPU, the second Reorder moves
    ///    the results from GPU to CPU.
    ///
    /// @note
    ///    All primitives are executed in the SAME GPU stream (the first
    ///    parameter of the `execute()` method).
    ///
    /// Execution is asynchronous on GPU. This means that we need to call @ref
    /// dnnl::stream::wait before accessing the results.
    ///
    /// @snippet cross_engine_reorder.cpp Execute primitives
    // [Execute primitives]
    // wrap source data from CPU to GPU
    r1.execute(stream_gpu, m_cpu, m_gpu);
    // Execute ReLU on a GPU stream
    relu.execute(stream_gpu, {{DNNL_ARG_SRC, m_gpu}, {DNNL_ARG_DST, m_gpu}});
    // Get result data from GPU to CPU
    r2.execute(stream_gpu, m_gpu, m_cpu);

    stream_gpu.wait();
    // [Execute primitives]

    /// @page cross_engine_reorder_cpp
    /// @subsection cross_engine_reorder_cpp_sub6 Validate the result
    ///
    /// Now that we have computed the result in CPU memory, let's validate
    /// that it is actually correct.
    ///
    /// @snippet cross_engine_reorder.cpp Check the results
    // [Check the results]
    if (find_negative(m_cpu, tz) != 0)
        throw std::logic_error(
                "Unexpected output, find a negative value after the ReLU "
                "execution.");
    // [Check the results]
}
224
225int main(int argc, char **argv) {
226 return handle_example_errors({engine::kind::cpu, engine::kind::gpu},
227 cross_engine_reorder_tutorial);
228}
229
230/// @page cross_engine_reorder_cpp
231///
232/// <b></b>
233///
234/// Upon compiling and running the example, the output should be just:
235///
236/// ~~~
237/// Example passed.
238/// ~~~
239///
240