1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/// @example gpu_opencl_interop.cpp
18/// @copybrief gpu_opencl_interop_cpp
19/// > Annotated version: @ref gpu_opencl_interop_cpp
20
21/// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
22/// This C++ API example demonstrates programming for Intel(R) Processor
23/// Graphics with OpenCL* extensions API in oneDNN.
24///
25/// > Example code: @ref gpu_opencl_interop.cpp
26///
27/// The workflow includes following steps:
28/// - Create a GPU engine. It uses OpenCL as the runtime in this sample.
29/// - Create a GPU memory descriptor/object.
30/// - Create an OpenCL kernel for GPU data initialization
31/// - Access a GPU memory via OpenCL interoperability interface
32/// - Access a GPU command queue via OpenCL interoperability interface
/// - Execute an OpenCL kernel with the related GPU command queue and GPU memory
/// - Create an operation descriptor/primitive descriptor/primitive.
35/// - Execute the primitive with the initialized GPU memory
36/// - Validate the result by mapping the OpenCL memory via OpenCL interoperability
37/// interface
38///
39
40/// @page gpu_opencl_interop_cpp
41/// @section gpu_opencl_interop_cpp_headers Public headers
42///
/// To start using oneDNN, we must first include the @ref dnnl.hpp
/// header file in the application. We also include CL/cl.h for using
/// OpenCL APIs, @ref dnnl_ocl.hpp for the OpenCL interoperability
/// interface, and example_utils.hpp, which provides common utilities
/// used by the oneDNN examples.
48/// All C++ API types and functions reside in the `dnnl` namespace.
49/// For simplicity of the example we import this namespace.
50/// @page gpu_opencl_interop_cpp
51/// @snippet gpu_opencl_interop.cpp Prologue
52// [Prologue]
53#include <iostream>
54#include <numeric>
55#include <stdexcept>
56
57#include <CL/cl.h>
58
59#include "oneapi/dnnl/dnnl.hpp"
60#include "oneapi/dnnl/dnnl_ocl.hpp"
61
62#include "example_utils.hpp"
63
64using namespace dnnl;
65using namespace std;
66// [Prologue]
67
// Checks the status returned by an OpenCL call: on any status other than
// CL_SUCCESS, prints the failing expression together with its status code and
// the file/line location, then terminates the example with exit code 1.
#define OCL_CHECK(x) \
    do { \
        cl_int s = (x); \
        if (s != CL_SUCCESS) { \
            std::cout << "[" << __FILE__ << ":" << __LINE__ << "] '" << #x \
                      << "' failed (status code: " << s << ")." << std::endl; \
            exit(1); \
        } \
    } while (0)
77
78cl_kernel create_init_opencl_kernel(
79 cl_context ocl_ctx, const char *kernel_name, const char *ocl_code) {
80 cl_int err;
81 const char *sources[] = {ocl_code};
82 cl_program ocl_program
83 = clCreateProgramWithSource(ocl_ctx, 1, sources, nullptr, &err);
84 OCL_CHECK(err);
85
86 OCL_CHECK(
87 clBuildProgram(ocl_program, 0, nullptr, nullptr, nullptr, nullptr));
88
89 cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err);
90 OCL_CHECK(err);
91
92 OCL_CHECK(clReleaseProgram(ocl_program));
93 return ocl_kernel;
94}
95
96/// @page gpu_opencl_interop_cpp
97/// @section gpu_opencl_interop_cpp_tutorial gpu_opencl_interop_tutorial() function
98///
void gpu_opencl_interop_tutorial() {
    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device to which they are
    /// attached, and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create engines, we must specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind. In this example we use
    /// the first available GPU engine, so the index for the engine is 0.
    /// This example assumes that OpenCL is the runtime for GPU. In that
    /// case, during engine creation, an OpenCL context is also created and
    /// attached to the created engine.
    ///
    /// @snippet gpu_opencl_interop.cpp Initialize engine
    // [Initialize engine]
    engine eng(engine::kind::gpu, 0);
    // [Initialize engine]

    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// In this example, a GPU stream is created.
    /// This example assumes that OpenCL is the runtime for GPU. During
    /// stream creation, an OpenCL command queue is also created and attached
    /// to this stream.
    ///
    /// @snippet gpu_opencl_interop.cpp Initialize stream
    // [Initialize stream]
    dnnl::stream strm(eng);
    // [Initialize stream]

    /// @subsection gpu_opencl_interop_cpp_sub2 Wrapping data into oneDNN memory object
    ///
    /// Next, we create a memory object. We need to specify dimensions of our
    /// memory by passing a memory::dims object. Then we create a memory
    /// descriptor with these dimensions, with the dnnl::memory::data_type::f32
    /// data type, and with the dnnl::memory::format_tag::nchw memory format.
    /// Finally, we construct a memory object and pass the memory descriptor.
    /// The library allocates memory internally.
    /// @snippet gpu_opencl_interop.cpp memory alloc
    // [memory alloc]
    memory::dims tz_dims = {2, 3, 4, 5};
    // Total number of elements; used both as the OpenCL global work size and
    // as the size of the host-side validation buffer.
    const size_t N = std::accumulate(tz_dims.begin(), tz_dims.end(), (size_t)1,
            std::multiplies<size_t>());

    memory::desc mem_d(
            tz_dims, memory::data_type::f32, memory::format_tag::nchw);

    memory mem(mem_d, eng);
    // [memory alloc]

    /// @subsection gpu_opencl_interop_cpp_sub3 Initialize the data by executing a custom OpenCL kernel
    /// We are going to create an OpenCL kernel that will initialize our data.
    /// It requires writing a bit of C code to create an OpenCL program from a
    /// string literal source. The kernel initializes the data by the
    /// 0, -1, 2, -3, ... sequence: `data[i] = (-1)^i * i`.
    /// @snippet gpu_opencl_interop.cpp ocl kernel
    // [ocl kernel]
    const char *ocl_code
            = "__kernel void init(__global float *data) {"
              "    int id = get_global_id(0);"
              "    data[id] = (id % 2) ? -id : id;"
              "}";
    // [ocl kernel]

    /// Create/Build the OpenCL kernel by the `create_init_opencl_kernel()`
    /// function. Refer to the full code example for the
    /// `create_init_opencl_kernel()` function.
    /// @snippet gpu_opencl_interop.cpp oclkernel create
    // [oclkernel create]
    const char *kernel_name = "init";
    cl_kernel ocl_init_kernel = create_init_opencl_kernel(
            ocl_interop::get_context(eng), kernel_name, ocl_code);
    // [oclkernel create]

    /// The next step is to execute our OpenCL kernel by setting its arguments
    /// and enqueueing to an OpenCL queue. You can extract the underlying OpenCL
    /// buffer from the memory object using the interoperability interface:
    /// dnnl::ocl_interop::get_mem_object() . For simplicity we can just construct a
    /// stream, extract the underlying OpenCL queue, and enqueue the kernel to
    /// this queue.
    /// @snippet gpu_opencl_interop.cpp oclexecution
    // [oclexecution]
    cl_mem ocl_buf = ocl_interop::get_mem_object(mem);
    OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf));

    cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm);
    OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N,
            nullptr, 0, nullptr, nullptr));
    // [oclexecution]

    /// @subsection gpu_opencl_interop_cpp_sub4 Create and execute a primitive
    /// There are three steps to create an operation primitive in oneDNN:
    /// 1. Create an operation descriptor.
    /// 2. Create a primitive descriptor.
    /// 3. Create a primitive.
    ///
    /// Let's create the primitive to perform the ReLU (rectified linear unit)
    /// operation: x = max(0, x). An operation descriptor has no dependency on a
    /// specific engine - it just describes some operation. On the contrary,
    /// primitive descriptors are attached to a specific engine and represent
    /// some implementation for this engine. A primitive object is a realization
    /// of a primitive descriptor, and its construction is usually much
    /// "heavier".
    /// @snippet gpu_opencl_interop.cpp relu creation
    // [relu creation]
    auto relu_pd = eltwise_forward::primitive_desc(eng, prop_kind::forward,
            algorithm::eltwise_relu, mem_d, mem_d, 0.0f);
    auto relu = eltwise_forward(relu_pd);
    // [relu creation]

    /// Next, execute the primitive.
    /// @snippet gpu_opencl_interop.cpp relu exec
    // [relu exec]
    relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
    strm.wait();
    // [relu exec]
    ///
    ///@note
    ///    The memory object `mem` serves as both the input and the output
    ///    of the primitive.
    ///
    ///
    ///@note
    ///    Primitive submission on GPU is asynchronous; however, the user can
    ///    call dnnl::stream::wait() to synchronize the stream and ensure that
    ///    all previously submitted primitives are completed.
    ///

    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub5 Validate the results
    ///
    /// Before running validation codes, we need to copy the OpenCL memory to
    /// the host. This can be done using OpenCL API. For convenience, we use a
    /// utility function read_from_dnnl_memory() implementing required OpenCL API
    /// calls. After we read the data to the host, we can run validation codes
    /// on the host accordingly.
    /// @snippet gpu_opencl_interop.cpp Check the results
    // [Check the results]
    std::vector<float> mem_data(N);
    read_from_dnnl_memory(mem_data.data(), mem);
    for (size_t i = 0; i < N; i++) {
        // The init kernel wrote (-1)^i * i; ReLU clamps negatives to zero,
        // so odd indices must be 0 and even indices must equal i.
        float expected = (i % 2) ? 0.0f : (float)i;
        if (mem_data[i] != expected) {
            std::cout << "Expect " << expected << " but got " << mem_data[i]
                      << "." << std::endl;
            throw std::logic_error("Accuracy check failed.");
        }
    }
    // [Check the results]

    OCL_CHECK(clReleaseKernel(ocl_init_kernel));
}
258
// Entry point: runs the tutorial, letting handle_example_errors() translate
// exceptions (e.g. no GPU engine available) into a process exit status and a
// user-friendly message.
int main(int argc, char **argv) {
    return handle_example_errors(
            {engine::kind::gpu}, gpu_opencl_interop_tutorial);
}
263
264/// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API
265///
266/// <b></b>
267///
268/// Upon compiling and running the example, the output should be just:
269///
270/// ~~~
271/// Example passed.
272/// ~~~
273///
274