1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example gpu_opencl_interop.cpp |
18 | /// @copybrief gpu_opencl_interop_cpp |
19 | /// > Annotated version: @ref gpu_opencl_interop_cpp |
20 | |
21 | /// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API |
22 | /// This C++ API example demonstrates programming for Intel(R) Processor |
23 | /// Graphics with OpenCL* extensions API in oneDNN. |
24 | /// |
25 | /// > Example code: @ref gpu_opencl_interop.cpp |
26 | /// |
27 | /// The workflow includes following steps: |
28 | /// - Create a GPU engine. It uses OpenCL as the runtime in this sample. |
29 | /// - Create a GPU memory descriptor/object. |
30 | /// - Create an OpenCL kernel for GPU data initialization |
31 | /// - Access a GPU memory via OpenCL interoperability interface |
32 | /// - Access a GPU command queue via OpenCL interoperability interface |
/// - Execute an OpenCL kernel with the related GPU command queue and GPU memory
/// - Create an operation descriptor/operation primitive descriptor/primitive.
35 | /// - Execute the primitive with the initialized GPU memory |
36 | /// - Validate the result by mapping the OpenCL memory via OpenCL interoperability |
37 | /// interface |
38 | /// |
39 | |
40 | /// @page gpu_opencl_interop_cpp |
41 | /// @section gpu_opencl_interop_cpp_headers Public headers |
42 | /// |
43 | /// To start using oneDNN, we must first include the @ref dnnl.hpp |
44 | /// header file in the application. We also include CL/cl.h for using |
45 | /// OpenCL APIs and @ref dnnl_debug.h, which contains some debugging |
46 | /// facilities such as returning a string representation |
47 | /// for common oneDNN C types. |
48 | /// All C++ API types and functions reside in the `dnnl` namespace. |
49 | /// For simplicity of the example we import this namespace. |
50 | /// @page gpu_opencl_interop_cpp |
51 | /// @snippet gpu_opencl_interop.cpp Prologue |
52 | // [Prologue] |
#include <iostream>
#include <numeric>
#include <stdexcept>
#include <string>

#include <CL/cl.h>

#include "oneapi/dnnl/dnnl.hpp"
#include "oneapi/dnnl/dnnl_ocl.hpp"

#include "example_utils.hpp"
63 | |
64 | using namespace dnnl; |
65 | using namespace std; |
66 | // [Prologue] |
67 | |
// Checks the status code returned by an OpenCL API call. On failure, prints
// the failing expression, its location, and the numeric status code, then
// terminates the process. Intended for example code only; production code
// should propagate errors instead of calling exit().
#define OCL_CHECK(x) \
    do { \
        cl_int s = (x); \
        if (s != CL_SUCCESS) { \
            std::cout << "[" << __FILE__ << ":" << __LINE__ << "] '" << #x \
                      << "' failed (status code: " << s << ")." << std::endl; \
            exit(1); \
        } \
    } while (0)
77 | |
78 | cl_kernel create_init_opencl_kernel( |
79 | cl_context ocl_ctx, const char *kernel_name, const char *ocl_code) { |
80 | cl_int err; |
81 | const char *sources[] = {ocl_code}; |
82 | cl_program ocl_program |
83 | = clCreateProgramWithSource(ocl_ctx, 1, sources, nullptr, &err); |
84 | OCL_CHECK(err); |
85 | |
86 | OCL_CHECK( |
87 | clBuildProgram(ocl_program, 0, nullptr, nullptr, nullptr, nullptr)); |
88 | |
89 | cl_kernel ocl_kernel = clCreateKernel(ocl_program, kernel_name, &err); |
90 | OCL_CHECK(err); |
91 | |
92 | OCL_CHECK(clReleaseProgram(ocl_program)); |
93 | return ocl_kernel; |
94 | } |
95 | |
96 | /// @page gpu_opencl_interop_cpp |
97 | /// @section gpu_opencl_interop_cpp_tutorial gpu_opencl_interop_tutorial() function |
98 | /// |
// Demonstrates the full OpenCL interop workflow: initialize GPU memory with a
// custom OpenCL kernel, run a oneDNN ReLU primitive on it, and validate.
void gpu_opencl_interop_tutorial() {
    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub1 Engine and stream
    ///
    /// All oneDNN primitives and memory objects are attached to a
    /// particular @ref dnnl::engine, which is an abstraction of a
    /// computational device (see also @ref dev_guide_basic_concepts). The
    /// primitives are created and optimized for the device to which they are
    /// attached, and the memory objects refer to memory residing on the
    /// corresponding device. In particular, that means neither memory objects
    /// nor primitives that were created for one engine can be used on
    /// another.
    ///
    /// To create engines, we must specify the @ref dnnl::engine::kind
    /// and the index of the device of the given kind. In this example we use
    /// the first available GPU engine, so the index for the engine is 0.
    /// This example assumes OpenCL is the runtime for GPU. In that case,
    /// during engine creation, an OpenCL context is also created and attached
    /// to the created engine.
    ///
    /// @snippet gpu_opencl_interop.cpp Initialize engine
    // [Initialize engine]
    engine eng(engine::kind::gpu, 0);
    // [Initialize engine]

    /// In addition to an engine, all primitives require a @ref dnnl::stream
    /// for the execution. The stream encapsulates an execution context and is
    /// tied to a particular engine.
    ///
    /// In this example, a GPU stream is created.
    /// This example assumes OpenCL is the runtime for GPU. During stream
    /// creation, an OpenCL command queue is also created and attached to this
    /// stream.
    ///
    /// @snippet gpu_opencl_interop.cpp Initialize stream
    // [Initialize stream]
    dnnl::stream strm(eng);
    // [Initialize stream]

    /// @subsection gpu_opencl_interop_cpp_sub2 Wrapping data into oneDNN memory object
    ///
    /// Next, we create a memory object. We need to specify dimensions of our
    /// memory by passing a memory::dims object. Then we create a memory
    /// descriptor with these dimensions, with the dnnl::memory::data_type::f32
    /// data type, and with the dnnl::memory::format_tag::nchw memory format.
    /// Finally, we construct a memory object and pass the memory descriptor.
    /// The library allocates memory internally.
    /// @snippet gpu_opencl_interop.cpp memory alloc
    // [memory alloc]
    memory::dims tz_dims = {2, 3, 4, 5};
    // N is the total number of elements (product of all dimensions).
    const size_t N = std::accumulate(tz_dims.begin(), tz_dims.end(), (size_t)1,
            std::multiplies<size_t>());

    memory::desc mem_d(
            tz_dims, memory::data_type::f32, memory::format_tag::nchw);

    memory mem(mem_d, eng);
    // [memory alloc]

    /// @subsection gpu_opencl_interop_cpp_sub3 Initialize the data by executing a custom OpenCL kernel
    /// We are going to create an OpenCL kernel that will initialize our data.
    /// It requires writing a bit of C code to create an OpenCL program from a
    /// string literal source. The kernel initializes the data by the
    /// 0, -1, 2, -3, ... sequence: `data[i] = (-1)^i * i`.
    /// @snippet gpu_opencl_interop.cpp ocl kernel
    // [ocl kernel]
    const char *ocl_code
            = "__kernel void init(__global float *data) {"
              "    int id = get_global_id(0);"
              "    data[id] = (id % 2) ? -id : id;"
              "}";
    // [ocl kernel]

    /// Create/Build the OpenCL kernel by the `create_init_opencl_kernel()`
    /// function. Refer to the full code example for the
    /// `create_init_opencl_kernel()` function.
    /// @snippet gpu_opencl_interop.cpp oclkernel create
    // [oclkernel create]
    const char *kernel_name = "init";
    cl_kernel ocl_init_kernel = create_init_opencl_kernel(
            ocl_interop::get_context(eng), kernel_name, ocl_code);
    // [oclkernel create]

    /// The next step is to execute our OpenCL kernel by setting its arguments
    /// and enqueueing to an OpenCL queue. You can extract the underlying
    /// OpenCL buffer from the memory object using the interoperability
    /// interface: dnnl::ocl_interop::get_mem_object(). For simplicity we can
    /// just construct a stream, extract the underlying OpenCL queue, and
    /// enqueue the kernel to this queue.
    /// @snippet gpu_opencl_interop.cpp oclexecution
    // [oclexecution]
    cl_mem ocl_buf = ocl_interop::get_mem_object(mem);
    OCL_CHECK(clSetKernelArg(ocl_init_kernel, 0, sizeof(ocl_buf), &ocl_buf));

    cl_command_queue ocl_queue = ocl_interop::get_command_queue(strm);
    // One work-item per element; local work size is left to the runtime.
    OCL_CHECK(clEnqueueNDRangeKernel(ocl_queue, ocl_init_kernel, 1, nullptr, &N,
            nullptr, 0, nullptr, nullptr));
    // [oclexecution]

    /// @subsection gpu_opencl_interop_cpp_sub4 Create and execute a primitive
    /// There are three steps to create an operation primitive in oneDNN:
    /// 1. Create an operation descriptor.
    /// 2. Create a primitive descriptor.
    /// 3. Create a primitive.
    ///
    /// Let's create the primitive to perform the ReLU (rectified linear unit)
    /// operation: x = max(0, x). An operation descriptor has no dependency on a
    /// specific engine - it just describes some operation. On the contrary,
    /// primitive descriptors are attached to a specific engine and represent
    /// some implementation for this engine. A primitive object is a realization
    /// of a primitive descriptor, and its construction is usually much
    /// "heavier".
    /// @snippet gpu_opencl_interop.cpp relu creation
    // [relu creation]
    auto relu_pd = eltwise_forward::primitive_desc(eng, prop_kind::forward,
            algorithm::eltwise_relu, mem_d, mem_d, 0.0f);
    auto relu = eltwise_forward(relu_pd);
    // [relu creation]

    /// Next, execute the primitive.
    /// @snippet gpu_opencl_interop.cpp relu exec
    // [relu exec]
    relu.execute(strm, {{DNNL_ARG_SRC, mem}, {DNNL_ARG_DST, mem}});
    strm.wait();
    // [relu exec]
    ///
    ///@note
    ///    Our memory object `mem` serves as both the input and the output
    ///    parameter, so the ReLU is applied in place.
    ///
    ///
    ///@note
    ///    Primitive submission on GPU is asynchronous; however, the user can
    ///    call dnnl::stream::wait() to synchronize the stream and ensure that
    ///    all previously submitted primitives are completed.
    ///

    /// @page gpu_opencl_interop_cpp
    /// @subsection gpu_opencl_interop_cpp_sub5 Validate the results
    ///
    /// Before running validation codes, we need to copy the OpenCL memory to
    /// the host. This can be done using the OpenCL API. For convenience, we
    /// use a utility function read_from_dnnl_memory() implementing the
    /// required OpenCL API calls. After we read the data to the host, we can
    /// run validation codes on the host accordingly.
    /// @snippet gpu_opencl_interop.cpp Check the results
    // [Check the results]
    std::vector<float> mem_data(N);
    read_from_dnnl_memory(mem_data.data(), mem);
    for (size_t i = 0; i < N; i++) {
        // The kernel wrote (-1)^i * i; after ReLU, odd indices (negative
        // values) become 0 and even indices keep their value.
        float expected = (i % 2) ? 0.0f : (float)i;
        if (mem_data[i] != expected) {
            std::cout << "Expect " << expected << " but got " << mem_data[i]
                      << "." << std::endl;
            throw std::logic_error("Accuracy check failed.");
        }
    }
    // [Check the results]

    // Release the kernel created via the raw OpenCL API; oneDNN objects
    // clean up after themselves via RAII.
    OCL_CHECK(clReleaseKernel(ocl_init_kernel));
}
258 | |
259 | int main(int argc, char **argv) { |
260 | return handle_example_errors( |
261 | {engine::kind::gpu}, gpu_opencl_interop_tutorial); |
262 | } |
263 | |
264 | /// @page gpu_opencl_interop_cpp Getting started on GPU with OpenCL extensions API |
265 | /// |
266 | /// <b></b> |
267 | /// |
268 | /// Upon compiling and running the example, the output should be just: |
269 | /// |
270 | /// ~~~ |
271 | /// Example passed. |
272 | /// ~~~ |
273 | /// |
274 | |