/*******************************************************************************
* Copyright 2020-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example convolution.cpp
/// > Annotated version: @ref convolution_example_cpp
///
/// @page convolution_example_cpp_short
///
/// This C++ API example demonstrates how to create and execute a
/// [Convolution](@ref dev_guide_convolution) primitive in forward propagation
/// mode in two configurations - with and without groups.
///
/// Key optimizations included in this example:
/// - Creation of optimized memory format from the primitive descriptor;
/// - Primitive attributes with fused post-ops.
///
/// @page convolution_example_cpp Convolution Primitive Example
/// @copydetails convolution_example_cpp_short
///
/// @include convolution.cpp

#include <algorithm>
#include <cmath>
#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include "example_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"

using namespace dnnl;

using tag = memory::format_tag;
using dt = memory::data_type;

void convolution_example(dnnl::engine::kind engine_kind) {

    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Tensor dimensions.
    const memory::dim N = 3, // batch size
            IC = 32, // input channels
            IH = 13, // input height
            IW = 13, // input width
            OC = 64, // output channels
            KH = 3, // weights height
            KW = 3, // weights width
            PH_L = 1, // height padding: left
            PH_R = 1, // height padding: right
            PW_L = 1, // width padding: left
            PW_R = 1, // width padding: right
            SH = 4, // height-wise stride
            SW = 4, // width-wise stride
            OH = (IH - KH + PH_L + PH_R) / SH + 1, // output height
            OW = (IW - KW + PW_L + PW_R) / SW + 1; // output width
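
    // With the values above (IH = IW = 13, KH = KW = 3, padding of 1 on each
    // side, stride 4), the output spatial size works out to
    // OH = OW = (13 - 3 + 1 + 1) / 4 + 1 = 4.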

    // Source (src), weights, bias, and destination (dst) tensor dimensions.
    memory::dims src_dims = {N, IC, IH, IW};
    memory::dims weights_dims = {OC, IC, KH, KW};
    memory::dims bias_dims = {OC};
    memory::dims dst_dims = {N, OC, OH, OW};

    // Strides, padding dimensions.
    memory::dims strides_dims = {SH, SW};
    memory::dims padding_dims_l = {PH_L, PW_L};
    memory::dims padding_dims_r = {PH_R, PW_R};

    // Allocate buffers.
    std::vector<float> src_data(product(src_dims));
    std::vector<float> weights_data(product(weights_dims));
    std::vector<float> bias_data(OC);
    std::vector<float> dst_data(product(dst_dims));

    // Initialize src, weights, and bias tensors.
    std::generate(src_data.begin(), src_data.end(), []() {
        static int i = 0;
        return std::cos(i++ / 10.f);
    });
    std::generate(weights_data.begin(), weights_data.end(), []() {
        static int i = 0;
        return std::sin(i++ * 2.f);
    });
    std::generate(bias_data.begin(), bias_data.end(), []() {
        static int i = 0;
        return std::tanh(float(i++));
    });

    // Create memory objects for tensor data (src, weights, dst). In this
    // example, NCHW layout is assumed for src and dst, and OIHW for weights.
    auto user_src_mem = memory({src_dims, dt::f32, tag::nchw}, engine);
    auto user_weights_mem = memory({weights_dims, dt::f32, tag::oihw}, engine);
    auto user_dst_mem = memory({dst_dims, dt::f32, tag::nchw}, engine);

    // Create memory descriptors with format_tag::any for the primitive. This
    // enables the convolution primitive to choose memory layouts for an
    // optimized primitive implementation, and these layouts may differ from
    // the ones provided by the user.
    auto conv_src_md = memory::desc(src_dims, dt::f32, tag::any);
    auto conv_weights_md = memory::desc(weights_dims, dt::f32, tag::any);
    auto conv_dst_md = memory::desc(dst_dims, dt::f32, tag::any);
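
    // Note that descriptors created with format_tag::any are placeholders: the
    // concrete layouts are fixed only when the primitive descriptor is created
    // below, which is why no memory objects are allocated from them directly.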

    // Create memory descriptor and memory object for input bias.
    auto user_bias_md = memory::desc(bias_dims, dt::f32, tag::a);
    auto user_bias_mem = memory(user_bias_md, engine);

    // Write data to the memory objects' handles.
    write_to_dnnl_memory(src_data.data(), user_src_mem);
    write_to_dnnl_memory(weights_data.data(), user_weights_mem);
    write_to_dnnl_memory(bias_data.data(), user_bias_mem);

    // Create primitive post-ops (ReLU).
    const float alpha = 0.f;
    const float beta = 0.f;
    post_ops conv_ops;
    conv_ops.append_eltwise(algorithm::eltwise_relu, alpha, beta);
    primitive_attr conv_attr;
    conv_attr.set_post_ops(conv_ops);
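
    // With alpha = 0, eltwise_relu computes f(x) = max(x, 0), so the fused
    // primitive produces dst = ReLU(conv(src, weights) + bias) in a single
    // pass over the data, avoiding an extra memory round-trip for the
    // activation.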

    // Create primitive descriptor.
    auto conv_pd = convolution_forward::primitive_desc(engine,
            prop_kind::forward_training, algorithm::convolution_direct,
            conv_src_md, conv_weights_md, user_bias_md, conv_dst_md,
            strides_dims, padding_dims_l, padding_dims_r, conv_attr);
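
    // Illustrative aside (a sketch, not required by the example): the name of
    // the implementation selected by the library can be queried from the
    // primitive descriptor, e.g. to check whether an optimized (blocked
    // layout) kernel was picked for this shape and engine.
    std::cout << "Convolution implementation: " << conv_pd.impl_info_str()
              << std::endl;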

    // For now, assume that the src, weights, and dst memory layouts generated
    // by the primitive and the ones provided by the user are identical.
    auto conv_src_mem = user_src_mem;
    auto conv_weights_mem = user_weights_mem;
    auto conv_dst_mem = user_dst_mem;

    // Reorder the data in case the src and weights memory layouts generated by
    // the primitive and the ones provided by the user are different. In this
    // case, we create additional memory objects with internal buffers that
    // will contain the reordered data. The data in dst will be reordered after
    // the convolution computation has finished.
    if (conv_pd.src_desc() != user_src_mem.get_desc()) {
        conv_src_mem = memory(conv_pd.src_desc(), engine);
        reorder(user_src_mem, conv_src_mem)
                .execute(engine_stream, user_src_mem, conv_src_mem);
    }

    if (conv_pd.weights_desc() != user_weights_mem.get_desc()) {
        conv_weights_mem = memory(conv_pd.weights_desc(), engine);
        reorder(user_weights_mem, conv_weights_mem)
                .execute(engine_stream, user_weights_mem, conv_weights_mem);
    }

    if (conv_pd.dst_desc() != user_dst_mem.get_desc()) {
        conv_dst_mem = memory(conv_pd.dst_desc(), engine);
    }

    // Create the primitive.
    auto conv_prim = convolution_forward(conv_pd);

    // Primitive arguments.
    std::unordered_map<int, memory> conv_args;
    conv_args.insert({DNNL_ARG_SRC, conv_src_mem});
    conv_args.insert({DNNL_ARG_WEIGHTS, conv_weights_mem});
    conv_args.insert({DNNL_ARG_BIAS, user_bias_mem});
    conv_args.insert({DNNL_ARG_DST, conv_dst_mem});
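
    // Each DNNL_ARG_* key tells the primitive which role the corresponding
    // memory object plays at execution time. The bias is passed as-is in the
    // user's plain layout because the primitive descriptor was created with
    // the user-provided bias memory descriptor, so no bias reorder is needed.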

    // Primitive execution: convolution with ReLU.
    conv_prim.execute(engine_stream, conv_args);

    // Reorder the data in case the dst memory descriptor generated by the
    // primitive and the one provided by the user are different.
    if (conv_pd.dst_desc() != user_dst_mem.get_desc()) {
        reorder(conv_dst_mem, user_dst_mem)
                .execute(engine_stream, conv_dst_mem, user_dst_mem);
    } else
        user_dst_mem = conv_dst_mem;

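    // Note that stream execution may be asynchronous (in particular on GPU
    // engines), so the results must not be read back before the stream has
    // been waited on.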
    // Wait for the computation to finish.
    engine_stream.wait();

    // Read data from memory object's handle.
    read_from_dnnl_memory(dst_data.data(), user_dst_mem);
}

void depthwise_convolution_example(dnnl::engine::kind engine_kind) {

    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Tensor dimensions.
    const memory::dim N = 3, // batch size
            G = 32, // channel groups
            IC = 32, // input channels
            IH = 13, // input height
            IW = 13, // input width
            OC = 32, // output channels
            KH = 3, // weights height
            KW = 3, // weights width
            PH_L = 1, // height padding: left
            PH_R = 1, // height padding: right
            PW_L = 1, // width padding: left
            PW_R = 1, // width padding: right
            SH = 4, // height-wise stride
            SW = 4, // width-wise stride
            OH = (IH - KH + PH_L + PH_R) / SH + 1, // output height
            OW = (IW - KW + PW_L + PW_R) / SW + 1; // output width

    // Source (src), weights, bias, and destination (dst) tensor dimensions.
    memory::dims src_dims = {N, IC, IH, IW};
    memory::dims weights_dims = {G, OC / G, IC / G, KH, KW};
    memory::dims bias_dims = {OC};
    memory::dims dst_dims = {N, OC, OH, OW};
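
    // With G = IC = OC = 32, the weights shape is {32, 1, 1, 3, 3}: each group
    // convolves a single input channel into a single output channel, i.e. a
    // depthwise convolution.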

    // Strides, padding dimensions.
    memory::dims strides_dims = {SH, SW};
    memory::dims padding_dims_l = {PH_L, PW_L};
    memory::dims padding_dims_r = {PH_R, PW_R};

    // Allocate buffers.
    std::vector<float> src_data(product(src_dims));
    std::vector<float> weights_data(product(weights_dims));
    std::vector<float> bias_data(OC);
    std::vector<float> dst_data(product(dst_dims));

    // Initialize src, weights, and bias tensors.
    std::generate(src_data.begin(), src_data.end(), []() {
        static int i = 0;
        return std::cos(i++ / 10.f);
    });
    std::generate(weights_data.begin(), weights_data.end(), []() {
        static int i = 0;
        return std::sin(i++ * 2.f);
    });
    std::generate(bias_data.begin(), bias_data.end(), []() {
        static int i = 0;
        return std::tanh(float(i++));
    });

    // Create memory objects for tensor data (src, weights, dst). In this
    // example, NCHW layout is assumed for src and dst, and GOIHW for weights.
    auto user_src_mem = memory({src_dims, dt::f32, tag::nchw}, engine);
    auto user_weights_mem = memory({weights_dims, dt::f32, tag::goihw}, engine);
    auto user_dst_mem = memory({dst_dims, dt::f32, tag::nchw}, engine);

    // Create memory descriptors with format_tag::any for the primitive. This
    // enables the convolution primitive to choose memory layouts for an
    // optimized primitive implementation, and these layouts may differ from
    // the ones provided by the user.
    auto conv_src_md = memory::desc(src_dims, dt::f32, tag::any);
    auto conv_weights_md = memory::desc(weights_dims, dt::f32, tag::any);
    auto conv_dst_md = memory::desc(dst_dims, dt::f32, tag::any);

    // Create memory descriptor and memory object for input bias.
    auto user_bias_md = memory::desc(bias_dims, dt::f32, tag::a);
    auto user_bias_mem = memory(user_bias_md, engine);

    // Write data to the memory objects' handles.
    write_to_dnnl_memory(src_data.data(), user_src_mem);
    write_to_dnnl_memory(weights_data.data(), user_weights_mem);
    write_to_dnnl_memory(bias_data.data(), user_bias_mem);

    // Create primitive post-ops (ReLU).
    const float alpha = 0.f;
    const float beta = 0.f;
    post_ops conv_ops;
    conv_ops.append_eltwise(algorithm::eltwise_relu, alpha, beta);
    primitive_attr conv_attr;
    conv_attr.set_post_ops(conv_ops);

    // Create primitive descriptor.
    auto conv_pd = convolution_forward::primitive_desc(engine,
            prop_kind::forward_training, algorithm::convolution_direct,
            conv_src_md, conv_weights_md, user_bias_md, conv_dst_md,
            strides_dims, padding_dims_l, padding_dims_r, conv_attr);

    // For now, assume that the src, weights, and dst memory layouts generated
    // by the primitive and the ones provided by the user are identical.
    auto conv_src_mem = user_src_mem;
    auto conv_weights_mem = user_weights_mem;
    auto conv_dst_mem = user_dst_mem;

    // Reorder the data in case the src and weights memory layouts generated by
    // the primitive and the ones provided by the user are different. In this
    // case, we create additional memory objects with internal buffers that
    // will contain the reordered data. The data in dst will be reordered after
    // the convolution computation has finished.
    if (conv_pd.src_desc() != user_src_mem.get_desc()) {
        conv_src_mem = memory(conv_pd.src_desc(), engine);
        reorder(user_src_mem, conv_src_mem)
                .execute(engine_stream, user_src_mem, conv_src_mem);
    }

    if (conv_pd.weights_desc() != user_weights_mem.get_desc()) {
        conv_weights_mem = memory(conv_pd.weights_desc(), engine);
        reorder(user_weights_mem, conv_weights_mem)
                .execute(engine_stream, user_weights_mem, conv_weights_mem);
    }

    if (conv_pd.dst_desc() != user_dst_mem.get_desc()) {
        conv_dst_mem = memory(conv_pd.dst_desc(), engine);
    }

    // Create the primitive.
    auto conv_prim = convolution_forward(conv_pd);

    // Primitive arguments.
    std::unordered_map<int, memory> conv_args;
    conv_args.insert({DNNL_ARG_SRC, conv_src_mem});
    conv_args.insert({DNNL_ARG_WEIGHTS, conv_weights_mem});
    conv_args.insert({DNNL_ARG_BIAS, user_bias_mem});
    conv_args.insert({DNNL_ARG_DST, conv_dst_mem});

    // Primitive execution: convolution with ReLU.
    conv_prim.execute(engine_stream, conv_args);

    // Reorder the data in case the dst memory descriptor generated by the
    // primitive and the one provided by the user are different.
    if (conv_pd.dst_desc() != user_dst_mem.get_desc()) {
        reorder(conv_dst_mem, user_dst_mem)
                .execute(engine_stream, conv_dst_mem, user_dst_mem);
    } else
        user_dst_mem = conv_dst_mem;

    // Wait for the computation to finish.
    engine_stream.wait();

    // Read data from memory object's handle.
    read_from_dnnl_memory(dst_data.data(), user_dst_mem);
}

int main(int argc, char **argv) {
    auto exit_code = handle_example_errors(
            convolution_example, parse_engine_kind(argc, argv));
    if (exit_code != 0) return exit_code;

    return handle_example_errors(
            depthwise_convolution_example, parse_engine_kind(argc, argv));
}