/*******************************************************************************
* Copyright 2016-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example cnn_inference_f32.cpp
/// @copybrief cnn_inference_f32_cpp
/// > Annotated version: @ref cnn_inference_f32_cpp

/// @page cnn_inference_f32_cpp CNN f32 inference example
/// This C++ API example demonstrates how to build an AlexNet neural
/// network topology for forward-pass inference.
///
/// > Example code: @ref cnn_inference_f32.cpp
///
/// Some key takeaways include:
///
/// * How tensors are implemented and submitted to primitives.
/// * How primitives are created.
/// * How primitives are sequentially submitted to the network, with the
///   output of one primitive passed as the input to the next. This ordering
///   expresses the dependency between primitive input and output data.
/// * Specific 'inference-only' configurations.
/// * Limiting the number of reorders performed, because they are detrimental
///   to performance.
///
/// The example implements the AlexNet layers
/// as numbered primitives (for example, conv1, pool1, conv2).
#include <assert.h>

#include <chrono>
#include <vector>
#include <unordered_map>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;
void simple_net(engine::kind engine_kind, int times = 100) {
    using tag = memory::format_tag;
    using dt = memory::data_type;

    /// Initialize an engine and stream. The last parameter in the call represents
    /// the index of the engine.
    /// @snippet cnn_inference_f32.cpp Initialize engine and stream
    //[Initialize engine and stream]
    engine eng(engine_kind, 0);
    stream s(eng);
    //[Initialize engine and stream]
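
    // Illustrative aside (not part of the original example): the index
    // passed to the engine constructor can be validated first, since
    // engine::get_count() reports how many devices of a kind are available.
    // A minimal sketch:
    //
    //     if (engine::get_count(engine_kind) == 0)
    //         throw std::runtime_error("no devices of the requested kind");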

    /// Create a vector for the primitives and a vector to hold memory
    /// that will be used as arguments.
    /// @snippet cnn_inference_f32.cpp Create network
    //[Create network]
    std::vector<primitive> net;
    std::vector<std::unordered_map<int, memory>> net_args;
    //[Create network]

    const memory::dim batch = 1;

    // AlexNet: conv1
    // {batch, 3, 227, 227} (x) {96, 3, 11, 11} -> {batch, 96, 55, 55}
    // strides: {4, 4}
    memory::dims conv1_src_tz = {batch, 3, 227, 227};
    memory::dims conv1_weights_tz = {96, 3, 11, 11};
    memory::dims conv1_bias_tz = {96};
    memory::dims conv1_dst_tz = {batch, 96, 55, 55};
    memory::dims conv1_strides = {4, 4};
    memory::dims conv1_padding = {0, 0};

    /// Allocate buffers for input and output data, weights, and bias.
    /// @snippet cnn_inference_f32.cpp Allocate buffers
    //[Allocate buffers]
    std::vector<float> user_src(batch * 3 * 227 * 227);
    std::vector<float> user_dst(batch * 1000);
    std::vector<float> conv1_weights(product(conv1_weights_tz));
    std::vector<float> conv1_bias(product(conv1_bias_tz));
    //[Allocate buffers]
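
    // The vectors above are value-initialized to zero; a real application
    // would load trained parameters instead. For a quick smoke test one
    // could fill them with a synthetic pattern, e.g. (sketch; would need
    // <algorithm> and <cmath>):
    //
    //     std::generate(conv1_weights.begin(), conv1_weights.end(),
    //             [i = 0]() mutable { return std::sin(i++ * 2.f); });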

    /// Create memory that describes data layout in the buffers. This example uses
    /// tag::nchw (batch-channels-height-width) for input data and tag::oihw
    /// for weights.
    /// @snippet cnn_inference_f32.cpp Create user memory
    //[Create user memory]
    auto user_src_memory = memory({{conv1_src_tz}, dt::f32, tag::nchw}, eng);
    write_to_dnnl_memory(user_src.data(), user_src_memory);
    auto user_weights_memory
            = memory({{conv1_weights_tz}, dt::f32, tag::oihw}, eng);
    write_to_dnnl_memory(conv1_weights.data(), user_weights_memory);
    auto conv1_user_bias_memory
            = memory({{conv1_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(conv1_bias.data(), conv1_user_bias_memory);
    //[Create user memory]
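
    // write_to_dnnl_memory() is a helper from example_utils.hpp that works
    // for both CPU and GPU engines. On a CPU engine alone, a plain copy
    // would also do (sketch, assuming f32 data and a CPU engine):
    //
    //     std::memcpy(user_src_memory.get_data_handle(), user_src.data(),
    //             user_src_memory.get_desc().get_size());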

    /// Create memory descriptors with layout tag::any. The `any` format enables
    /// the convolution primitive to choose the data format that will result in
    /// best performance based on its input parameters (convolution kernel
    /// sizes, strides, padding, and so on). If the resulting format is different
    /// from `nchw`, the user data must be transformed to the format required for
    /// the convolution (as explained below).
    /// @snippet cnn_inference_f32.cpp Create convolution memory descriptors
    //[Create convolution memory descriptors]
    auto conv1_src_md = memory::desc({conv1_src_tz}, dt::f32, tag::any);
    auto conv1_bias_md = memory::desc({conv1_bias_tz}, dt::f32, tag::any);
    auto conv1_weights_md = memory::desc({conv1_weights_tz}, dt::f32, tag::any);
    auto conv1_dst_md = memory::desc({conv1_dst_tz}, dt::f32, tag::any);
    //[Create convolution memory descriptors]
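
    // For contrast, pinning the source to a concrete layout would force the
    // convolution to consume exactly that layout, possibly at a performance
    // cost (illustrative):
    //
    //     auto fixed_src_md = memory::desc({conv1_src_tz}, dt::f32, tag::nchw);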

    /// Create a convolution primitive descriptor by specifying engine,
    /// propagation kind, [convolution algorithm](@ref dev_guide_convolution),
    /// shapes of input, weights, bias, output, convolution strides, padding,
    /// and kind of padding.
    /// Propagation kind is set to prop_kind::forward_inference to optimize for
    /// inference execution and omit computations that are necessary only for
    /// backward propagation.
    /// Once created, the primitive descriptor carries concrete memory formats
    /// instead of the `any` placeholders.
    /// @snippet cnn_inference_f32.cpp Create convolution primitive descriptor
    //[Create convolution primitive descriptor]
    auto conv1_prim_desc = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv1_src_md, conv1_weights_md, conv1_bias_md, conv1_dst_md,
            conv1_strides, conv1_padding, conv1_padding);
    //[Create convolution primitive descriptor]
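
    // The formats the primitive chose can now be inspected, for example by
    // querying descriptor sizes (illustrative):
    //
    //     std::cout << "conv1 src bytes: "
    //               << conv1_prim_desc.src_desc().get_size() << std::endl;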

    /// Check whether the data and weights formats required by the convolution
    /// differ from the user formats. If they do, change the layout with a
    /// reorder primitive.
    /// @snippet cnn_inference_f32.cpp Reorder data and weights
    //[Reorder data and weights]
    auto conv1_src_memory = user_src_memory;
    if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) {
        conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng);
        net.push_back(reorder(user_src_memory, conv1_src_memory));
        net_args.push_back({{DNNL_ARG_FROM, user_src_memory},
                {DNNL_ARG_TO, conv1_src_memory}});
    }

    auto conv1_weights_memory = user_weights_memory;
    if (conv1_prim_desc.weights_desc() != user_weights_memory.get_desc()) {
        conv1_weights_memory = memory(conv1_prim_desc.weights_desc(), eng);
        reorder(user_weights_memory, conv1_weights_memory)
                .execute(s, user_weights_memory, conv1_weights_memory);
    }
    //[Reorder data and weights]
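
    // Note the asymmetry above: the weights reorder is executed once, here
    // at setup time, because the weights never change between inferences,
    // while the source reorder is appended to `net` so that it runs on
    // every forward pass.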

    /// Create a memory object for the output.
    /// @snippet cnn_inference_f32.cpp Create memory for output
    //[Create memory for output]
    auto conv1_dst_memory = memory(conv1_prim_desc.dst_desc(), eng);
    //[Create memory for output]

    /// Create a convolution primitive and add it to the net.
    /// @snippet cnn_inference_f32.cpp Create convolution primitive
    //[Create convolution primitive]
    net.push_back(convolution_forward(conv1_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv1_src_memory},
            {DNNL_ARG_WEIGHTS, conv1_weights_memory},
            {DNNL_ARG_BIAS, conv1_user_bias_memory},
            {DNNL_ARG_DST, conv1_dst_memory}});
    //[Create convolution primitive]

    // AlexNet: relu1
    // {batch, 96, 55, 55} -> {batch, 96, 55, 55}
    const float negative1_slope = 0.0f;

    /// Create the relu primitive. For better performance, keep the input data
    /// format for ReLU (as well as for other operation primitives until another
    /// convolution or inner product is encountered) the same as the one chosen
    /// for convolution. Also note that ReLU is done in-place by using conv1 memory.
    /// @snippet cnn_inference_f32.cpp Create relu primitive
    //[Create relu primitive]
    auto relu1_prim_desc
            = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::eltwise_relu, conv1_dst_memory.get_desc(),
                    conv1_dst_memory.get_desc(), negative1_slope);

    net.push_back(eltwise_forward(relu1_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv1_dst_memory},
            {DNNL_ARG_DST, conv1_dst_memory}});
    //[Create relu primitive]
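
    // An alternative not used in this example is to fuse the ReLU into the
    // convolution with post-ops, which removes a standalone primitive and a
    // full pass over the data. A minimal sketch, assuming the oneDNN v3.x
    // attribute API:
    //
    //     post_ops ops;
    //     ops.append_eltwise(algorithm::eltwise_relu, 0.f, 0.f);
    //     primitive_attr attr;
    //     attr.set_post_ops(ops);
    //     // ... then pass `attr` as the trailing argument of the
    //     // convolution_forward::primitive_desc constructor.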

    // AlexNet: lrn1
    // {batch, 96, 55, 55} -> {batch, 96, 55, 55}
    // local size: 5
    // alpha1: 0.0001
    // beta1: 0.75
    const memory::dim local1_size = 5;
    const float alpha1 = 0.0001f;
    const float beta1 = 0.75f;
    const float k1 = 1.0f;

    // create lrn primitive and add it to net
    auto lrn1_prim_desc = lrn_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::lrn_across_channels,
            conv1_dst_memory.get_desc(), conv1_dst_memory.get_desc(),
            local1_size, alpha1, beta1, k1);
    auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_desc(), eng);

    net.push_back(lrn_forward(lrn1_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv1_dst_memory},
            {DNNL_ARG_DST, lrn1_dst_memory}});
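
    // For reference, across-channel LRN normalizes each element roughly as
    //     dst = src / (k + alpha / local_size * sum(src_i^2))^beta,
    // with the sum taken over local1_size neighboring channels.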

    // AlexNet: pool1
    // {batch, 96, 55, 55} -> {batch, 96, 27, 27}
    // kernel: {3, 3}
    // strides: {2, 2}
    memory::dims pool1_dst_tz = {batch, 96, 27, 27};
    memory::dims pool1_kernel = {3, 3};
    memory::dims pool1_strides = {2, 2};
    memory::dims pool1_dilation = {0, 0};
    memory::dims pool1_padding = {0, 0};

    auto pool1_dst_md = memory::desc({pool1_dst_tz}, dt::f32, tag::any);

    /// For training execution, pooling requires a private workspace memory
    /// to perform the backward pass. However, pooling should not use 'workspace'
    /// for inference, because this is detrimental to performance.
    /// @snippet cnn_inference_f32.cpp Create pooling primitive
    ///
    /// The example continues to create more layers according
    /// to the AlexNet topology.
    //[Create pooling primitive]
    auto pool1_pd = pooling_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::pooling_max,
            lrn1_dst_memory.get_desc(), pool1_dst_md, pool1_strides,
            pool1_kernel, pool1_dilation, pool1_padding, pool1_padding);
    auto pool1_dst_memory = memory(pool1_pd.dst_desc(), eng);

    net.push_back(pooling_forward(pool1_pd));
    net_args.push_back({{DNNL_ARG_SRC, lrn1_dst_memory},
            {DNNL_ARG_DST, pool1_dst_memory}});
    //[Create pooling primitive]
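
    // In training mode (prop_kind::forward_training), max pooling would also
    // produce a workspace for the backward pass, and the arguments would
    // carry it as well (illustrative sketch):
    //
    //     auto pool1_ws_memory = memory(pool1_pd.workspace_desc(), eng);
    //     net_args.back().insert({DNNL_ARG_WORKSPACE, pool1_ws_memory});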

    // AlexNet: conv2
    // {batch, 96, 27, 27} (x) {2, 128, 48, 5, 5} -> {batch, 256, 27, 27}
    // strides: {1, 1}
    memory::dims conv2_src_tz = {batch, 96, 27, 27};
    memory::dims conv2_weights_tz = {2, 128, 48, 5, 5};
    memory::dims conv2_bias_tz = {256};
    memory::dims conv2_dst_tz = {batch, 256, 27, 27};
    memory::dims conv2_strides = {1, 1};
    memory::dims conv2_padding = {2, 2};
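
    // conv2 is a grouped convolution (a remnant of AlexNet's two-GPU
    // split): the leading weights dimension, 2, is the number of groups, so
    // each group maps 48 of the 96 input channels to 128 of the 256 output
    // channels; hence the tag::goihw user format below.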

    std::vector<float> conv2_weights(product(conv2_weights_tz));
    std::vector<float> conv2_bias(product(conv2_bias_tz));

    // create memory for user data
    auto conv2_user_weights_memory
            = memory({{conv2_weights_tz}, dt::f32, tag::goihw}, eng);
    write_to_dnnl_memory(conv2_weights.data(), conv2_user_weights_memory);
    auto conv2_user_bias_memory
            = memory({{conv2_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(conv2_bias.data(), conv2_user_bias_memory);

    // create memory descriptors for convolution data w/ no specified format
    auto conv2_src_md = memory::desc({conv2_src_tz}, dt::f32, tag::any);
    auto conv2_bias_md = memory::desc({conv2_bias_tz}, dt::f32, tag::any);
    auto conv2_weights_md = memory::desc({conv2_weights_tz}, dt::f32, tag::any);
    auto conv2_dst_md = memory::desc({conv2_dst_tz}, dt::f32, tag::any);

    // create a convolution
    auto conv2_prim_desc = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv2_src_md, conv2_weights_md, conv2_bias_md, conv2_dst_md,
            conv2_strides, conv2_padding, conv2_padding);

    auto conv2_src_memory = pool1_dst_memory;
    if (conv2_prim_desc.src_desc() != conv2_src_memory.get_desc()) {
        conv2_src_memory = memory(conv2_prim_desc.src_desc(), eng);
        net.push_back(reorder(pool1_dst_memory, conv2_src_memory));
        net_args.push_back({{DNNL_ARG_FROM, pool1_dst_memory},
                {DNNL_ARG_TO, conv2_src_memory}});
    }

    auto conv2_weights_memory = conv2_user_weights_memory;
    if (conv2_prim_desc.weights_desc()
            != conv2_user_weights_memory.get_desc()) {
        conv2_weights_memory = memory(conv2_prim_desc.weights_desc(), eng);
        reorder(conv2_user_weights_memory, conv2_weights_memory)
                .execute(s, conv2_user_weights_memory, conv2_weights_memory);
    }

    auto conv2_dst_memory = memory(conv2_prim_desc.dst_desc(), eng);

    // create convolution primitive and add it to net
    net.push_back(convolution_forward(conv2_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv2_src_memory},
            {DNNL_ARG_WEIGHTS, conv2_weights_memory},
            {DNNL_ARG_BIAS, conv2_user_bias_memory},
            {DNNL_ARG_DST, conv2_dst_memory}});

    // AlexNet: relu2
    // {batch, 256, 27, 27} -> {batch, 256, 27, 27}
    const float negative2_slope = 0.0f;

    // create relu primitive and add it to net
    auto relu2_prim_desc
            = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::eltwise_relu, conv2_dst_memory.get_desc(),
                    conv2_dst_memory.get_desc(), negative2_slope);

    net.push_back(eltwise_forward(relu2_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv2_dst_memory},
            {DNNL_ARG_DST, conv2_dst_memory}});

    // AlexNet: lrn2
    // {batch, 256, 27, 27} -> {batch, 256, 27, 27}
    // local size: 5
    // alpha2: 0.0001
    // beta2: 0.75
    const memory::dim local2_size = 5;
    const float alpha2 = 0.0001f;
    const float beta2 = 0.75f;
    const float k2 = 1.0f;

    // create lrn primitive and add it to net
    auto lrn2_prim_desc
            = lrn_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::lrn_across_channels, conv2_prim_desc.dst_desc(),
                    conv2_prim_desc.dst_desc(), local2_size, alpha2, beta2, k2);
    auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_desc(), eng);

    net.push_back(lrn_forward(lrn2_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv2_dst_memory},
            {DNNL_ARG_DST, lrn2_dst_memory}});

    // AlexNet: pool2
    // {batch, 256, 27, 27} -> {batch, 256, 13, 13}
    // kernel: {3, 3}
    // strides: {2, 2}
    memory::dims pool2_dst_tz = {batch, 256, 13, 13};
    memory::dims pool2_kernel = {3, 3};
    memory::dims pool2_strides = {2, 2};
    memory::dims pool2_dilation = {0, 0};
    memory::dims pool2_padding = {0, 0};

    auto pool2_dst_md = memory::desc({pool2_dst_tz}, dt::f32, tag::any);

    // create a pooling primitive descriptor
    auto pool2_pd = pooling_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::pooling_max,
            lrn2_dst_memory.get_desc(), pool2_dst_md, pool2_strides,
            pool2_kernel, pool2_dilation, pool2_padding, pool2_padding);
    auto pool2_dst_memory = memory(pool2_pd.dst_desc(), eng);

    // create pooling primitive and add it to net
    net.push_back(pooling_forward(pool2_pd));
    net_args.push_back({{DNNL_ARG_SRC, lrn2_dst_memory},
            {DNNL_ARG_DST, pool2_dst_memory}});

    // AlexNet: conv3
    // {batch, 256, 13, 13} (x) {384, 256, 3, 3} -> {batch, 384, 13, 13}
    // strides: {1, 1}
    memory::dims conv3_src_tz = {batch, 256, 13, 13};
    memory::dims conv3_weights_tz = {384, 256, 3, 3};
    memory::dims conv3_bias_tz = {384};
    memory::dims conv3_dst_tz = {batch, 384, 13, 13};
    memory::dims conv3_strides = {1, 1};
    memory::dims conv3_padding = {1, 1};

    std::vector<float> conv3_weights(product(conv3_weights_tz));
    std::vector<float> conv3_bias(product(conv3_bias_tz));

    // create memory for user data
    auto conv3_user_weights_memory
            = memory({{conv3_weights_tz}, dt::f32, tag::oihw}, eng);
    write_to_dnnl_memory(conv3_weights.data(), conv3_user_weights_memory);
    auto conv3_user_bias_memory
            = memory({{conv3_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(conv3_bias.data(), conv3_user_bias_memory);

    // create memory descriptors for convolution data w/ no specified format
    auto conv3_src_md = memory::desc({conv3_src_tz}, dt::f32, tag::any);
    auto conv3_bias_md = memory::desc({conv3_bias_tz}, dt::f32, tag::any);
    auto conv3_weights_md = memory::desc({conv3_weights_tz}, dt::f32, tag::any);
    auto conv3_dst_md = memory::desc({conv3_dst_tz}, dt::f32, tag::any);

    // create a convolution
    auto conv3_prim_desc = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv3_src_md, conv3_weights_md, conv3_bias_md, conv3_dst_md,
            conv3_strides, conv3_padding, conv3_padding);

    auto conv3_src_memory = pool2_dst_memory;
    if (conv3_prim_desc.src_desc() != conv3_src_memory.get_desc()) {
        conv3_src_memory = memory(conv3_prim_desc.src_desc(), eng);
        net.push_back(reorder(pool2_dst_memory, conv3_src_memory));
        net_args.push_back({{DNNL_ARG_FROM, pool2_dst_memory},
                {DNNL_ARG_TO, conv3_src_memory}});
    }

    auto conv3_weights_memory = conv3_user_weights_memory;
    if (conv3_prim_desc.weights_desc()
            != conv3_user_weights_memory.get_desc()) {
        conv3_weights_memory = memory(conv3_prim_desc.weights_desc(), eng);
        reorder(conv3_user_weights_memory, conv3_weights_memory)
                .execute(s, conv3_user_weights_memory, conv3_weights_memory);
    }

    auto conv3_dst_memory = memory(conv3_prim_desc.dst_desc(), eng);

    // create convolution primitive and add it to net
    net.push_back(convolution_forward(conv3_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv3_src_memory},
            {DNNL_ARG_WEIGHTS, conv3_weights_memory},
            {DNNL_ARG_BIAS, conv3_user_bias_memory},
            {DNNL_ARG_DST, conv3_dst_memory}});

    // AlexNet: relu3
    // {batch, 384, 13, 13} -> {batch, 384, 13, 13}
    const float negative3_slope = 0.0f;

    // create relu primitive and add it to net
    auto relu3_prim_desc
            = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::eltwise_relu, conv3_dst_memory.get_desc(),
                    conv3_dst_memory.get_desc(), negative3_slope);

    net.push_back(eltwise_forward(relu3_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv3_dst_memory},
            {DNNL_ARG_DST, conv3_dst_memory}});

    // AlexNet: conv4
    // {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3} -> {batch, 384, 13, 13}
    // strides: {1, 1}
    memory::dims conv4_src_tz = {batch, 384, 13, 13};
    memory::dims conv4_weights_tz = {2, 192, 192, 3, 3};
    memory::dims conv4_bias_tz = {384};
    memory::dims conv4_dst_tz = {batch, 384, 13, 13};
    memory::dims conv4_strides = {1, 1};
    memory::dims conv4_padding = {1, 1};

    std::vector<float> conv4_weights(product(conv4_weights_tz));
    std::vector<float> conv4_bias(product(conv4_bias_tz));

    // create memory for user data
    auto conv4_user_weights_memory
            = memory({{conv4_weights_tz}, dt::f32, tag::goihw}, eng);
    write_to_dnnl_memory(conv4_weights.data(), conv4_user_weights_memory);
    auto conv4_user_bias_memory
            = memory({{conv4_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(conv4_bias.data(), conv4_user_bias_memory);

    // create memory descriptors for convolution data w/ no specified format
    auto conv4_src_md = memory::desc({conv4_src_tz}, dt::f32, tag::any);
    auto conv4_bias_md = memory::desc({conv4_bias_tz}, dt::f32, tag::any);
    auto conv4_weights_md = memory::desc({conv4_weights_tz}, dt::f32, tag::any);
    auto conv4_dst_md = memory::desc({conv4_dst_tz}, dt::f32, tag::any);

    // create a convolution
    auto conv4_prim_desc = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv4_src_md, conv4_weights_md, conv4_bias_md, conv4_dst_md,
            conv4_strides, conv4_padding, conv4_padding);

    auto conv4_src_memory = conv3_dst_memory;
    if (conv4_prim_desc.src_desc() != conv4_src_memory.get_desc()) {
        conv4_src_memory = memory(conv4_prim_desc.src_desc(), eng);
        net.push_back(reorder(conv3_dst_memory, conv4_src_memory));
        net_args.push_back({{DNNL_ARG_FROM, conv3_dst_memory},
                {DNNL_ARG_TO, conv4_src_memory}});
    }

    auto conv4_weights_memory = conv4_user_weights_memory;
    if (conv4_prim_desc.weights_desc()
            != conv4_user_weights_memory.get_desc()) {
        conv4_weights_memory = memory(conv4_prim_desc.weights_desc(), eng);
        reorder(conv4_user_weights_memory, conv4_weights_memory)
                .execute(s, conv4_user_weights_memory, conv4_weights_memory);
    }

    auto conv4_dst_memory = memory(conv4_prim_desc.dst_desc(), eng);

    // create convolution primitive and add it to net
    net.push_back(convolution_forward(conv4_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv4_src_memory},
            {DNNL_ARG_WEIGHTS, conv4_weights_memory},
            {DNNL_ARG_BIAS, conv4_user_bias_memory},
            {DNNL_ARG_DST, conv4_dst_memory}});

    // AlexNet: relu4
    // {batch, 384, 13, 13} -> {batch, 384, 13, 13}
    const float negative4_slope = 0.0f;

    // create relu primitive and add it to net
    auto relu4_prim_desc
            = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::eltwise_relu, conv4_dst_memory.get_desc(),
                    conv4_dst_memory.get_desc(), negative4_slope);

    net.push_back(eltwise_forward(relu4_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv4_dst_memory},
            {DNNL_ARG_DST, conv4_dst_memory}});

    // AlexNet: conv5
    // {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3} -> {batch, 256, 13, 13}
    // strides: {1, 1}
    memory::dims conv5_src_tz = {batch, 384, 13, 13};
    memory::dims conv5_weights_tz = {2, 128, 192, 3, 3};
    memory::dims conv5_bias_tz = {256};
    memory::dims conv5_dst_tz = {batch, 256, 13, 13};
    memory::dims conv5_strides = {1, 1};
    memory::dims conv5_padding = {1, 1};

    std::vector<float> conv5_weights(product(conv5_weights_tz));
    std::vector<float> conv5_bias(product(conv5_bias_tz));

    // create memory for user data
    auto conv5_user_weights_memory
            = memory({{conv5_weights_tz}, dt::f32, tag::goihw}, eng);
    write_to_dnnl_memory(conv5_weights.data(), conv5_user_weights_memory);
    auto conv5_user_bias_memory
            = memory({{conv5_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(conv5_bias.data(), conv5_user_bias_memory);

    // create memory descriptors for convolution data w/ no specified format
    auto conv5_src_md = memory::desc({conv5_src_tz}, dt::f32, tag::any);
    auto conv5_weights_md = memory::desc({conv5_weights_tz}, dt::f32, tag::any);
    auto conv5_bias_md = memory::desc({conv5_bias_tz}, dt::f32, tag::any);
    auto conv5_dst_md = memory::desc({conv5_dst_tz}, dt::f32, tag::any);

    // create a convolution
    auto conv5_prim_desc = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            conv5_src_md, conv5_weights_md, conv5_bias_md, conv5_dst_md,
            conv5_strides, conv5_padding, conv5_padding);

    auto conv5_src_memory = conv4_dst_memory;
    if (conv5_prim_desc.src_desc() != conv5_src_memory.get_desc()) {
        conv5_src_memory = memory(conv5_prim_desc.src_desc(), eng);
        net.push_back(reorder(conv4_dst_memory, conv5_src_memory));
        net_args.push_back({{DNNL_ARG_FROM, conv4_dst_memory},
                {DNNL_ARG_TO, conv5_src_memory}});
    }

    auto conv5_weights_memory = conv5_user_weights_memory;
    if (conv5_prim_desc.weights_desc()
            != conv5_user_weights_memory.get_desc()) {
        conv5_weights_memory = memory(conv5_prim_desc.weights_desc(), eng);
        reorder(conv5_user_weights_memory, conv5_weights_memory)
                .execute(s, conv5_user_weights_memory, conv5_weights_memory);
    }

    auto conv5_dst_memory = memory(conv5_prim_desc.dst_desc(), eng);

    // create convolution primitive and add it to net
    net.push_back(convolution_forward(conv5_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv5_src_memory},
            {DNNL_ARG_WEIGHTS, conv5_weights_memory},
            {DNNL_ARG_BIAS, conv5_user_bias_memory},
            {DNNL_ARG_DST, conv5_dst_memory}});

    // AlexNet: relu5
    // {batch, 256, 13, 13} -> {batch, 256, 13, 13}
    const float negative5_slope = 0.0f;

    // create relu primitive and add it to net
    auto relu5_prim_desc
            = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference,
                    algorithm::eltwise_relu, conv5_dst_memory.get_desc(),
                    conv5_dst_memory.get_desc(), negative5_slope);

    net.push_back(eltwise_forward(relu5_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, conv5_dst_memory},
            {DNNL_ARG_DST, conv5_dst_memory}});

    // AlexNet: pool5
    // {batch, 256, 13, 13} -> {batch, 256, 6, 6}
    // kernel: {3, 3}
    // strides: {2, 2}
    memory::dims pool5_dst_tz = {batch, 256, 6, 6};
    memory::dims pool5_kernel = {3, 3};
    memory::dims pool5_strides = {2, 2};
    memory::dims pool5_dilation = {0, 0};
    memory::dims pool5_padding = {0, 0};

    auto pool5_dst_md = memory::desc({pool5_dst_tz}, dt::f32, tag::any);

    // create a pooling primitive descriptor
    auto pool5_pd = pooling_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::pooling_max,
            conv5_dst_memory.get_desc(), pool5_dst_md, pool5_strides,
            pool5_kernel, pool5_dilation, pool5_padding, pool5_padding);

    auto pool5_dst_memory = memory(pool5_pd.dst_desc(), eng);

    // create pooling primitive and add it to net
    net.push_back(pooling_forward(pool5_pd));
    net_args.push_back({{DNNL_ARG_SRC, conv5_dst_memory},
            {DNNL_ARG_DST, pool5_dst_memory}});

    // fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6}
    //     -> {batch, 4096}
    memory::dims fc6_src_tz = {batch, 256, 6, 6};
    memory::dims fc6_weights_tz = {4096, 256, 6, 6};
    memory::dims fc6_bias_tz = {4096};
    memory::dims fc6_dst_tz = {batch, 4096};
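
    // The inner product treats the {batch, 256, 6, 6} activations as a
    // flattened {batch, 256 * 6 * 6} matrix, so the 4D weights above act as
    // a {4096, 9216} weight matrix.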

    std::vector<float> fc6_weights(product(fc6_weights_tz));
    std::vector<float> fc6_bias(product(fc6_bias_tz));

    // create memory for user data
    auto fc6_user_weights_memory
            = memory({{fc6_weights_tz}, dt::f32, tag::oihw}, eng);
    write_to_dnnl_memory(fc6_weights.data(), fc6_user_weights_memory);
    auto fc6_user_bias_memory = memory({{fc6_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(fc6_bias.data(), fc6_user_bias_memory);

    // create memory descriptors for inner product data w/ no specified format
    auto fc6_src_md = memory::desc({fc6_src_tz}, dt::f32, tag::any);
    auto fc6_bias_md = memory::desc({fc6_bias_tz}, dt::f32, tag::any);
    auto fc6_weights_md = memory::desc({fc6_weights_tz}, dt::f32, tag::any);
    auto fc6_dst_md = memory::desc({fc6_dst_tz}, dt::f32, tag::any);

    // create an inner product
    auto fc6_prim_desc = inner_product_forward::primitive_desc(eng,
            prop_kind::forward_inference, fc6_src_md, fc6_weights_md,
            fc6_bias_md, fc6_dst_md);

    auto fc6_src_memory = pool5_dst_memory;
    if (fc6_prim_desc.src_desc() != fc6_src_memory.get_desc()) {
        fc6_src_memory = memory(fc6_prim_desc.src_desc(), eng);
        net.push_back(reorder(pool5_dst_memory, fc6_src_memory));
        net_args.push_back({{DNNL_ARG_FROM, pool5_dst_memory},
                {DNNL_ARG_TO, fc6_src_memory}});
    }

    auto fc6_weights_memory = fc6_user_weights_memory;
    if (fc6_prim_desc.weights_desc() != fc6_user_weights_memory.get_desc()) {
        fc6_weights_memory = memory(fc6_prim_desc.weights_desc(), eng);
        reorder(fc6_user_weights_memory, fc6_weights_memory)
                .execute(s, fc6_user_weights_memory, fc6_weights_memory);
    }

    auto fc6_dst_memory = memory(fc6_prim_desc.dst_desc(), eng);

    // create inner product primitive and add it to net
    net.push_back(inner_product_forward(fc6_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, fc6_src_memory},
            {DNNL_ARG_WEIGHTS, fc6_weights_memory},
            {DNNL_ARG_BIAS, fc6_user_bias_memory},
            {DNNL_ARG_DST, fc6_dst_memory}});

    // fc7 inner product {batch, 4096} (x) {4096, 4096} -> {batch, 4096}
    memory::dims fc7_weights_tz = {4096, 4096};
    memory::dims fc7_bias_tz = {4096};
    memory::dims fc7_dst_tz = {batch, 4096};

    std::vector<float> fc7_weights(product(fc7_weights_tz));
    std::vector<float> fc7_bias(product(fc7_bias_tz));

    // create memory for user data
    auto fc7_user_weights_memory
            = memory({{fc7_weights_tz}, dt::f32, tag::nc}, eng);
    write_to_dnnl_memory(fc7_weights.data(), fc7_user_weights_memory);

    auto fc7_user_bias_memory = memory({{fc7_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(fc7_bias.data(), fc7_user_bias_memory);

    // create memory descriptors for inner product data w/ no specified format
    auto fc7_bias_md = memory::desc({fc7_bias_tz}, dt::f32, tag::any);
    auto fc7_weights_md = memory::desc({fc7_weights_tz}, dt::f32, tag::any);
    auto fc7_dst_md = memory::desc({fc7_dst_tz}, dt::f32, tag::any);

    // create an inner product
    auto fc7_prim_desc = inner_product_forward::primitive_desc(eng,
            prop_kind::forward_inference, fc6_dst_memory.get_desc(),
            fc7_weights_md, fc7_bias_md, fc7_dst_md);

    auto fc7_weights_memory = fc7_user_weights_memory;
    if (fc7_prim_desc.weights_desc() != fc7_user_weights_memory.get_desc()) {
        fc7_weights_memory = memory(fc7_prim_desc.weights_desc(), eng);
        reorder(fc7_user_weights_memory, fc7_weights_memory)
                .execute(s, fc7_user_weights_memory, fc7_weights_memory);
    }

    auto fc7_dst_memory = memory(fc7_prim_desc.dst_desc(), eng);

    // create inner product primitive and add it to net
    net.push_back(inner_product_forward(fc7_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, fc6_dst_memory},
            {DNNL_ARG_WEIGHTS, fc7_weights_memory},
            {DNNL_ARG_BIAS, fc7_user_bias_memory},
            {DNNL_ARG_DST, fc7_dst_memory}});

    // fc8 inner product {batch, 4096} (x) {1000, 4096} -> {batch, 1000}
    memory::dims fc8_weights_tz = {1000, 4096};
    memory::dims fc8_bias_tz = {1000};
    memory::dims fc8_dst_tz = {batch, 1000};

    std::vector<float> fc8_weights(product(fc8_weights_tz));
    std::vector<float> fc8_bias(product(fc8_bias_tz));

    // create memory for user data
    auto fc8_user_weights_memory
            = memory({{fc8_weights_tz}, dt::f32, tag::nc}, eng);
    write_to_dnnl_memory(fc8_weights.data(), fc8_user_weights_memory);
    auto fc8_user_bias_memory = memory({{fc8_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(fc8_bias.data(), fc8_user_bias_memory);
    auto user_dst_memory = memory({{fc8_dst_tz}, dt::f32, tag::nc}, eng);
    write_to_dnnl_memory(user_dst.data(), user_dst_memory);

    // create memory descriptors for inner product data w/ no specified format
    auto fc8_bias_md = memory::desc({fc8_bias_tz}, dt::f32, tag::any);
    auto fc8_weights_md = memory::desc({fc8_weights_tz}, dt::f32, tag::any);
    auto fc8_dst_md = memory::desc({fc8_dst_tz}, dt::f32, tag::any);

    // create an inner product
    auto fc8_prim_desc = inner_product_forward::primitive_desc(eng,
            prop_kind::forward_inference, fc7_dst_memory.get_desc(),
            fc8_weights_md, fc8_bias_md, fc8_dst_md);

    auto fc8_weights_memory = fc8_user_weights_memory;
    if (fc8_prim_desc.weights_desc() != fc8_user_weights_memory.get_desc()) {
        fc8_weights_memory = memory(fc8_prim_desc.weights_desc(), eng);
        reorder(fc8_user_weights_memory, fc8_weights_memory)
                .execute(s, fc8_user_weights_memory, fc8_weights_memory);
    }

    auto fc8_dst_memory = memory(fc8_prim_desc.dst_desc(), eng);

    // create inner product primitive and add it to net
    net.push_back(inner_product_forward(fc8_prim_desc));
    net_args.push_back({{DNNL_ARG_SRC, fc7_dst_memory},
            {DNNL_ARG_WEIGHTS, fc8_weights_memory},
            {DNNL_ARG_BIAS, fc8_user_bias_memory},
            {DNNL_ARG_DST, fc8_dst_memory}});

    // create a reorder between internal and user data if it is needed and
    // add it to the net after fc8
    if (fc8_dst_memory != user_dst_memory) {
        net.push_back(reorder(fc8_dst_memory, user_dst_memory));
        net_args.push_back({{DNNL_ARG_FROM, fc8_dst_memory},
                {DNNL_ARG_TO, user_dst_memory}});
    }

    /// @page cnn_inference_f32_cpp
    /// Finally, execute the primitives. For this example, the net is executed
    /// multiple times; the wrapper below measures the total time and reports
    /// the average per iteration.
    /// @snippet cnn_inference_f32.cpp Execute model
    //[Execute model]
    for (int j = 0; j < times; ++j) {
        assert(net.size() == net_args.size() && "something is missing");
        for (size_t i = 0; i < net.size(); ++i)
            net.at(i).execute(s, net_args.at(i));
    }
    //[Execute model]
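
    // To time each iteration individually instead, one could wrap the inner
    // loop with std::chrono, remembering that execution is asynchronous and
    // the stream must be synchronized before reading the clock (sketch):
    //
    //     auto t0 = std::chrono::steady_clock::now();
    //     for (size_t i = 0; i < net.size(); ++i)
    //         net.at(i).execute(s, net_args.at(i));
    //     s.wait();
    //     auto t1 = std::chrono::steady_clock::now();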

    s.wait();
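
    // The results now live in user_dst_memory; they could be copied back to
    // the user_dst vector with the matching example_utils.hpp helper
    // (illustrative):
    //
    //     read_from_dnnl_memory(user_dst.data(), user_dst_memory);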
}

void cnn_inference_f32(engine::kind engine_kind) {
    auto begin = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
                         .count();
    int times = 100;
    simple_net(engine_kind, times);
    auto end = std::chrono::duration_cast<std::chrono::milliseconds>(
            std::chrono::steady_clock::now().time_since_epoch())
                       .count();
    std::cout << "Average time: " << (end - begin) / (times + 0.0)
              << " ms per iteration." << std::endl;
}

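// Typical invocation (illustrative): `./cnn-inference-f32-cpp [cpu|gpu]`.
// parse_engine_kind() (from example_utils.hpp) maps the optional argument
// to an engine kind, and handle_example_errors() turns dnnl::error
// exceptions into a readable message and exit status.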
int main(int argc, char **argv) {
    return handle_example_errors(
            cnn_inference_f32, parse_engine_kind(argc, argv));
}