1 | /******************************************************************************* |
2 | * Copyright 2016-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cnn_inference_f32.cpp |
18 | /// @copybrief cnn_inference_f32_cpp |
19 | /// > Annotated version: @ref cnn_inference_f32_cpp |
20 | |
21 | /// @page cnn_inference_f32_cpp CNN f32 inference example |
22 | /// This C++ API example demonstrates how to build an AlexNet neural |
23 | /// network topology for forward-pass inference. |
24 | /// |
25 | /// > Example code: @ref cnn_inference_f32.cpp |
26 | /// |
27 | /// Some key take-aways include: |
28 | /// |
29 | /// * How tensors are implemented and submitted to primitives. |
30 | /// * How primitives are created. |
31 | /// * How primitives are sequentially submitted to the network, where the output |
32 | /// from primitives is passed as input to the next primitive. The latter |
33 | /// specifies a dependency between the primitive input and output data. |
34 | /// * Specific 'inference-only' configurations. |
35 | /// * Limiting the number of reorders performed that are detrimental |
36 | /// to performance. |
37 | /// |
38 | /// The example implements the AlexNet layers |
39 | /// as numbered primitives (for example, conv1, pool1, conv2). |
40 | |
41 | #include <assert.h> |
42 | |
43 | #include <chrono> |
44 | #include <vector> |
45 | #include <unordered_map> |
46 | |
47 | #include "oneapi/dnnl/dnnl.hpp" |
48 | |
49 | #include "example_utils.hpp" |
50 | |
51 | using namespace dnnl; |
52 | |
53 | void simple_net(engine::kind engine_kind, int times = 100) { |
54 | using tag = memory::format_tag; |
55 | using dt = memory::data_type; |
56 | |
57 | /// Initialize an engine and stream. The last parameter in the call represents |
58 | /// the index of the engine. |
59 | /// @snippet cnn_inference_f32.cpp Initialize engine and stream |
60 | //[Initialize engine and stream] |
61 | engine eng(engine_kind, 0); |
62 | stream s(eng); |
63 | //[Initialize engine and stream] |
64 | |
65 | /// Create a vector for the primitives and a vector to hold memory |
66 | /// that will be used as arguments. |
67 | /// @snippet cnn_inference_f32.cpp Create network |
68 | //[Create network] |
69 | std::vector<primitive> net; |
70 | std::vector<std::unordered_map<int, memory>> net_args; |
71 | //[Create network] |
72 | |
73 | const memory::dim batch = 1; |
74 | |
75 | // AlexNet: conv1 |
76 | // {batch, 3, 227, 227} (x) {96, 3, 11, 11} -> {batch, 96, 55, 55} |
77 | // strides: {4, 4} |
78 | memory::dims conv1_src_tz = {batch, 3, 227, 227}; |
79 | memory::dims conv1_weights_tz = {96, 3, 11, 11}; |
80 | memory::dims conv1_bias_tz = {96}; |
81 | memory::dims conv1_dst_tz = {batch, 96, 55, 55}; |
82 | memory::dims conv1_strides = {4, 4}; |
83 | memory::dims conv1_padding = {0, 0}; |
84 | |
85 | /// Allocate buffers for input and output data, weights, and bias. |
86 | /// @snippet cnn_inference_f32.cpp Allocate buffers |
87 | //[Allocate buffers] |
88 | std::vector<float> user_src(batch * 3 * 227 * 227); |
89 | std::vector<float> user_dst(batch * 1000); |
90 | std::vector<float> conv1_weights(product(conv1_weights_tz)); |
91 | std::vector<float> conv1_bias(product(conv1_bias_tz)); |
92 | //[Allocate buffers] |
93 | |
94 | /// Create memory that describes data layout in the buffers. This example uses |
95 | /// tag::nchw (batch-channels-height-width) for input data and tag::oihw |
96 | /// for weights. |
97 | /// @snippet cnn_inference_f32.cpp Create user memory |
98 | //[Create user memory] |
99 | auto user_src_memory = memory({{conv1_src_tz}, dt::f32, tag::nchw}, eng); |
100 | write_to_dnnl_memory(user_src.data(), user_src_memory); |
101 | auto user_weights_memory |
102 | = memory({{conv1_weights_tz}, dt::f32, tag::oihw}, eng); |
103 | write_to_dnnl_memory(conv1_weights.data(), user_weights_memory); |
104 | auto conv1_user_bias_memory |
105 | = memory({{conv1_bias_tz}, dt::f32, tag::x}, eng); |
106 | write_to_dnnl_memory(conv1_bias.data(), conv1_user_bias_memory); |
107 | //[Create user memory] |
108 | |
109 | /// Create memory descriptors with layout tag::any. The `any` format enables |
110 | /// the convolution primitive to choose the data format that will result in |
111 | /// best performance based on its input parameters (convolution kernel |
112 | /// sizes, strides, padding, and so on). If the resulting format is different |
113 | /// from `nchw`, the user data must be transformed to the format required for |
114 | /// the convolution (as explained below). |
115 | /// @snippet cnn_inference_f32.cpp Create convolution memory descriptors |
116 | //[Create convolution memory descriptors] |
117 | auto conv1_src_md = memory::desc({conv1_src_tz}, dt::f32, tag::any); |
118 | auto conv1_bias_md = memory::desc({conv1_bias_tz}, dt::f32, tag::any); |
119 | auto conv1_weights_md = memory::desc({conv1_weights_tz}, dt::f32, tag::any); |
120 | auto conv1_dst_md = memory::desc({conv1_dst_tz}, dt::f32, tag::any); |
121 | //[Create convolution memory descriptors] |
122 | |
123 | /// Create a convolution primitive descriptor by specifying engine, |
124 | /// propagation kind, [convolution algorithm](@ref dev_guide_convolution), |
125 | /// shapes of input, weights, bias, output, convolution strides, padding, |
126 | /// and kind of padding. |
127 | /// Propagation kind is set to prop_kind::forward_inference to optimize for |
128 | /// inference execution and omit computations that are necessary only for |
129 | /// backward propagation. |
130 | /// Once created, it has specific formats instead of the `any` format. |
131 | /// @snippet cnn_inference_f32.cpp Create convolution primitive descriptor |
132 | //[Create convolution primitive descriptor] |
133 | auto conv1_prim_desc = convolution_forward::primitive_desc(eng, |
134 | prop_kind::forward_inference, algorithm::convolution_direct, |
135 | conv1_src_md, conv1_weights_md, conv1_bias_md, conv1_dst_md, |
136 | conv1_strides, conv1_padding, conv1_padding); |
137 | //[Create convolution primitive descriptor] |
138 | |
    /// Check whether the data and weights formats required by the convolution
    /// differ from the user format. If they differ, change the layout using a
    /// reorder primitive.
142 | /// @snippet cnn_inference_f32.cpp Reorder data and weights |
143 | //[Reorder data and weights] |
144 | auto conv1_src_memory = user_src_memory; |
145 | if (conv1_prim_desc.src_desc() != user_src_memory.get_desc()) { |
146 | conv1_src_memory = memory(conv1_prim_desc.src_desc(), eng); |
147 | net.push_back(reorder(user_src_memory, conv1_src_memory)); |
148 | net_args.push_back({{DNNL_ARG_FROM, user_src_memory}, |
149 | {DNNL_ARG_TO, conv1_src_memory}}); |
150 | } |
151 | |
152 | auto conv1_weights_memory = user_weights_memory; |
153 | if (conv1_prim_desc.weights_desc() != user_weights_memory.get_desc()) { |
154 | conv1_weights_memory = memory(conv1_prim_desc.weights_desc(), eng); |
155 | reorder(user_weights_memory, conv1_weights_memory) |
156 | .execute(s, user_weights_memory, conv1_weights_memory); |
157 | } |
158 | //[Reorder data and weights] |
159 | |
160 | /// Create a memory primitive for output. |
161 | /// @snippet cnn_inference_f32.cpp Create memory for output |
162 | //[Create memory for output] |
163 | auto conv1_dst_memory = memory(conv1_prim_desc.dst_desc(), eng); |
164 | //[Create memory for output] |
165 | |
166 | /// Create a convolution primitive and add it to the net. |
    /// @snippet cnn_inference_f32.cpp Create convolution primitive
168 | //[Create convolution primitive] |
169 | net.push_back(convolution_forward(conv1_prim_desc)); |
170 | net_args.push_back({{DNNL_ARG_SRC, conv1_src_memory}, |
171 | {DNNL_ARG_WEIGHTS, conv1_weights_memory}, |
172 | {DNNL_ARG_BIAS, conv1_user_bias_memory}, |
173 | {DNNL_ARG_DST, conv1_dst_memory}}); |
174 | //[Create convolution primitive] |
175 | |
176 | // AlexNet: relu1 |
177 | // {batch, 96, 55, 55} -> {batch, 96, 55, 55} |
178 | const float negative1_slope = 0.0f; |
179 | |
180 | /// Create the relu primitive. For better performance, keep the input data |
181 | /// format for ReLU (as well as for other operation primitives until another |
182 | /// convolution or inner product is encountered) the same as the one chosen |
183 | /// for convolution. Also note that ReLU is done in-place by using conv1 memory. |
184 | /// @snippet cnn_inference_f32.cpp Create relu primitive |
185 | //[Create relu primitive] |
186 | auto relu1_prim_desc |
187 | = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference, |
188 | algorithm::eltwise_relu, conv1_dst_memory.get_desc(), |
189 | conv1_dst_memory.get_desc(), negative1_slope); |
190 | |
191 | net.push_back(eltwise_forward(relu1_prim_desc)); |
192 | net_args.push_back({{DNNL_ARG_SRC, conv1_dst_memory}, |
193 | {DNNL_ARG_DST, conv1_dst_memory}}); |
194 | //[Create relu primitive] |
195 | |
196 | // AlexNet: lrn1 |
197 | // {batch, 96, 55, 55} -> {batch, 96, 55, 55} |
198 | // local size: 5 |
199 | // alpha1: 0.0001 |
200 | // beta1: 0.75 |
201 | const memory::dim local1_size = 5; |
202 | const float alpha1 = 0.0001f; |
203 | const float beta1 = 0.75f; |
204 | const float k1 = 1.0f; |
205 | |
206 | // create lrn primitive and add it to net |
207 | auto lrn1_prim_desc = lrn_forward::primitive_desc(eng, |
208 | prop_kind::forward_inference, algorithm::lrn_across_channels, |
209 | conv1_dst_memory.get_desc(), conv1_dst_memory.get_desc(), |
210 | local1_size, alpha1, beta1, k1); |
211 | auto lrn1_dst_memory = memory(lrn1_prim_desc.dst_desc(), eng); |
212 | |
213 | net.push_back(lrn_forward(lrn1_prim_desc)); |
214 | net_args.push_back({{DNNL_ARG_SRC, conv1_dst_memory}, |
215 | {DNNL_ARG_DST, lrn1_dst_memory}}); |
216 | |
217 | // AlexNet: pool1 |
218 | // {batch, 96, 55, 55} -> {batch, 96, 27, 27} |
219 | // kernel: {3, 3} |
220 | // strides: {2, 2} |
221 | memory::dims pool1_dst_tz = {batch, 96, 27, 27}; |
222 | memory::dims pool1_kernel = {3, 3}; |
223 | memory::dims pool1_strides = {2, 2}; |
224 | memory::dims pool_dilation = {0, 0}; |
225 | memory::dims pool_padding = {0, 0}; |
226 | |
227 | auto pool1_dst_md = memory::desc({pool1_dst_tz}, dt::f32, tag::any); |
228 | |
229 | /// For training execution, pooling requires a private workspace memory |
230 | /// to perform the backward pass. However, pooling should not use 'workspace' |
231 | /// for inference, because this is detrimental to performance. |
232 | /// @snippet cnn_inference_f32.cpp Create pooling primitive |
233 | /// |
234 | /// The example continues to create more layers according |
235 | /// to the AlexNet topology. |
236 | //[Create pooling primitive] |
237 | auto pool1_pd = pooling_forward::primitive_desc(eng, |
238 | prop_kind::forward_inference, algorithm::pooling_max, |
239 | lrn1_dst_memory.get_desc(), pool1_dst_md, pool1_strides, |
240 | pool1_kernel, pool_dilation, pool_padding, pool_padding); |
241 | auto pool1_dst_memory = memory(pool1_pd.dst_desc(), eng); |
242 | |
243 | net.push_back(pooling_forward(pool1_pd)); |
244 | net_args.push_back({{DNNL_ARG_SRC, lrn1_dst_memory}, |
245 | {DNNL_ARG_DST, pool1_dst_memory}}); |
246 | //[Create pooling primitive] |
247 | |
248 | // AlexNet: conv2 |
249 | // {batch, 96, 27, 27} (x) {2, 128, 48, 5, 5} -> {batch, 256, 27, 27} |
250 | // strides: {1, 1} |
251 | memory::dims conv2_src_tz = {batch, 96, 27, 27}; |
252 | memory::dims conv2_weights_tz = {2, 128, 48, 5, 5}; |
253 | memory::dims conv2_bias_tz = {256}; |
254 | memory::dims conv2_dst_tz = {batch, 256, 27, 27}; |
255 | memory::dims conv2_strides = {1, 1}; |
256 | memory::dims conv2_padding = {2, 2}; |
257 | |
258 | std::vector<float> conv2_weights(product(conv2_weights_tz)); |
259 | std::vector<float> conv2_bias(product(conv2_bias_tz)); |
260 | |
261 | // create memory for user data |
262 | auto conv2_user_weights_memory |
263 | = memory({{conv2_weights_tz}, dt::f32, tag::goihw}, eng); |
264 | write_to_dnnl_memory(conv2_weights.data(), conv2_user_weights_memory); |
265 | auto conv2_user_bias_memory |
266 | = memory({{conv2_bias_tz}, dt::f32, tag::x}, eng); |
267 | write_to_dnnl_memory(conv2_bias.data(), conv2_user_bias_memory); |
268 | |
269 | // create memory descriptors for convolution data w/ no specified format |
270 | auto conv2_src_md = memory::desc({conv2_src_tz}, dt::f32, tag::any); |
271 | auto conv2_bias_md = memory::desc({conv2_bias_tz}, dt::f32, tag::any); |
272 | auto conv2_weights_md = memory::desc({conv2_weights_tz}, dt::f32, tag::any); |
273 | auto conv2_dst_md = memory::desc({conv2_dst_tz}, dt::f32, tag::any); |
274 | |
275 | // create a convolution |
276 | auto conv2_prim_desc = convolution_forward::primitive_desc(eng, |
277 | prop_kind::forward_inference, algorithm::convolution_direct, |
278 | conv2_src_md, conv2_weights_md, conv2_bias_md, conv2_dst_md, |
279 | conv2_strides, conv2_padding, conv2_padding); |
280 | |
281 | auto conv2_src_memory = pool1_dst_memory; |
282 | if (conv2_prim_desc.src_desc() != conv2_src_memory.get_desc()) { |
283 | conv2_src_memory = memory(conv2_prim_desc.src_desc(), eng); |
284 | net.push_back(reorder(pool1_dst_memory, conv2_src_memory)); |
285 | net_args.push_back({{DNNL_ARG_FROM, pool1_dst_memory}, |
286 | {DNNL_ARG_TO, conv2_src_memory}}); |
287 | } |
288 | |
289 | auto conv2_weights_memory = conv2_user_weights_memory; |
290 | if (conv2_prim_desc.weights_desc() |
291 | != conv2_user_weights_memory.get_desc()) { |
292 | conv2_weights_memory = memory(conv2_prim_desc.weights_desc(), eng); |
293 | reorder(conv2_user_weights_memory, conv2_weights_memory) |
294 | .execute(s, conv2_user_weights_memory, conv2_weights_memory); |
295 | } |
296 | |
297 | auto conv2_dst_memory = memory(conv2_prim_desc.dst_desc(), eng); |
298 | |
299 | // create convolution primitive and add it to net |
300 | net.push_back(convolution_forward(conv2_prim_desc)); |
301 | net_args.push_back({{DNNL_ARG_SRC, conv2_src_memory}, |
302 | {DNNL_ARG_WEIGHTS, conv2_weights_memory}, |
303 | {DNNL_ARG_BIAS, conv2_user_bias_memory}, |
304 | {DNNL_ARG_DST, conv2_dst_memory}}); |
305 | |
306 | // AlexNet: relu2 |
307 | // {batch, 256, 27, 27} -> {batch, 256, 27, 27} |
308 | const float negative2_slope = 0.0f; |
309 | |
310 | // create relu primitive and add it to net |
311 | auto relu2_prim_desc |
312 | = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference, |
313 | algorithm::eltwise_relu, conv2_dst_memory.get_desc(), |
314 | conv2_dst_memory.get_desc(), negative2_slope); |
315 | |
316 | net.push_back(eltwise_forward(relu2_prim_desc)); |
317 | net_args.push_back({{DNNL_ARG_SRC, conv2_dst_memory}, |
318 | {DNNL_ARG_DST, conv2_dst_memory}}); |
319 | |
320 | // AlexNet: lrn2 |
321 | // {batch, 256, 27, 27} -> {batch, 256, 27, 27} |
322 | // local size: 5 |
323 | // alpha2: 0.0001 |
324 | // beta2: 0.75 |
325 | const memory::dim local2_size = 5; |
326 | const float alpha2 = 0.0001f; |
327 | const float beta2 = 0.75f; |
328 | const float k2 = 1.0f; |
329 | |
330 | // create lrn primitive and add it to net |
331 | auto lrn2_prim_desc |
332 | = lrn_forward::primitive_desc(eng, prop_kind::forward_inference, |
333 | algorithm::lrn_across_channels, conv2_prim_desc.dst_desc(), |
334 | conv2_prim_desc.dst_desc(), local2_size, alpha2, beta2, k2); |
335 | auto lrn2_dst_memory = memory(lrn2_prim_desc.dst_desc(), eng); |
336 | |
337 | net.push_back(lrn_forward(lrn2_prim_desc)); |
338 | net_args.push_back({{DNNL_ARG_SRC, conv2_dst_memory}, |
339 | {DNNL_ARG_DST, lrn2_dst_memory}}); |
340 | |
341 | // AlexNet: pool2 |
342 | // {batch, 256, 27, 27} -> {batch, 256, 13, 13} |
343 | // kernel: {3, 3} |
344 | // strides: {2, 2} |
345 | memory::dims pool2_dst_tz = {batch, 256, 13, 13}; |
346 | memory::dims pool2_kernel = {3, 3}; |
347 | memory::dims pool2_strides = {2, 2}; |
348 | memory::dims pool2_dilation = {0, 0}; |
349 | memory::dims pool2_padding = {0, 0}; |
350 | |
351 | auto pool2_dst_md = memory::desc({pool2_dst_tz}, dt::f32, tag::any); |
352 | |
353 | // create a pooling |
354 | auto pool2_pd = pooling_forward::primitive_desc(eng, |
355 | prop_kind::forward_inference, algorithm::pooling_max, |
356 | lrn2_dst_memory.get_desc(), pool2_dst_md, pool2_strides, |
357 | pool2_kernel, pool2_dilation, pool2_padding, pool2_padding); |
358 | auto pool2_dst_memory = memory(pool2_pd.dst_desc(), eng); |
359 | |
    // create pooling primitive and add it to net
361 | net.push_back(pooling_forward(pool2_pd)); |
362 | net_args.push_back({{DNNL_ARG_SRC, lrn2_dst_memory}, |
363 | {DNNL_ARG_DST, pool2_dst_memory}}); |
364 | |
365 | // AlexNet: conv3 |
366 | // {batch, 256, 13, 13} (x) {384, 256, 3, 3}; -> {batch, 384, 13, 13}; |
367 | // strides: {1, 1} |
368 | memory::dims conv3_src_tz = {batch, 256, 13, 13}; |
369 | memory::dims conv3_weights_tz = {384, 256, 3, 3}; |
370 | memory::dims conv3_bias_tz = {384}; |
371 | memory::dims conv3_dst_tz = {batch, 384, 13, 13}; |
372 | memory::dims conv3_strides = {1, 1}; |
373 | memory::dims conv3_padding = {1, 1}; |
374 | |
375 | std::vector<float> conv3_weights(product(conv3_weights_tz)); |
376 | std::vector<float> conv3_bias(product(conv3_bias_tz)); |
377 | |
378 | // create memory for user data |
379 | auto conv3_user_weights_memory |
380 | = memory({{conv3_weights_tz}, dt::f32, tag::oihw}, eng); |
381 | write_to_dnnl_memory(conv3_weights.data(), conv3_user_weights_memory); |
382 | auto conv3_user_bias_memory |
383 | = memory({{conv3_bias_tz}, dt::f32, tag::x}, eng); |
384 | write_to_dnnl_memory(conv3_bias.data(), conv3_user_bias_memory); |
385 | |
386 | // create memory descriptors for convolution data w/ no specified format |
387 | auto conv3_src_md = memory::desc({conv3_src_tz}, dt::f32, tag::any); |
388 | auto conv3_bias_md = memory::desc({conv3_bias_tz}, dt::f32, tag::any); |
389 | auto conv3_weights_md = memory::desc({conv3_weights_tz}, dt::f32, tag::any); |
390 | auto conv3_dst_md = memory::desc({conv3_dst_tz}, dt::f32, tag::any); |
391 | |
392 | // create a convolution |
393 | auto conv3_prim_desc = convolution_forward::primitive_desc(eng, |
394 | prop_kind::forward_inference, algorithm::convolution_direct, |
395 | conv3_src_md, conv3_weights_md, conv3_bias_md, conv3_dst_md, |
396 | conv3_strides, conv3_padding, conv3_padding); |
397 | |
398 | auto conv3_src_memory = pool2_dst_memory; |
399 | if (conv3_prim_desc.src_desc() != conv3_src_memory.get_desc()) { |
400 | conv3_src_memory = memory(conv3_prim_desc.src_desc(), eng); |
401 | net.push_back(reorder(pool2_dst_memory, conv3_src_memory)); |
402 | net_args.push_back({{DNNL_ARG_FROM, pool2_dst_memory}, |
403 | {DNNL_ARG_TO, conv3_src_memory}}); |
404 | } |
405 | |
406 | auto conv3_weights_memory = conv3_user_weights_memory; |
407 | if (conv3_prim_desc.weights_desc() |
408 | != conv3_user_weights_memory.get_desc()) { |
409 | conv3_weights_memory = memory(conv3_prim_desc.weights_desc(), eng); |
410 | reorder(conv3_user_weights_memory, conv3_weights_memory) |
411 | .execute(s, conv3_user_weights_memory, conv3_weights_memory); |
412 | } |
413 | |
414 | auto conv3_dst_memory = memory(conv3_prim_desc.dst_desc(), eng); |
415 | |
416 | // create convolution primitive and add it to net |
417 | net.push_back(convolution_forward(conv3_prim_desc)); |
418 | net_args.push_back({{DNNL_ARG_SRC, conv3_src_memory}, |
419 | {DNNL_ARG_WEIGHTS, conv3_weights_memory}, |
420 | {DNNL_ARG_BIAS, conv3_user_bias_memory}, |
421 | {DNNL_ARG_DST, conv3_dst_memory}}); |
422 | |
423 | // AlexNet: relu3 |
424 | // {batch, 384, 13, 13} -> {batch, 384, 13, 13} |
425 | const float negative3_slope = 0.0f; |
426 | |
427 | // create relu primitive and add it to net |
428 | auto relu3_prim_desc |
429 | = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference, |
430 | algorithm::eltwise_relu, conv3_dst_memory.get_desc(), |
431 | conv3_dst_memory.get_desc(), negative3_slope); |
432 | |
433 | net.push_back(eltwise_forward(relu3_prim_desc)); |
434 | net_args.push_back({{DNNL_ARG_SRC, conv3_dst_memory}, |
435 | {DNNL_ARG_DST, conv3_dst_memory}}); |
436 | |
437 | // AlexNet: conv4 |
438 | // {batch, 384, 13, 13} (x) {2, 192, 192, 3, 3}; -> |
439 | // {batch, 384, 13, 13}; |
440 | // strides: {1, 1} |
441 | memory::dims conv4_src_tz = {batch, 384, 13, 13}; |
442 | memory::dims conv4_weights_tz = {2, 192, 192, 3, 3}; |
443 | memory::dims conv4_bias_tz = {384}; |
444 | memory::dims conv4_dst_tz = {batch, 384, 13, 13}; |
445 | memory::dims conv4_strides = {1, 1}; |
446 | memory::dims conv4_padding = {1, 1}; |
447 | |
448 | std::vector<float> conv4_weights(product(conv4_weights_tz)); |
449 | std::vector<float> conv4_bias(product(conv4_bias_tz)); |
450 | |
451 | // create memory for user data |
452 | auto conv4_user_weights_memory |
453 | = memory({{conv4_weights_tz}, dt::f32, tag::goihw}, eng); |
454 | write_to_dnnl_memory(conv4_weights.data(), conv4_user_weights_memory); |
455 | auto conv4_user_bias_memory |
456 | = memory({{conv4_bias_tz}, dt::f32, tag::x}, eng); |
457 | write_to_dnnl_memory(conv4_bias.data(), conv4_user_bias_memory); |
458 | |
459 | // create memory descriptors for convolution data w/ no specified format |
460 | auto conv4_src_md = memory::desc({conv4_src_tz}, dt::f32, tag::any); |
461 | auto conv4_bias_md = memory::desc({conv4_bias_tz}, dt::f32, tag::any); |
462 | auto conv4_weights_md = memory::desc({conv4_weights_tz}, dt::f32, tag::any); |
463 | auto conv4_dst_md = memory::desc({conv4_dst_tz}, dt::f32, tag::any); |
464 | |
465 | // create a convolution |
466 | auto conv4_prim_desc = convolution_forward::primitive_desc(eng, |
467 | prop_kind::forward_inference, algorithm::convolution_direct, |
468 | conv4_src_md, conv4_weights_md, conv4_bias_md, conv4_dst_md, |
469 | conv4_strides, conv4_padding, conv4_padding); |
470 | |
471 | auto conv4_src_memory = conv3_dst_memory; |
472 | if (conv4_prim_desc.src_desc() != conv4_src_memory.get_desc()) { |
473 | conv4_src_memory = memory(conv4_prim_desc.src_desc(), eng); |
474 | net.push_back(reorder(conv3_dst_memory, conv4_src_memory)); |
475 | net_args.push_back({{DNNL_ARG_FROM, conv3_dst_memory}, |
476 | {DNNL_ARG_TO, conv4_src_memory}}); |
477 | } |
478 | |
479 | auto conv4_weights_memory = conv4_user_weights_memory; |
480 | if (conv4_prim_desc.weights_desc() |
481 | != conv4_user_weights_memory.get_desc()) { |
482 | conv4_weights_memory = memory(conv4_prim_desc.weights_desc(), eng); |
483 | reorder(conv4_user_weights_memory, conv4_weights_memory) |
484 | .execute(s, conv4_user_weights_memory, conv4_weights_memory); |
485 | } |
486 | |
487 | auto conv4_dst_memory = memory(conv4_prim_desc.dst_desc(), eng); |
488 | |
489 | // create convolution primitive and add it to net |
490 | net.push_back(convolution_forward(conv4_prim_desc)); |
491 | net_args.push_back({{DNNL_ARG_SRC, conv4_src_memory}, |
492 | {DNNL_ARG_WEIGHTS, conv4_weights_memory}, |
493 | {DNNL_ARG_BIAS, conv4_user_bias_memory}, |
494 | {DNNL_ARG_DST, conv4_dst_memory}}); |
495 | |
496 | // AlexNet: relu4 |
497 | // {batch, 384, 13, 13} -> {batch, 384, 13, 13} |
498 | const float negative4_slope = 0.0f; |
499 | |
500 | // create relu primitive and add it to net |
501 | auto relu4_prim_desc |
502 | = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference, |
503 | algorithm::eltwise_relu, conv4_dst_memory.get_desc(), |
504 | conv4_dst_memory.get_desc(), negative4_slope); |
505 | |
506 | net.push_back(eltwise_forward(relu4_prim_desc)); |
507 | net_args.push_back({{DNNL_ARG_SRC, conv4_dst_memory}, |
508 | {DNNL_ARG_DST, conv4_dst_memory}}); |
509 | |
510 | // AlexNet: conv5 |
511 | // {batch, 384, 13, 13} (x) {2, 128, 192, 3, 3}; -> {batch, 256, 13, 13}; |
512 | // strides: {1, 1} |
513 | memory::dims conv5_src_tz = {batch, 384, 13, 13}; |
514 | memory::dims conv5_weights_tz = {2, 128, 192, 3, 3}; |
515 | memory::dims conv5_bias_tz = {256}; |
516 | memory::dims conv5_dst_tz = {batch, 256, 13, 13}; |
517 | memory::dims conv5_strides = {1, 1}; |
518 | memory::dims conv5_padding = {1, 1}; |
519 | |
520 | std::vector<float> conv5_weights(product(conv5_weights_tz)); |
521 | std::vector<float> conv5_bias(product(conv5_bias_tz)); |
522 | |
523 | // create memory for user data |
524 | auto conv5_user_weights_memory |
525 | = memory({{conv5_weights_tz}, dt::f32, tag::goihw}, eng); |
526 | write_to_dnnl_memory(conv5_weights.data(), conv5_user_weights_memory); |
527 | auto conv5_user_bias_memory |
528 | = memory({{conv5_bias_tz}, dt::f32, tag::x}, eng); |
529 | write_to_dnnl_memory(conv5_bias.data(), conv5_user_bias_memory); |
530 | |
531 | // create memory descriptors for convolution data w/ no specified format |
532 | auto conv5_src_md = memory::desc({conv5_src_tz}, dt::f32, tag::any); |
533 | auto conv5_weights_md = memory::desc({conv5_weights_tz}, dt::f32, tag::any); |
534 | auto conv5_bias_md = memory::desc({conv5_bias_tz}, dt::f32, tag::any); |
535 | auto conv5_dst_md = memory::desc({conv5_dst_tz}, dt::f32, tag::any); |
536 | |
537 | // create a convolution |
538 | auto conv5_prim_desc = convolution_forward::primitive_desc(eng, |
539 | prop_kind::forward_inference, algorithm::convolution_direct, |
540 | conv5_src_md, conv5_weights_md, conv5_bias_md, conv5_dst_md, |
541 | conv5_strides, conv5_padding, conv5_padding); |
542 | |
543 | auto conv5_src_memory = conv4_dst_memory; |
544 | if (conv5_prim_desc.src_desc() != conv5_src_memory.get_desc()) { |
545 | conv5_src_memory = memory(conv5_prim_desc.src_desc(), eng); |
546 | net.push_back(reorder(conv4_dst_memory, conv5_src_memory)); |
547 | net_args.push_back({{DNNL_ARG_FROM, conv4_dst_memory}, |
548 | {DNNL_ARG_TO, conv5_src_memory}}); |
549 | } |
550 | |
551 | auto conv5_weights_memory = conv5_user_weights_memory; |
552 | if (conv5_prim_desc.weights_desc() |
553 | != conv5_user_weights_memory.get_desc()) { |
554 | conv5_weights_memory = memory(conv5_prim_desc.weights_desc(), eng); |
555 | reorder(conv5_user_weights_memory, conv5_weights_memory) |
556 | .execute(s, conv5_user_weights_memory, conv5_weights_memory); |
557 | } |
558 | |
559 | auto conv5_dst_memory = memory(conv5_prim_desc.dst_desc(), eng); |
560 | |
561 | // create convolution primitive and add it to net |
562 | net.push_back(convolution_forward(conv5_prim_desc)); |
563 | net_args.push_back({{DNNL_ARG_SRC, conv5_src_memory}, |
564 | {DNNL_ARG_WEIGHTS, conv5_weights_memory}, |
565 | {DNNL_ARG_BIAS, conv5_user_bias_memory}, |
566 | {DNNL_ARG_DST, conv5_dst_memory}}); |
567 | |
568 | // AlexNet: relu5 |
569 | // {batch, 256, 13, 13} -> {batch, 256, 13, 13} |
570 | const float negative5_slope = 0.0f; |
571 | |
572 | // create relu primitive and add it to net |
573 | auto relu5_prim_desc |
574 | = eltwise_forward::primitive_desc(eng, prop_kind::forward_inference, |
575 | algorithm::eltwise_relu, conv5_dst_memory.get_desc(), |
576 | conv5_dst_memory.get_desc(), negative5_slope); |
577 | |
578 | net.push_back(eltwise_forward(relu5_prim_desc)); |
579 | net_args.push_back({{DNNL_ARG_SRC, conv5_dst_memory}, |
580 | {DNNL_ARG_DST, conv5_dst_memory}}); |
581 | |
582 | // AlexNet: pool5 |
583 | // {batch, 256, 13, 13} -> {batch, 256, 6, 6} |
584 | // kernel: {3, 3} |
585 | // strides: {2, 2} |
586 | memory::dims pool5_dst_tz = {batch, 256, 6, 6}; |
587 | memory::dims pool5_kernel = {3, 3}; |
588 | memory::dims pool5_strides = {2, 2}; |
589 | memory::dims pool5_dilation = {0, 0}; |
590 | memory::dims pool5_padding = {0, 0}; |
591 | |
592 | std::vector<float> pool5_dst(product(pool5_dst_tz)); |
593 | |
594 | auto pool5_dst_md = memory::desc({pool5_dst_tz}, dt::f32, tag::any); |
595 | |
596 | // create a pooling |
597 | auto pool5_pd = pooling_forward::primitive_desc(eng, |
598 | prop_kind::forward_inference, algorithm::pooling_max, |
599 | conv5_dst_memory.get_desc(), pool5_dst_md, pool5_strides, |
600 | pool5_kernel, pool5_dilation, pool5_padding, pool5_padding); |
601 | |
602 | auto pool5_dst_memory = memory(pool5_pd.dst_desc(), eng); |
603 | |
    // create pooling primitive and add it to net
605 | net.push_back(pooling_forward(pool5_pd)); |
606 | net_args.push_back({{DNNL_ARG_SRC, conv5_dst_memory}, |
607 | {DNNL_ARG_DST, pool5_dst_memory}}); |
608 | |
609 | // fc6 inner product {batch, 256, 6, 6} (x) {4096, 256, 6, 6}-> {batch, |
610 | // 4096} |
611 | memory::dims fc6_src_tz = {batch, 256, 6, 6}; |
612 | memory::dims fc6_weights_tz = {4096, 256, 6, 6}; |
613 | memory::dims fc6_bias_tz = {4096}; |
614 | memory::dims fc6_dst_tz = {batch, 4096}; |
615 | |
616 | std::vector<float> fc6_weights(product(fc6_weights_tz)); |
617 | std::vector<float> fc6_bias(product(fc6_bias_tz)); |
618 | |
619 | // create memory for user data |
620 | auto fc6_user_weights_memory |
621 | = memory({{fc6_weights_tz}, dt::f32, tag::oihw}, eng); |
622 | write_to_dnnl_memory(fc6_weights.data(), fc6_user_weights_memory); |
623 | auto fc6_user_bias_memory = memory({{fc6_bias_tz}, dt::f32, tag::x}, eng); |
624 | write_to_dnnl_memory(fc6_bias.data(), fc6_user_bias_memory); |
625 | |
626 | // create memory descriptors for convolution data w/ no specified format |
627 | auto fc6_src_md = memory::desc({fc6_src_tz}, dt::f32, tag::any); |
628 | auto fc6_bias_md = memory::desc({fc6_bias_tz}, dt::f32, tag::any); |
629 | auto fc6_weights_md = memory::desc({fc6_weights_tz}, dt::f32, tag::any); |
630 | auto fc6_dst_md = memory::desc({fc6_dst_tz}, dt::f32, tag::any); |
631 | |
    // create an inner_product
633 | auto fc6_prim_desc = inner_product_forward::primitive_desc(eng, |
634 | prop_kind::forward_inference, fc6_src_md, fc6_weights_md, |
635 | fc6_bias_md, fc6_dst_md); |
636 | |
637 | auto fc6_src_memory = pool5_dst_memory; |
638 | if (fc6_prim_desc.src_desc() != fc6_src_memory.get_desc()) { |
639 | fc6_src_memory = memory(fc6_prim_desc.src_desc(), eng); |
640 | net.push_back(reorder(pool5_dst_memory, fc6_src_memory)); |
641 | net_args.push_back({{DNNL_ARG_FROM, pool5_dst_memory}, |
642 | {DNNL_ARG_TO, fc6_src_memory}}); |
643 | } |
644 | |
645 | auto fc6_weights_memory = fc6_user_weights_memory; |
646 | if (fc6_prim_desc.weights_desc() != fc6_user_weights_memory.get_desc()) { |
647 | fc6_weights_memory = memory(fc6_prim_desc.weights_desc(), eng); |
648 | reorder(fc6_user_weights_memory, fc6_weights_memory) |
649 | .execute(s, fc6_user_weights_memory, fc6_weights_memory); |
650 | } |
651 | |
652 | auto fc6_dst_memory = memory(fc6_prim_desc.dst_desc(), eng); |
653 | |
    // create inner product primitive and add it to net
655 | net.push_back(inner_product_forward(fc6_prim_desc)); |
656 | net_args.push_back({{DNNL_ARG_SRC, fc6_src_memory}, |
657 | {DNNL_ARG_WEIGHTS, fc6_weights_memory}, |
658 | {DNNL_ARG_BIAS, fc6_user_bias_memory}, |
659 | {DNNL_ARG_DST, fc6_dst_memory}}); |
660 | |
661 | // fc7 inner product {batch, 4096} (x) {4096, 4096}-> {batch, 4096} |
662 | memory::dims fc7_weights_tz = {4096, 4096}; |
663 | memory::dims fc7_bias_tz = {4096}; |
664 | memory::dims fc7_dst_tz = {batch, 4096}; |
665 | |
666 | std::vector<float> fc7_weights(product(fc7_weights_tz)); |
667 | std::vector<float> fc7_bias(product(fc7_bias_tz)); |
668 | |
669 | // create memory for user data |
670 | auto fc7_user_weights_memory |
671 | = memory({{fc7_weights_tz}, dt::f32, tag::nc}, eng); |
672 | write_to_dnnl_memory(fc7_weights.data(), fc7_user_weights_memory); |
673 | |
674 | auto fc7_user_bias_memory = memory({{fc7_bias_tz}, dt::f32, tag::x}, eng); |
675 | write_to_dnnl_memory(fc7_bias.data(), fc7_user_bias_memory); |
676 | |
    // create memory descriptors for the inner product data w/ no specified format
678 | auto fc7_bias_md = memory::desc({fc7_bias_tz}, dt::f32, tag::any); |
679 | auto fc7_weights_md = memory::desc({fc7_weights_tz}, dt::f32, tag::any); |
680 | auto fc7_dst_md = memory::desc({fc7_dst_tz}, dt::f32, tag::any); |
681 | |
    // create an inner product primitive descriptor
683 | auto fc7_prim_desc = inner_product_forward::primitive_desc(eng, |
684 | prop_kind::forward_inference, fc6_dst_memory.get_desc(), |
685 | fc7_weights_md, fc7_bias_md, fc7_dst_md); |
686 | |
687 | auto fc7_weights_memory = fc7_user_weights_memory; |
688 | if (fc7_prim_desc.weights_desc() != fc7_user_weights_memory.get_desc()) { |
689 | fc7_weights_memory = memory(fc7_prim_desc.weights_desc(), eng); |
690 | reorder(fc7_user_weights_memory, fc7_weights_memory) |
691 | .execute(s, fc7_user_weights_memory, fc7_weights_memory); |
692 | } |
693 | |
694 | auto fc7_dst_memory = memory(fc7_prim_desc.dst_desc(), eng); |
695 | |
    // create the fc7 inner product primitive and add it to net
697 | net.push_back(inner_product_forward(fc7_prim_desc)); |
698 | net_args.push_back({{DNNL_ARG_SRC, fc6_dst_memory}, |
699 | {DNNL_ARG_WEIGHTS, fc7_weights_memory}, |
700 | {DNNL_ARG_BIAS, fc7_user_bias_memory}, |
701 | {DNNL_ARG_DST, fc7_dst_memory}}); |
702 | |
703 | // fc8 inner product {batch, 4096} (x) {1000, 4096}-> {batch, 1000} |
704 | memory::dims fc8_weights_tz = {1000, 4096}; |
705 | memory::dims fc8_bias_tz = {1000}; |
706 | memory::dims fc8_dst_tz = {batch, 1000}; |
707 | |
708 | std::vector<float> fc8_weights(product(fc8_weights_tz)); |
709 | std::vector<float> fc8_bias(product(fc8_bias_tz)); |
710 | |
711 | // create memory for user data |
712 | auto fc8_user_weights_memory |
713 | = memory({{fc8_weights_tz}, dt::f32, tag::nc}, eng); |
714 | write_to_dnnl_memory(fc8_weights.data(), fc8_user_weights_memory); |
715 | auto fc8_user_bias_memory = memory({{fc8_bias_tz}, dt::f32, tag::x}, eng); |
716 | write_to_dnnl_memory(fc8_bias.data(), fc8_user_bias_memory); |
717 | auto user_dst_memory = memory({{fc8_dst_tz}, dt::f32, tag::nc}, eng); |
718 | write_to_dnnl_memory(user_dst.data(), user_dst_memory); |
719 | |
    // create memory descriptors for the inner product data w/ no specified format
721 | auto fc8_bias_md = memory::desc({fc8_bias_tz}, dt::f32, tag::any); |
722 | auto fc8_weights_md = memory::desc({fc8_weights_tz}, dt::f32, tag::any); |
723 | auto fc8_dst_md = memory::desc({fc8_dst_tz}, dt::f32, tag::any); |
724 | |
    // create an inner product primitive descriptor
726 | auto fc8_prim_desc = inner_product_forward::primitive_desc(eng, |
727 | prop_kind::forward_inference, fc7_dst_memory.get_desc(), |
728 | fc8_weights_md, fc8_bias_md, fc8_dst_md); |
729 | |
730 | auto fc8_weights_memory = fc8_user_weights_memory; |
731 | if (fc8_prim_desc.weights_desc() != fc8_user_weights_memory.get_desc()) { |
732 | fc8_weights_memory = memory(fc8_prim_desc.weights_desc(), eng); |
733 | reorder(fc8_user_weights_memory, fc8_weights_memory) |
734 | .execute(s, fc8_user_weights_memory, fc8_weights_memory); |
735 | } |
736 | |
737 | auto fc8_dst_memory = memory(fc8_prim_desc.dst_desc(), eng); |
738 | |
    // create the fc8 inner product primitive and add it to net
740 | net.push_back(inner_product_forward(fc8_prim_desc)); |
741 | net_args.push_back({{DNNL_ARG_SRC, fc7_dst_memory}, |
742 | {DNNL_ARG_WEIGHTS, fc8_weights_memory}, |
743 | {DNNL_ARG_BIAS, fc8_user_bias_memory}, |
744 | {DNNL_ARG_DST, fc8_dst_memory}}); |
745 | |
    // create reorder between internal and user data if it is needed and
    // add it to net after the fc8 inner product
748 | if (fc8_dst_memory != user_dst_memory) { |
749 | net.push_back(reorder(fc8_dst_memory, user_dst_memory)); |
750 | net_args.push_back({{DNNL_ARG_FROM, fc8_dst_memory}, |
751 | {DNNL_ARG_TO, user_dst_memory}}); |
752 | } |
753 | |
754 | /// @page cnn_inference_f32_cpp |
755 | /// Finally, execute the primitives. For this example, the net is executed |
756 | /// multiple times and each execution is timed individually. |
757 | /// @snippet cnn_inference_f32.cpp Execute model |
758 | //[Execute model] |
759 | for (int j = 0; j < times; ++j) { |
760 | assert(net.size() == net_args.size() && "something is missing" ); |
761 | for (size_t i = 0; i < net.size(); ++i) |
762 | net.at(i).execute(s, net_args.at(i)); |
763 | } |
764 | //[Execute model] |
765 | |
766 | s.wait(); |
767 | } |
768 | |
769 | void cnn_inference_f32(engine::kind engine_kind) { |
770 | auto begin = std::chrono::duration_cast<std::chrono::milliseconds>( |
771 | std::chrono::steady_clock::now().time_since_epoch()) |
772 | .count(); |
773 | int times = 100; |
774 | simple_net(engine_kind, times); |
775 | auto end = std::chrono::duration_cast<std::chrono::milliseconds>( |
776 | std::chrono::steady_clock::now().time_since_epoch()) |
777 | .count(); |
778 | std::cout << "Use time: " << (end - begin) / (times + 0.0) |
779 | << " ms per iteration." << std::endl; |
780 | } |
781 | |
782 | int main(int argc, char **argv) { |
783 | return handle_example_errors( |
784 | cnn_inference_f32, parse_engine_kind(argc, argv)); |
785 | } |
786 | |