/*******************************************************************************
* Copyright 2018-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example cnn_inference_int8.cpp
/// @copybrief cnn_inference_int8_cpp
/// > Annotated version: @ref cnn_inference_int8_cpp

/// @page cnn_inference_int8_cpp CNN int8 inference example
/// This C++ API example demonstrates how to run AlexNet's conv3 and relu3
/// with int8 data type.
///
/// > Example code: @ref cnn_inference_int8.cpp

#include <stdexcept>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

void simple_net_int8(engine::kind engine_kind) {
    using tag = memory::format_tag;
    using dt = memory::data_type;

    auto eng = engine(engine_kind, 0);
    stream s(eng);

    const int batch = 8;

    /// Configure tensor shapes
    /// @snippet cnn_inference_int8.cpp Configure tensor shapes
    //[Configure tensor shapes]
    // AlexNet: conv3
    // {batch, 256, 13, 13} (x) {384, 256, 3, 3}; -> {batch, 384, 13, 13}
    // strides: {1, 1}
    memory::dims conv_src_tz = {batch, 256, 13, 13};
    memory::dims conv_weights_tz = {384, 256, 3, 3};
    memory::dims conv_bias_tz = {384};
    memory::dims conv_dst_tz = {batch, 384, 13, 13};
    memory::dims conv_strides = {1, 1};
    memory::dims conv_padding = {1, 1};
    //[Configure tensor shapes]
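
    // With a 3x3 kernel, stride 1, and padding 1, the spatial size is
    // preserved: (13 + 2 * 1 - 3) / 1 + 1 = 13, matching conv_dst_tz.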

    /// Next, the example configures the scales used to quantize f32 data
    /// into int8. For this example, the scaling values are chosen as
    /// arbitrary numbers; in a realistic scenario, they should be calculated
    /// from calibration data collected ahead of time (for example, the
    /// observed ranges of each tensor).
    /// @snippet cnn_inference_int8.cpp Choose scaling factors
    //[Choose scaling factors]
    // Choose scaling factors for input, weights and output
    std::vector<float> src_scales = {1.8f};
    std::vector<float> weight_scales = {2.0f};
    std::vector<float> dst_scales = {0.55f};

    // Choose channel-wise scaling factors for convolution
    std::vector<float> conv_scales(384);
    const int scales_half = 384 / 2;
    std::fill(conv_scales.begin(), conv_scales.begin() + scales_half, 0.3f);
    std::fill(conv_scales.begin() + scales_half, conv_scales.end(), 0.8f);
    //[Choose scaling factors]
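
    // oneDNN's quantization convention is value_f32 = scale * value_int8
    // (plus a zero point when one is used), so quantizing via a reorder
    // divides the f32 data by the scale: with src_scales = {1.8f}, an f32
    // source value of 9.0f maps to the int8 value round(9.0 / 1.8) = 5.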

    /// The *source, weights, bias* and *destination* tensors use the
    /// single-scale format with the mask set to '0', while a per-channel
    /// array such as conv_scales would use the array format with mask = 2,
    /// corresponding to the channel dimension of the destination.
    /// @snippet cnn_inference_int8.cpp Set scaling mask
    //[Set scaling mask]
    const int src_mask = 0;
    const int weight_mask = 0;
    const int dst_mask = 0;
    //[Set scaling mask]
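
    // In a scales mask, bit i being set means the scale values vary along
    // tensor dimension i: mask 0 selects one common scale for the whole
    // tensor, while mask 1 << 1 = 2 would select per-channel scales for an
    // NCHW destination (dimension 1 is the channel dimension).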

    // Allocate input and output buffers for user data
    std::vector<float> user_src(batch * 256 * 13 * 13);
    std::vector<float> user_dst(batch * 384 * 13 * 13);

    // Allocate and fill buffers for weights and bias
    std::vector<float> conv_weights(product(conv_weights_tz));
    std::vector<float> conv_bias(product(conv_bias_tz));
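
    // Not part of the original example: fill the f32 source and weights with
    // simple synthetic values so the quantization below operates on
    // non-trivial data. A real application would load an actual input tensor
    // and trained weights here instead.
    for (size_t i = 0; i < user_src.size(); ++i)
        user_src[i] = static_cast<float>(i % 13) * 0.1f;
    for (size_t i = 0; i < conv_weights.size(); ++i)
        conv_weights[i] = static_cast<float>(i % 7) * 0.05f;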

    /// Create the memory primitives for user data (source, weights, and bias).
    /// The user data will be in its original 32-bit floating point format.
    /// @snippet cnn_inference_int8.cpp Allocate buffers
    //[Allocate buffers]
    auto user_src_memory = memory({{conv_src_tz}, dt::f32, tag::nchw}, eng);
    write_to_dnnl_memory(user_src.data(), user_src_memory);
    auto user_weights_memory
            = memory({{conv_weights_tz}, dt::f32, tag::oihw}, eng);
    write_to_dnnl_memory(conv_weights.data(), user_weights_memory);
    auto user_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng);
    write_to_dnnl_memory(conv_bias.data(), user_bias_memory);
    //[Allocate buffers]

    /// Create a memory descriptor for each convolution parameter.
    /// The convolution data uses 8-bit integer values, so the memory
    /// descriptors are configured as:
    ///
    /// * 8-bit unsigned (u8) for source and destination.
    /// * 8-bit signed (s8) for bias and weights.
    ///
    /// > **Note**
    /// > The destination type is chosen as *unsigned* because the
    /// > convolution applies a ReLU operation, so the output data is
    /// > always \f$\geq 0\f$.
    /// @snippet cnn_inference_int8.cpp Create convolution memory descriptors
    //[Create convolution memory descriptors]
    auto conv_src_md = memory::desc({conv_src_tz}, dt::u8, tag::any);
    auto conv_bias_md = memory::desc({conv_bias_tz}, dt::s8, tag::any);
    auto conv_weights_md = memory::desc({conv_weights_tz}, dt::s8, tag::any);
    auto conv_dst_md = memory::desc({conv_dst_tz}, dt::u8, tag::any);
    //[Create convolution memory descriptors]
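
    // format_tag::any lets the convolution primitive pick whichever memory
    // layout is fastest for this shape and engine; the reorders created
    // below convert the user's nchw/oihw f32 data into that layout while
    // quantizing it.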

    /// Configuring int8-specific parameters in an int8 primitive is done
    /// via the primitive attributes. Create an attributes object for the
    /// convolution and configure it accordingly.
    /// @snippet cnn_inference_int8.cpp Configure scaling
    //[Configure scaling]
    primitive_attr conv_attr;
    conv_attr.set_scales_mask(DNNL_ARG_SRC, src_mask);
    conv_attr.set_scales_mask(DNNL_ARG_WEIGHTS, weight_mask);
    conv_attr.set_scales_mask(DNNL_ARG_DST, dst_mask);

    // Prepare dst scales
    auto dst_scale_md = memory::desc({1}, dt::f32, tag::x);
    auto dst_scale_memory = memory(dst_scale_md, eng);
    write_to_dnnl_memory(dst_scales.data(), dst_scale_memory);
    //[Configure scaling]
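
    // Only the masks are baked into the primitive attributes; the actual
    // scale values live in small f32 memories (such as dst_scale_memory
    // above) and are supplied at execution time through the
    // DNNL_ARG_ATTR_SCALES | DNNL_ARG_* execution arguments.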

    /// The ReLU layer from AlexNet is executed through the PostOps feature. Create
    /// a PostOps object and configure it to execute an _eltwise relu_ operation.
    /// @snippet cnn_inference_int8.cpp Configure post-ops
    //[Configure post-ops]
    const float ops_alpha = 0.f; // relu negative slope
    const float ops_beta = 0.f;
    post_ops ops;
    ops.append_eltwise(algorithm::eltwise_relu, ops_alpha, ops_beta);
    conv_attr.set_post_ops(ops);
    //[Configure post-ops]
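
    // The post-op is fused into the convolution: ReLU is applied to the
    // accumulated result before it is quantized and stored to the u8
    // destination, which is why an unsigned destination type is safe here.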

    // check if int8 convolution is supported
    try {
        convolution_forward::primitive_desc(eng, prop_kind::forward,
                algorithm::convolution_direct, conv_src_md, conv_weights_md,
                conv_bias_md, conv_dst_md, conv_strides, conv_padding,
                conv_padding, conv_attr);
    } catch (error &e) {
        if (e.status == dnnl_unimplemented)
            throw example_allows_unimplemented {
                    "No int8 convolution implementation is available for this "
                    "platform.\n"
                    "Please refer to the developer guide for details."};

        // on any other error just re-throw
        throw;
    }

    /// Create a primitive descriptor passing the int8 memory descriptors
    /// and int8 attributes to the constructor. The primitive
    /// descriptor for the convolution will contain the specific memory
    /// formats for the computation.
    /// @snippet cnn_inference_int8.cpp Create convolution primitive descriptor
    //[Create convolution primitive descriptor]
    auto conv_prim_desc = convolution_forward::primitive_desc(eng,
            prop_kind::forward, algorithm::convolution_direct, conv_src_md,
            conv_weights_md, conv_bias_md, conv_dst_md, conv_strides,
            conv_padding, conv_padding, conv_attr);
    //[Create convolution primitive descriptor]

    /// Create a memory object for each of the convolution's data input
    /// parameters (source, bias, weights, and destination). Using the convolution
    /// primitive descriptor as the creation parameter enables oneDNN
    /// to configure the memory formats for the convolution.
    ///
    /// Scaling parameters are passed to the reorder primitive via the
    /// primitive attributes.
    ///
    /// User memory must be transformed into convolution-friendly memory
    /// (for int8 and memory format). A reorder layer performs the data
    /// transformation from f32 (the original user data) into int8 format
    /// (the data used for the convolution). In addition, the reorder
    /// transforms the user data into the required memory format (as explained
    /// in the simple_net example).
    ///
    /// @snippet cnn_inference_int8.cpp Quantize data and weights
    //[Quantize data and weights]
    auto conv_src_memory = memory(conv_prim_desc.src_desc(), eng);
    primitive_attr src_attr;
    src_attr.set_scales_mask(DNNL_ARG_DST, src_mask);
    auto src_scale_md = memory::desc({1}, dt::f32, tag::x);
    auto src_scale_memory = memory(src_scale_md, eng);
    write_to_dnnl_memory(src_scales.data(), src_scale_memory);
    auto src_reorder_pd
            = reorder::primitive_desc(eng, user_src_memory.get_desc(), eng,
                    conv_src_memory.get_desc(), src_attr);
    auto src_reorder = reorder(src_reorder_pd);
    src_reorder.execute(s,
            {{DNNL_ARG_FROM, user_src_memory}, {DNNL_ARG_TO, conv_src_memory},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, src_scale_memory}});

    auto conv_weights_memory = memory(conv_prim_desc.weights_desc(), eng);
    primitive_attr weight_attr;
    weight_attr.set_scales_mask(DNNL_ARG_DST, weight_mask);
    auto wei_scale_md = memory::desc({1}, dt::f32, tag::x);
    auto wei_scale_memory = memory(wei_scale_md, eng);
    write_to_dnnl_memory(weight_scales.data(), wei_scale_memory);
    auto weight_reorder_pd
            = reorder::primitive_desc(eng, user_weights_memory.get_desc(), eng,
                    conv_weights_memory.get_desc(), weight_attr);
    auto weight_reorder = reorder(weight_reorder_pd);
    weight_reorder.execute(s,
            {{DNNL_ARG_FROM, user_weights_memory},
                    {DNNL_ARG_TO, conv_weights_memory},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, wei_scale_memory}});

    auto conv_bias_memory = memory(conv_prim_desc.bias_desc(), eng);
    write_to_dnnl_memory(conv_bias.data(), conv_bias_memory);
    //[Quantize data and weights]

    auto conv_dst_memory = memory(conv_prim_desc.dst_desc(), eng);

    /// Create the convolution primitive and execute it. The int8 example
    /// computes the same Convolution + ReLU layers as the f32 AlexNet
    /// example, but using the int8 data types and the PostOps approach.
    /// Although performance is not measured here, in practice the int8
    /// version would require less computation time to achieve similar
    /// results.
    /// @snippet cnn_inference_int8.cpp Create convolution primitive
    //[Create convolution primitive]
    auto conv = convolution_forward(conv_prim_desc);
    conv.execute(s,
            {{DNNL_ARG_SRC, conv_src_memory},
                    {DNNL_ARG_WEIGHTS, conv_weights_memory},
                    {DNNL_ARG_BIAS, conv_bias_memory},
                    {DNNL_ARG_DST, conv_dst_memory},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale_memory},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scale_memory},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale_memory}});
    //[Create convolution primitive]

    /// @page cnn_inference_int8_cpp
    /// Finally, *dst memory* may be dequantized from int8 into the original
    /// f32 format. Create a memory object for the user data in the original
    /// 32-bit floating point format and then apply a reorder to transform the
    /// computation output data.
    /// @snippet cnn_inference_int8.cpp Dequantize the result
    //[Dequantize the result]
    auto user_dst_memory = memory({{conv_dst_tz}, dt::f32, tag::nchw}, eng);
    write_to_dnnl_memory(user_dst.data(), user_dst_memory);
    primitive_attr dst_attr;
    dst_attr.set_scales_mask(DNNL_ARG_SRC, dst_mask);
    auto dst_reorder_pd
            = reorder::primitive_desc(eng, conv_dst_memory.get_desc(), eng,
                    user_dst_memory.get_desc(), dst_attr);
    auto dst_reorder = reorder(dst_reorder_pd);
    dst_reorder.execute(s,
            {{DNNL_ARG_FROM, conv_dst_memory}, {DNNL_ARG_TO, user_dst_memory},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, dst_scale_memory}});
    //[Dequantize the result]

    s.wait();
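
    // Not in the original example: copy the dequantized f32 result back into
    // the user_dst buffer for inspection, assuming the read_from_dnnl_memory()
    // helper provided by example_utils.hpp.
    read_from_dnnl_memory(user_dst.data(), user_dst_memory);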
}

int main(int argc, char **argv) {
    return handle_example_errors(
            simple_net_int8, parse_engine_kind(argc, argv));
}