1 | /******************************************************************************* |
2 | * Copyright 2018-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cnn_inference_int8.cpp |
18 | /// @copybrief cnn_inference_int8_cpp |
19 | /// > Annotated version: @ref cnn_inference_int8_cpp |
20 | |
21 | /// @page cnn_inference_int8_cpp CNN int8 inference example |
/// This C++ API example demonstrates how to run AlexNet's conv3 and relu3
/// layers with the int8 data type.
24 | /// |
25 | /// > Example code: @ref cnn_inference_int8.cpp |
26 | |
#include <algorithm>
#include <stdexcept>
#include <vector>
28 | |
29 | #include "oneapi/dnnl/dnnl.hpp" |
30 | |
31 | #include "example_utils.hpp" |
32 | |
33 | using namespace dnnl; |
34 | |
35 | void simple_net_int8(engine::kind engine_kind) { |
36 | using tag = memory::format_tag; |
37 | using dt = memory::data_type; |
38 | |
39 | auto eng = engine(engine_kind, 0); |
40 | stream s(eng); |
41 | |
42 | const int batch = 8; |
43 | |
44 | /// Configure tensor shapes |
45 | /// @snippet cnn_inference_int8.cpp Configure tensor shapes |
46 | //[Configure tensor shapes] |
47 | // AlexNet: conv3 |
48 | // {batch, 256, 13, 13} (x) {384, 256, 3, 3}; -> {batch, 384, 13, 13} |
49 | // strides: {1, 1} |
50 | memory::dims conv_src_tz = {batch, 256, 13, 13}; |
51 | memory::dims conv_weights_tz = {384, 256, 3, 3}; |
52 | memory::dims conv_bias_tz = {384}; |
53 | memory::dims conv_dst_tz = {batch, 384, 13, 13}; |
54 | memory::dims conv_strides = {1, 1}; |
55 | memory::dims conv_padding = {1, 1}; |
56 | //[Configure tensor shapes] |
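
    // A quick check (illustrative arithmetic, not from the original example):
    // out = (in + 2 * pad - kernel) / stride + 1 = (13 + 2 * 1 - 3) / 1 + 1
    //     = 13, which matches the 13x13 spatial size of conv_dst_tz above.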
57 | |
    /// Next, the example configures the scales used to quantize f32 data
    /// into int8. For this example, the scaling values are chosen as
    /// arbitrary numbers; in a realistic scenario, they would be computed
    /// from values precomputed during a calibration step on representative
    /// data.
62 | /// @snippet cnn_inference_int8.cpp Choose scaling factors |
63 | //[Choose scaling factors] |
64 | // Choose scaling factors for input, weight and output |
65 | std::vector<float> src_scales = {1.8f}; |
66 | std::vector<float> weight_scales = {2.0f}; |
67 | std::vector<float> dst_scales = {0.55f}; |
68 | |
69 | // Choose channel-wise scaling factors for convolution |
70 | std::vector<float> conv_scales(384); |
71 | const int scales_half = 384 / 2; |
72 | std::fill(conv_scales.begin(), conv_scales.begin() + scales_half, 0.3f); |
    std::fill(conv_scales.begin() + scales_half, conv_scales.end(), 0.8f);
74 | //[Choose scaling factors] |
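
    // In a calibrated deployment the constants above would come from observed
    // data ranges rather than being hard-coded. A minimal sketch with a
    // hypothetical helper (not part of this example): for symmetric s8
    // quantization,
    //
    //     float scale_for_s8(float observed_max_abs) {
    //         return observed_max_abs / 127.f;
    //     }
    //
    // where observed_max_abs is gathered over a representative data set.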
75 | |
    /// The *source, weights* and *destination* tensors use the single-scale
    /// format with the mask set to '0'. The channel-wise scales for the
    /// convolution output (conv_scales) would instead use the array format
    /// with mask = 2, selecting the channel dimension of the destination.
80 | /// @snippet cnn_inference_int8.cpp Set scaling mask |
81 | //[Set scaling mask] |
82 | const int src_mask = 0; |
83 | const int weight_mask = 0; |
84 | const int dst_mask = 0; |
85 | //[Set scaling mask] |
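
    // The mask is a bit field over tensor dimensions: bit k set requests a
    // separate scale per element along dimension k. For example, mask = 0
    // means a single scale for the whole tensor, while mask = 1 << 1 = 2 on
    // an NCHW destination selects per-channel scales (dimension 1), which is
    // the layout conv_scales would use.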
86 | |
87 | // Allocate input and output buffers for user data |
88 | std::vector<float> user_src(batch * 256 * 13 * 13); |
89 | std::vector<float> user_dst(batch * 384 * 13 * 13); |
90 | |
91 | // Allocate and fill buffers for weights and bias |
92 | std::vector<float> conv_weights(product(conv_weights_tz)); |
93 | std::vector<float> conv_bias(product(conv_bias_tz)); |
94 | |
    /// Create the memory objects for the user data (source, weights, and bias).
96 | /// The user data will be in its original 32-bit floating point format. |
97 | /// @snippet cnn_inference_int8.cpp Allocate buffers |
98 | //[Allocate buffers] |
99 | auto user_src_memory = memory({{conv_src_tz}, dt::f32, tag::nchw}, eng); |
100 | write_to_dnnl_memory(user_src.data(), user_src_memory); |
101 | auto user_weights_memory |
102 | = memory({{conv_weights_tz}, dt::f32, tag::oihw}, eng); |
103 | write_to_dnnl_memory(conv_weights.data(), user_weights_memory); |
104 | auto user_bias_memory = memory({{conv_bias_tz}, dt::f32, tag::x}, eng); |
105 | write_to_dnnl_memory(conv_bias.data(), user_bias_memory); |
106 | //[Allocate buffers] |
107 | |
108 | /// Create a memory descriptor for each convolution parameter. |
109 | /// The convolution data uses 8-bit integer values, so the memory |
110 | /// descriptors are configured as: |
111 | /// |
112 | /// * 8-bit unsigned (u8) for source and destination. |
113 | /// * 8-bit signed (s8) for bias and weights. |
114 | /// |
    /// > **Note**
    /// > The destination type is chosen as *unsigned* because the
    /// > convolution applies a fused ReLU, so all results are \f$\geq 0\f$.
118 | /// @snippet cnn_inference_int8.cpp Create convolution memory descriptors |
119 | //[Create convolution memory descriptors] |
120 | auto conv_src_md = memory::desc({conv_src_tz}, dt::u8, tag::any); |
121 | auto conv_bias_md = memory::desc({conv_bias_tz}, dt::s8, tag::any); |
122 | auto conv_weights_md = memory::desc({conv_weights_tz}, dt::s8, tag::any); |
123 | auto conv_dst_md = memory::desc({conv_dst_tz}, dt::u8, tag::any); |
124 | //[Create convolution memory descriptors] |
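
    // tag::any leaves the layout unspecified so the convolution primitive
    // descriptor created below can choose the most efficient format for this
    // engine; the user data is then reordered into that format.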
125 | |
    /// Configuring int8-specific parameters of an int8 primitive is done
    /// via primitive attributes (dnnl::primitive_attr). Create an attributes
    /// object for the convolution and configure it accordingly.
129 | /// @snippet cnn_inference_int8.cpp Configure scaling |
130 | //[Configure scaling] |
131 | primitive_attr conv_attr; |
132 | conv_attr.set_scales_mask(DNNL_ARG_SRC, src_mask); |
133 | conv_attr.set_scales_mask(DNNL_ARG_WEIGHTS, weight_mask); |
134 | conv_attr.set_scales_mask(DNNL_ARG_DST, dst_mask); |
135 | |
136 | // Prepare dst scales |
137 | auto dst_scale_md = memory::desc({1}, dt::f32, tag::x); |
138 | auto dst_scale_memory = memory(dst_scale_md, eng); |
139 | write_to_dnnl_memory(dst_scales.data(), dst_scale_memory); |
140 | //[Configure scaling] |
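
    // Note that the attributes only declare the scaling masks; the scale
    // values themselves are runtime parameters supplied at execution time as
    // memories bound to DNNL_ARG_ATTR_SCALES | DNNL_ARG_{SRC,WEIGHTS,DST},
    // as done below.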
141 | |
    /// The ReLU layer from AlexNet is executed through the PostOps feature. Create
    /// a PostOps object and configure it to execute an _eltwise relu_ operation.
144 | /// @snippet cnn_inference_int8.cpp Configure post-ops |
145 | //[Configure post-ops] |
146 | const float ops_alpha = 0.f; // relu negative slope |
147 | const float ops_beta = 0.f; |
148 | post_ops ops; |
149 | ops.append_eltwise(algorithm::eltwise_relu, ops_alpha, ops_beta); |
150 | conv_attr.set_post_ops(ops); |
151 | //[Configure post-ops] |
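
    // With alpha = 0 the fused eltwise computes max(conv_result, 0), i.e. a
    // standard ReLU, applied to the f32 intermediate before the result is
    // quantized to u8.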
152 | |
    // Check whether an int8 convolution implementation is available
154 | try { |
155 | convolution_forward::primitive_desc(eng, prop_kind::forward, |
156 | algorithm::convolution_direct, conv_src_md, conv_weights_md, |
157 | conv_bias_md, conv_dst_md, conv_strides, conv_padding, |
158 | conv_padding, conv_attr); |
159 | } catch (error &e) { |
160 | if (e.status == dnnl_unimplemented) |
161 | throw example_allows_unimplemented { |
162 | "No int8 convolution implementation is available for this " |
163 | "platform.\n" |
164 | "Please refer to the developer guide for details." }; |
165 | |
166 | // on any other error just re-throw |
167 | throw; |
168 | } |
169 | |
170 | /// Create a primitive descriptor passing the int8 memory descriptors |
171 | /// and int8 attributes to the constructor. The primitive |
172 | /// descriptor for the convolution will contain the specific memory |
173 | /// formats for the computation. |
174 | /// @snippet cnn_inference_int8.cpp Create convolution primitive descriptor |
175 | //[Create convolution primitive descriptor] |
176 | auto conv_prim_desc = convolution_forward::primitive_desc(eng, |
177 | prop_kind::forward, algorithm::convolution_direct, conv_src_md, |
178 | conv_weights_md, conv_bias_md, conv_dst_md, conv_strides, |
179 | conv_padding, conv_padding, conv_attr); |
180 | //[Create convolution primitive descriptor] |
181 | |
    /// Create a memory object for each of the convolution's parameters
    /// (source, bias, weights, and destination). Using the convolution
184 | /// primitive descriptor as the creation parameter enables oneDNN |
185 | /// to configure the memory formats for the convolution. |
186 | /// |
187 | /// Scaling parameters are passed to the reorder primitive via the attributes |
188 | /// primitive. |
189 | /// |
190 | /// User memory must be transformed into convolution-friendly memory |
191 | /// (for int8 and memory format). A reorder layer performs the data |
192 | /// transformation from f32 (the original user data) into int8 format |
193 | /// (the data used for the convolution). In addition, the reorder |
194 | /// transforms the user data into the required memory format (as explained |
195 | /// in the simple_net example). |
196 | /// |
197 | /// @snippet cnn_inference_int8.cpp Quantize data and weights |
198 | //[Quantize data and weights] |
199 | auto conv_src_memory = memory(conv_prim_desc.src_desc(), eng); |
200 | primitive_attr src_attr; |
201 | src_attr.set_scales_mask(DNNL_ARG_DST, src_mask); |
202 | auto src_scale_md = memory::desc({1}, dt::f32, tag::x); |
203 | auto src_scale_memory = memory(src_scale_md, eng); |
204 | write_to_dnnl_memory(src_scales.data(), src_scale_memory); |
205 | auto src_reorder_pd |
206 | = reorder::primitive_desc(eng, user_src_memory.get_desc(), eng, |
207 | conv_src_memory.get_desc(), src_attr); |
208 | auto src_reorder = reorder(src_reorder_pd); |
209 | src_reorder.execute(s, |
210 | {{DNNL_ARG_FROM, user_src_memory}, {DNNL_ARG_TO, conv_src_memory}, |
211 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, src_scale_memory}}); |
212 | |
213 | auto conv_weights_memory = memory(conv_prim_desc.weights_desc(), eng); |
214 | primitive_attr weight_attr; |
215 | weight_attr.set_scales_mask(DNNL_ARG_DST, weight_mask); |
216 | auto wei_scale_md = memory::desc({1}, dt::f32, tag::x); |
217 | auto wei_scale_memory = memory(wei_scale_md, eng); |
218 | write_to_dnnl_memory(weight_scales.data(), wei_scale_memory); |
219 | auto weight_reorder_pd |
220 | = reorder::primitive_desc(eng, user_weights_memory.get_desc(), eng, |
221 | conv_weights_memory.get_desc(), weight_attr); |
222 | auto weight_reorder = reorder(weight_reorder_pd); |
223 | weight_reorder.execute(s, |
224 | {{DNNL_ARG_FROM, user_weights_memory}, |
225 | {DNNL_ARG_TO, conv_weights_memory}, |
226 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, wei_scale_memory}}); |
227 | |
    // The example bias is zero-initialized, so its f32 bytes copy over to
    // the s8 bias memory as zeros; real bias data would need a proper
    // quantizing reorder instead of a raw copy.
    auto conv_bias_memory = memory(conv_prim_desc.bias_desc(), eng);
    write_to_dnnl_memory(conv_bias.data(), conv_bias_memory);
230 | //[Quantize data and weights] |
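
    // These reorders attach the scale to their destination, so each computes
    // roughly int8_value = f32_value / scale; with the same scales on the
    // convolution, it can reconstruct f32_value ~= scale * int8_value.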
231 | |
232 | auto conv_dst_memory = memory(conv_prim_desc.dst_desc(), eng); |
233 | |
    /// Create the convolution primitive and execute it. The int8 example
    /// computes the same Convolution + ReLU layers from the AlexNet
    /// simple-net.cpp example using the int8 and PostOps approach. Although
    /// performance is not measured here, in practice the int8 version would
    /// typically require less computation time to achieve similar results.
239 | /// @snippet cnn_inference_int8.cpp Create convolution primitive |
240 | //[Create convolution primitive] |
241 | auto conv = convolution_forward(conv_prim_desc); |
242 | conv.execute(s, |
243 | {{DNNL_ARG_SRC, conv_src_memory}, |
244 | {DNNL_ARG_WEIGHTS, conv_weights_memory}, |
245 | {DNNL_ARG_BIAS, conv_bias_memory}, |
246 | {DNNL_ARG_DST, conv_dst_memory}, |
247 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scale_memory}, |
248 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scale_memory}, |
249 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale_memory}}); |
250 | //[Create convolution primitive] |
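
    // Putting the scales together, the execution above computes approximately
    //     conv_dst_u8 = relu(conv(src_u8 * 1.8f, wei_s8 * 2.0f) + bias) / 0.55f,
    // i.e. the inputs are dequantized, the f32 result passes through the ReLU
    // post-op, and the dst scale quantizes it back to u8.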
251 | |
252 | /// @page cnn_inference_int8_cpp |
    /// Finally, *dst memory* may be dequantized from int8 into the original
    /// f32 format. Create a memory object for the user data in the original
    /// 32-bit floating point format and then apply a reorder to transform the
    /// computation output data.
    /// @snippet cnn_inference_int8.cpp Dequantize the result
    //[Dequantize the result]
259 | auto user_dst_memory = memory({{conv_dst_tz}, dt::f32, tag::nchw}, eng); |
260 | write_to_dnnl_memory(user_dst.data(), user_dst_memory); |
261 | primitive_attr dst_attr; |
262 | dst_attr.set_scales_mask(DNNL_ARG_SRC, dst_mask); |
263 | auto dst_reorder_pd |
264 | = reorder::primitive_desc(eng, conv_dst_memory.get_desc(), eng, |
265 | user_dst_memory.get_desc(), dst_attr); |
266 | auto dst_reorder = reorder(dst_reorder_pd); |
267 | dst_reorder.execute(s, |
268 | {{DNNL_ARG_FROM, conv_dst_memory}, {DNNL_ARG_TO, user_dst_memory}, |
269 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, dst_scale_memory}}); |
270 | //[Dequantize the result] |
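
    // Here the scale is attached to the reorder source, so the output is
    // user_dst_f32 ~= 0.55f * conv_dst_u8, undoing the quantization applied
    // by the convolution's dst scale.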
271 | |
272 | s.wait(); |
273 | } |
274 | |
275 | int main(int argc, char **argv) { |
276 | return handle_example_errors( |
277 | simple_net_int8, parse_engine_kind(argc, argv)); |
278 | } |
279 | |