1 | /******************************************************************************* |
2 | * Copyright 2018-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cpu_rnn_inference_int8.cpp |
18 | /// @copybrief cpu_rnn_inference_int8_cpp |
19 | /// > Annotated version: @ref cpu_rnn_inference_int8_cpp |
20 | |
21 | /// @page cpu_rnn_inference_int8_cpp RNN int8 inference example |
22 | /// This C++ API example demonstrates how to build GNMT model inference. |
23 | /// |
24 | /// > Example code: @ref cpu_rnn_inference_int8.cpp |
25 | /// |
26 | /// For the encoder we use: |
27 | /// - one primitive for the bidirectional layer of the encoder |
28 | /// - one primitive for all remaining unidirectional layers in the encoder |
29 | /// For the decoder we use: |
30 | /// - one primitive for the first iteration |
31 | /// - one primitive for all subsequent iterations in the decoder. Note that |
32 | /// in this example, this primitive computes the states in place. |
33 | /// - the attention mechanism is implemented separately as there is no support |
34 | /// for the context vectors in oneDNN yet |
35 | |
36 | #include <assert.h> |
37 | |
38 | #include <cstring> |
39 | #include <iostream> |
40 | #include <math.h> |
41 | #include <numeric> |
42 | #include <string> |
43 | |
44 | #include "oneapi/dnnl/dnnl.hpp" |
45 | |
46 | #include "example_utils.hpp" |
47 | |
48 | using namespace dnnl; |
49 | |
using dim_t = dnnl::memory::dim;

// Model hyper-parameters (sizes reduced compared to the full GNMT model).
const dim_t batch = 32;
const dim_t src_seq_length_max = 10;
const dim_t tgt_seq_length_max = 10;

const dim_t feature_size = 256;

// Layer counts: one bidirectional encoder layer, three unidirectional
// encoder layers, and four decoder layers.
const dim_t enc_bidir_n_layers = 1;
const dim_t enc_unidir_n_layers = 3;
const dim_t dec_n_layers = 4;

// An LSTM cell has four gates.
const int lstm_n_gates = 4;

// Scratch buffers reused by compute_attention() on every decoder iteration.
std::vector<int32_t> weighted_src_layer(batch *feature_size, 1);
std::vector<float> alignment_model(
        src_seq_length_max *batch *feature_size, 1.0f);
std::vector<float> alignments(src_seq_length_max *batch, 1.0f);
std::vector<float> exp_sums(batch, 1.0f);
69 | |
70 | void compute_weighted_annotations(float *weighted_annotations, |
71 | dim_t src_seq_length_max, dim_t batch, dim_t feature_size, |
72 | float *weights_annot, float *annotations) { |
73 | // annotations(aka enc_dst_layer) is (t, n, 2c) |
74 | // weights_annot is (2c, c) |
75 | |
76 | dim_t num_weighted_annotations = src_seq_length_max * batch; |
77 | // annotation[i] = GEMM(weights_annot, enc_dst_layer[i]); |
78 | dnnl_sgemm('N', 'N', num_weighted_annotations, feature_size, feature_size, |
79 | 1.f, annotations, feature_size, weights_annot, feature_size, 0.f, |
80 | weighted_annotations, feature_size); |
81 | } |
82 | |
83 | void compute_sum_of_rows( |
84 | int8_t *a, dim_t rows, dim_t cols, int32_t *a_reduced) { |
85 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1) |
86 | for (dim_t i = 0; i < cols; i++) { |
87 | a_reduced[i] = 0; |
88 | for (dim_t j = 0; j < rows; j++) { |
89 | a_reduced[i] += (int32_t)a[i * rows + j]; |
90 | } |
91 | } |
92 | } |
93 | |
94 | void compute_attention(float *context_vectors, dim_t src_seq_length_max, |
95 | dim_t batch, dim_t feature_size, int8_t *weights_src_layer, |
96 | float weights_src_layer_scale, int32_t *compensation, |
97 | uint8_t *dec_src_layer, float dec_src_layer_scale, |
98 | float dec_src_layer_shift, uint8_t *annotations, |
99 | float *weighted_annotations, float *weights_alignments) { |
100 | // dst_iter : (n, c) matrix |
101 | // src_layer: (n, c) matrix |
102 | // weighted_annotations (t, n, c) |
103 | |
104 | // weights_yi is (c, c) |
105 | // weights_ai is (c, 1) |
106 | // tmp[i] is (n, c) |
107 | // a[i] is (n, 1) |
108 | // p is (n, 1) |
109 | |
110 | // first we precompute the weighted_dec_src_layer |
111 | int32_t co = 0; |
112 | dnnl_gemm_u8s8s32('N', 'N', 'F', batch, feature_size, feature_size, 1.f, |
113 | dec_src_layer, feature_size, 0, weights_src_layer, feature_size, 0, |
114 | 0.f, weighted_src_layer.data(), feature_size, &co); |
115 | |
116 | // then we compute the alignment model |
117 | float *alignment_model_ptr = alignment_model.data(); |
118 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) |
119 | for (dim_t i = 0; i < src_seq_length_max; i++) { |
120 | for (dim_t j = 0; j < batch; j++) { |
121 | for (dim_t k = 0; k < feature_size; k++) { |
122 | size_t tnc_offset |
123 | = i * batch * feature_size + j * feature_size + k; |
124 | alignment_model_ptr[tnc_offset] |
125 | = tanhf((float)(weighted_src_layer[j * feature_size + k] |
126 | - dec_src_layer_shift * compensation[k]) |
127 | / (dec_src_layer_scale |
128 | * weights_src_layer_scale) |
129 | + weighted_annotations[tnc_offset]); |
130 | } |
131 | } |
132 | } |
133 | |
134 | // gemv with alignments weights. the resulting alignments are in alignments |
135 | dim_t num_weighted_annotations = src_seq_length_max * batch; |
136 | dnnl_sgemm('N', 'N', num_weighted_annotations, 1, feature_size, 1.f, |
137 | alignment_model_ptr, feature_size, weights_alignments, 1, 0.f, |
138 | alignments.data(), 1); |
139 | |
140 | // softmax on alignments. the resulting context weights are in alignments |
141 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1) |
142 | for (dim_t i = 0; i < batch; i++) |
143 | exp_sums[i] = 0.0f; |
144 | |
145 | // For each batch j, in the expression: exp(A_i) / \sum_i exp(A_i) |
146 | // we calculate max_idx t so that A_i <= A_t and calculate the expression as |
147 | // exp(A_i - A_t) / \sum_i exp(A_i - A_t) |
148 | // which mitigates the overflow errors |
149 | std::vector<dim_t> max_idx(batch, 0); |
150 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1) |
151 | for (dim_t j = 0; j < batch; j++) { |
152 | for (dim_t i = 1; i < src_seq_length_max; i++) { |
153 | if (alignments[i * batch + j] > alignments[(i - 1) * batch + j]) |
154 | max_idx[j] = i; |
155 | } |
156 | } |
157 | |
158 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1) |
159 | for (dim_t j = 0; j < batch; j++) { |
160 | auto max_idx_val = alignments[max_idx[j] * batch + j]; |
161 | for (dim_t i = 0; i < src_seq_length_max; i++) { |
162 | alignments[i * batch + j] -= max_idx_val; |
163 | alignments[i * batch + j] = expf(alignments[i * batch + j]); |
164 | exp_sums[j] += alignments[i * batch + j]; |
165 | } |
166 | } |
167 | |
168 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) |
169 | for (dim_t i = 0; i < src_seq_length_max; i++) |
170 | for (dim_t j = 0; j < batch; j++) |
171 | alignments[i * batch + j] /= exp_sums[j]; |
172 | |
173 | // then we compute the context vectors |
174 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) |
175 | for (dim_t i = 0; i < batch; i++) |
176 | for (dim_t j = 0; j < feature_size; j++) |
177 | context_vectors[i * (feature_size + feature_size) + feature_size |
178 | + j] |
179 | = 0.0f; |
180 | |
181 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2) |
182 | for (dim_t i = 0; i < batch; i++) |
183 | for (dim_t j = 0; j < feature_size; j++) |
184 | for (dim_t k = 0; k < src_seq_length_max; k++) |
185 | context_vectors[i * (feature_size + feature_size) + feature_size |
186 | + j] |
187 | += alignments[k * batch + i] |
188 | * (((float)annotations[j |
189 | + feature_size * (i + batch * k)] |
190 | - dec_src_layer_shift) |
191 | / dec_src_layer_scale); |
192 | } |
193 | |
194 | void copy_context( |
195 | float *src_iter, dim_t n_layers, dim_t batch, dim_t feature_size) { |
196 | // we copy the context from the first layer to all other layers |
197 | PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(3) |
198 | for (dim_t k = 1; k < n_layers; k++) |
199 | for (dim_t j = 0; j < batch; j++) |
200 | for (dim_t i = 0; i < feature_size; i++) |
201 | src_iter[(k * batch + j) * (feature_size + feature_size) |
202 | + feature_size + i] |
203 | = src_iter[j * (feature_size + feature_size) |
204 | + feature_size + i]; |
205 | } |
206 | |
207 | void simple_net() { |
208 | /// |
209 | /// Initialize a CPU engine and stream. The last parameter in the call represents |
210 | /// the index of the engine. |
211 | /// @snippet cpu_rnn_inference_int8.cpp Initialize engine and stream |
212 | /// |
213 | //[Initialize engine and stream] |
214 | auto cpu_engine = engine(engine::kind::cpu, 0); |
215 | stream s(cpu_engine); |
216 | //[Initialize engine and stream] |
217 | |
218 | /// |
219 | /// Declare encoder net and decoder net |
220 | /// @snippet cpu_rnn_inference_int8.cpp declare net |
221 | /// |
222 | //[declare net] |
223 | std::vector<primitive> encoder_net, decoder_net; |
224 | std::vector<std::unordered_map<int, memory>> encoder_net_args, |
225 | decoder_net_args; |
226 | |
227 | std::vector<float> net_src(batch * src_seq_length_max * feature_size, 0.1f); |
228 | std::vector<float> net_dst(batch * tgt_seq_length_max * feature_size, 0.1f); |
229 | //[declare net] |
230 | |
231 | // Quantization factors for f32 data |
232 | |
233 | /// |
234 | /// Quantization factors for f32 data |
235 | /// @snippet cpu_rnn_inference_int8.cpp quantize |
236 | /// |
237 | const float data_shift = 64.; |
238 | const float data_scale = 63.; |
239 | const int weights_scale_mask = 0 |
240 | + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` |
241 | + (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` |
242 | //[quantize] |
243 | std::vector<float> weights_scales(lstm_n_gates * feature_size); |
244 | // assign halves of vector with arbitrary values |
245 | const dim_t scales_half = lstm_n_gates * feature_size / 2; |
246 | std::fill( |
247 | weights_scales.begin(), weights_scales.begin() + scales_half, 30.f); |
248 | std::fill( |
249 | weights_scales.begin() + scales_half, weights_scales.end(), 65.5f); |
250 | //[quantize] |
251 | |
252 | /// |
253 | /// **Encoder** |
254 | /// |
255 | /// |
256 | /// Initialize Encoder Memory |
257 | /// @snippet cpu_rnn_inference_int8.cpp Initialize encoder memory |
258 | /// |
259 | //[Initialize encoder memory] |
260 | memory::dims enc_bidir_src_layer_tz |
261 | = {src_seq_length_max, batch, feature_size}; |
262 | memory::dims enc_bidir_weights_layer_tz |
263 | = {enc_bidir_n_layers, 2, feature_size, lstm_n_gates, feature_size}; |
264 | memory::dims enc_bidir_weights_iter_tz |
265 | = {enc_bidir_n_layers, 2, feature_size, lstm_n_gates, feature_size}; |
266 | memory::dims enc_bidir_bias_tz |
267 | = {enc_bidir_n_layers, 2, lstm_n_gates, feature_size}; |
268 | memory::dims enc_bidir_dst_layer_tz |
269 | = {src_seq_length_max, batch, 2 * feature_size}; |
270 | |
271 | //[Initialize encoder memory] |
272 | |
273 | /// |
274 | /// |
275 | /// Encoder: 1 bidirectional layer and 7 unidirectional layers |
276 | /// |
277 | |
278 | std::vector<float> user_enc_bidir_wei_layer( |
279 | enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size, |
280 | 0.3f); |
281 | std::vector<float> user_enc_bidir_wei_iter( |
282 | enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size, |
283 | 0.2f); |
284 | std::vector<float> user_enc_bidir_bias( |
285 | enc_bidir_n_layers * 2 * lstm_n_gates * feature_size, 1.0f); |
286 | |
287 | /// |
288 | /// Create the memory for user data |
289 | /// @snippet cpu_rnn_inference_int8.cpp data memory creation |
290 | /// |
291 | //[data memory creation] |
292 | auto user_enc_bidir_src_layer_md = memory::desc({enc_bidir_src_layer_tz}, |
293 | memory::data_type::f32, memory::format_tag::tnc); |
294 | |
295 | auto user_enc_bidir_wei_layer_md |
296 | = memory::desc({enc_bidir_weights_layer_tz}, memory::data_type::f32, |
297 | memory::format_tag::ldigo); |
298 | |
299 | auto user_enc_bidir_wei_iter_md = memory::desc({enc_bidir_weights_iter_tz}, |
300 | memory::data_type::f32, memory::format_tag::ldigo); |
301 | |
302 | auto user_enc_bidir_bias_md = memory::desc({enc_bidir_bias_tz}, |
303 | memory::data_type::f32, memory::format_tag::ldgo); |
304 | |
305 | auto user_enc_bidir_src_layer_memory |
306 | = memory(user_enc_bidir_src_layer_md, cpu_engine, net_src.data()); |
307 | auto user_enc_bidir_wei_layer_memory = memory(user_enc_bidir_wei_layer_md, |
308 | cpu_engine, user_enc_bidir_wei_layer.data()); |
309 | auto user_enc_bidir_wei_iter_memory = memory(user_enc_bidir_wei_iter_md, |
310 | cpu_engine, user_enc_bidir_wei_iter.data()); |
311 | auto user_enc_bidir_bias_memory = memory( |
312 | user_enc_bidir_bias_md, cpu_engine, user_enc_bidir_bias.data()); |
313 | //[data memory creation] |
314 | |
315 | /// |
316 | /// Create memory descriptors for RNN data w/o specified layout |
317 | /// @snippet cpu_rnn_inference_int8.cpp memory desc for RNN data |
318 | /// |
319 | //[memory desc for RNN data] |
320 | auto enc_bidir_src_layer_md = memory::desc({enc_bidir_src_layer_tz}, |
321 | memory::data_type::u8, memory::format_tag::any); |
322 | |
323 | auto enc_bidir_wei_layer_md = memory::desc({enc_bidir_weights_layer_tz}, |
324 | memory::data_type::s8, memory::format_tag::any); |
325 | |
326 | auto enc_bidir_wei_iter_md = memory::desc({enc_bidir_weights_iter_tz}, |
327 | memory::data_type::s8, memory::format_tag::any); |
328 | |
329 | auto enc_bidir_dst_layer_md = memory::desc({enc_bidir_dst_layer_tz}, |
330 | memory::data_type::u8, memory::format_tag::any); |
331 | //[memory desc for RNN data] |
332 | |
333 | /// |
334 | /// Create bidirectional RNN |
335 | |
336 | /// |
337 | /// Define RNN attributes that store quantization parameters |
338 | /// @snippet cpu_rnn_inference_int8.cpp RNN attri |
339 | /// |
340 | //[RNN attri] |
341 | primitive_attr attr; |
342 | attr.set_rnn_data_qparams(data_scale, data_shift); |
343 | attr.set_rnn_weights_qparams(weights_scale_mask, weights_scales); |
344 | |
345 | // check if int8 LSTM is supported |
346 | lstm_forward::primitive_desc enc_bidir_prim_desc; |
347 | try { |
348 | enc_bidir_prim_desc = lstm_forward::primitive_desc(cpu_engine, |
349 | prop_kind::forward_inference, |
350 | rnn_direction::bidirectional_concat, enc_bidir_src_layer_md, |
351 | memory::desc(), memory::desc(), enc_bidir_wei_layer_md, |
352 | enc_bidir_wei_iter_md, user_enc_bidir_bias_md, |
353 | enc_bidir_dst_layer_md, memory::desc(), memory::desc(), attr); |
354 | } catch (error &e) { |
355 | if (e.status == dnnl_unimplemented) |
356 | throw example_allows_unimplemented { |
357 | "No int8 LSTM implementation is available for this " |
358 | "platform.\n" |
359 | "Please refer to the developer guide for details." }; |
360 | |
361 | // on any other error just re-throw |
362 | throw; |
363 | } |
364 | |
365 | //[RNN attri] |
366 | |
367 | /// |
368 | /// Create memory for input data and use reorders to quantize values to int8 |
369 | /// NOTE: same attributes are used when creating RNN primitive and reorders |
370 | /// @snippet cpu_rnn_inference_int8.cpp reorder input data |
371 | /// |
372 | //[reorder input data] |
373 | auto enc_bidir_src_layer_memory |
374 | = memory(enc_bidir_prim_desc.src_layer_desc(), cpu_engine); |
375 | auto enc_bidir_src_layer_reorder_pd = reorder::primitive_desc( |
376 | user_enc_bidir_src_layer_memory, enc_bidir_src_layer_memory, attr); |
377 | encoder_net.push_back(reorder(enc_bidir_src_layer_reorder_pd)); |
378 | encoder_net_args.push_back( |
379 | {{DNNL_ARG_FROM, user_enc_bidir_src_layer_memory}, |
380 | {DNNL_ARG_TO, enc_bidir_src_layer_memory}}); |
381 | //[reorder input data] |
382 | |
383 | auto enc_bidir_wei_layer_memory |
384 | = memory(enc_bidir_prim_desc.weights_layer_desc(), cpu_engine); |
385 | auto enc_bidir_wei_layer_reorder_pd = reorder::primitive_desc( |
386 | user_enc_bidir_wei_layer_memory, enc_bidir_wei_layer_memory, attr); |
387 | reorder(enc_bidir_wei_layer_reorder_pd) |
388 | .execute(s, user_enc_bidir_wei_layer_memory, |
389 | enc_bidir_wei_layer_memory); |
390 | |
391 | auto enc_bidir_wei_iter_memory |
392 | = memory(enc_bidir_prim_desc.weights_iter_desc(), cpu_engine); |
393 | auto enc_bidir_wei_iter_reorder_pd = reorder::primitive_desc( |
394 | user_enc_bidir_wei_iter_memory, enc_bidir_wei_iter_memory, attr); |
395 | reorder(enc_bidir_wei_iter_reorder_pd) |
396 | .execute(s, user_enc_bidir_wei_iter_memory, |
397 | enc_bidir_wei_iter_memory); |
398 | |
399 | auto enc_bidir_dst_layer_memory |
400 | = memory(enc_bidir_prim_desc.dst_layer_desc(), cpu_engine); |
401 | |
402 | /// |
403 | /// Encoder : add the bidirectional rnn primitive with related arguments into encoder_net |
404 | /// @snippet cpu_rnn_inference_int8.cpp push bi rnn to encoder net |
405 | /// |
406 | //[push bi rnn to encoder net] |
407 | encoder_net.push_back(lstm_forward(enc_bidir_prim_desc)); |
408 | encoder_net_args.push_back( |
409 | {{DNNL_ARG_SRC_LAYER, enc_bidir_src_layer_memory}, |
410 | {DNNL_ARG_WEIGHTS_LAYER, enc_bidir_wei_layer_memory}, |
411 | {DNNL_ARG_WEIGHTS_ITER, enc_bidir_wei_iter_memory}, |
412 | {DNNL_ARG_BIAS, user_enc_bidir_bias_memory}, |
413 | {DNNL_ARG_DST_LAYER, enc_bidir_dst_layer_memory}}); |
414 | //[push bi rnn to encoder net] |
415 | |
416 | /// |
417 | /// Encoder: unidirectional layers |
418 | /// |
419 | /// |
    /// First unidirectional layer scales 2 * feature_size output of bidirectional
421 | /// layer to feature_size output |
422 | /// @snippet cpu_rnn_inference_int8.cpp first uni layer |
423 | /// |
424 | //[first uni layer] |
425 | std::vector<float> user_enc_uni_first_wei_layer( |
426 | 1 * 1 * 2 * feature_size * lstm_n_gates * feature_size, 0.3f); |
427 | std::vector<float> user_enc_uni_first_wei_iter( |
428 | 1 * 1 * feature_size * lstm_n_gates * feature_size, 0.2f); |
429 | std::vector<float> user_enc_uni_first_bias( |
430 | 1 * 1 * lstm_n_gates * feature_size, 1.0f); |
431 | //[first uni layer] |
432 | |
433 | memory::dims user_enc_uni_first_wei_layer_dims |
434 | = {1, 1, 2 * feature_size, lstm_n_gates, feature_size}; |
435 | memory::dims user_enc_uni_first_wei_iter_dims |
436 | = {1, 1, feature_size, lstm_n_gates, feature_size}; |
437 | memory::dims user_enc_uni_first_bias_dims |
438 | = {1, 1, lstm_n_gates, feature_size}; |
439 | memory::dims enc_uni_first_dst_layer_dims |
440 | = {src_seq_length_max, batch, feature_size}; |
441 | |
442 | auto user_enc_uni_first_wei_layer_md |
443 | = memory::desc({user_enc_uni_first_wei_layer_dims}, |
444 | memory::data_type::f32, memory::format_tag::ldigo); |
445 | auto user_enc_uni_first_wei_iter_md |
446 | = memory::desc({user_enc_uni_first_wei_iter_dims}, |
447 | memory::data_type::f32, memory::format_tag::ldigo); |
448 | auto user_enc_uni_first_bias_md |
449 | = memory::desc({user_enc_uni_first_bias_dims}, |
450 | memory::data_type::f32, memory::format_tag::ldgo); |
451 | auto user_enc_uni_first_wei_layer_memory |
452 | = memory(user_enc_uni_first_wei_layer_md, cpu_engine, |
453 | user_enc_uni_first_wei_layer.data()); |
454 | auto user_enc_uni_first_wei_iter_memory |
455 | = memory(user_enc_uni_first_wei_iter_md, cpu_engine, |
456 | user_enc_uni_first_wei_iter.data()); |
457 | auto user_enc_uni_first_bias_memory = memory(user_enc_uni_first_bias_md, |
458 | cpu_engine, user_enc_uni_first_bias.data()); |
459 | |
460 | auto enc_uni_first_wei_layer_md |
461 | = memory::desc({user_enc_uni_first_wei_layer_dims}, |
462 | memory::data_type::s8, memory::format_tag::any); |
463 | auto enc_uni_first_wei_iter_md |
464 | = memory::desc({user_enc_uni_first_wei_iter_dims}, |
465 | memory::data_type::s8, memory::format_tag::any); |
466 | auto enc_uni_first_dst_layer_md |
467 | = memory::desc({enc_uni_first_dst_layer_dims}, |
468 | memory::data_type::u8, memory::format_tag::any); |
469 | |
470 | /// |
    /// Encoder : Create unidirectional RNN for first cell
472 | /// @snippet cpu_rnn_inference_int8.cpp create uni first |
473 | /// |
474 | //[create uni first] |
475 | |
476 | auto enc_uni_first_prim_desc = lstm_forward::primitive_desc(cpu_engine, |
477 | prop_kind::forward_inference, |
478 | rnn_direction::unidirectional_left2right, enc_bidir_dst_layer_md, |
479 | memory::desc(), memory::desc(), enc_uni_first_wei_layer_md, |
480 | enc_uni_first_wei_iter_md, user_enc_uni_first_bias_md, |
481 | enc_uni_first_dst_layer_md, memory::desc(), memory::desc(), attr); |
482 | |
483 | //[create uni first] |
484 | |
485 | auto enc_uni_first_wei_layer_memory |
486 | = memory(enc_uni_first_prim_desc.weights_layer_desc(), cpu_engine); |
487 | reorder(user_enc_uni_first_wei_layer_memory, enc_uni_first_wei_layer_memory) |
488 | .execute(s, user_enc_uni_first_wei_layer_memory, |
489 | enc_uni_first_wei_layer_memory); |
490 | |
491 | auto enc_uni_first_wei_iter_memory |
492 | = memory(enc_uni_first_prim_desc.weights_iter_desc(), cpu_engine); |
493 | reorder(user_enc_uni_first_wei_iter_memory, enc_uni_first_wei_iter_memory) |
494 | .execute(s, user_enc_uni_first_wei_iter_memory, |
495 | enc_uni_first_wei_iter_memory); |
496 | |
497 | auto enc_uni_first_dst_layer_memory |
498 | = memory(enc_uni_first_prim_desc.dst_layer_desc(), cpu_engine); |
499 | |
500 | /// |
501 | /// Encoder : add the first unidirectional rnn primitive with related arguments into encoder_net |
502 | /// @snippet cpu_rnn_inference_int8.cpp push first uni rnn to encoder net |
503 | /// |
504 | //[push first uni rnn to encoder net] |
505 | encoder_net.push_back(lstm_forward(enc_uni_first_prim_desc)); |
506 | encoder_net_args.push_back( |
507 | {{DNNL_ARG_SRC_LAYER, enc_bidir_dst_layer_memory}, |
508 | {DNNL_ARG_WEIGHTS_LAYER, enc_uni_first_wei_layer_memory}, |
509 | {DNNL_ARG_WEIGHTS_ITER, enc_uni_first_wei_iter_memory}, |
510 | {DNNL_ARG_BIAS, user_enc_uni_first_bias_memory}, |
511 | {DNNL_ARG_DST_LAYER, enc_uni_first_dst_layer_memory}}); |
512 | //[push first uni rnn to encoder net] |
513 | |
514 | /// |
515 | /// Encoder : Remaining unidirectional layers |
516 | /// @snippet cpu_rnn_inference_int8.cpp remaining uni layers |
517 | /// |
518 | //[remaining uni layers] |
519 | std::vector<float> user_enc_uni_wei_layer((enc_unidir_n_layers - 1) * 1 |
520 | * feature_size * lstm_n_gates * feature_size, |
521 | 0.3f); |
522 | std::vector<float> user_enc_uni_wei_iter((enc_unidir_n_layers - 1) * 1 |
523 | * feature_size * lstm_n_gates * feature_size, |
524 | 0.2f); |
525 | std::vector<float> user_enc_uni_bias( |
526 | (enc_unidir_n_layers - 1) * 1 * lstm_n_gates * feature_size, 1.0f); |
527 | //[remaining uni layers] |
528 | |
529 | memory::dims user_enc_uni_wei_layer_dims = {(enc_unidir_n_layers - 1), 1, |
530 | feature_size, lstm_n_gates, feature_size}; |
531 | memory::dims user_enc_uni_wei_iter_dims = {(enc_unidir_n_layers - 1), 1, |
532 | feature_size, lstm_n_gates, feature_size}; |
533 | memory::dims user_enc_uni_bias_dims |
534 | = {(enc_unidir_n_layers - 1), 1, lstm_n_gates, feature_size}; |
535 | memory::dims enc_dst_layer_dims = {src_seq_length_max, batch, feature_size}; |
536 | |
537 | auto user_enc_uni_wei_layer_md = memory::desc({user_enc_uni_wei_layer_dims}, |
538 | memory::data_type::f32, memory::format_tag::ldigo); |
539 | auto user_enc_uni_wei_iter_md = memory::desc({user_enc_uni_wei_iter_dims}, |
540 | memory::data_type::f32, memory::format_tag::ldigo); |
541 | auto user_enc_uni_bias_md = memory::desc({user_enc_uni_bias_dims}, |
542 | memory::data_type::f32, memory::format_tag::ldgo); |
543 | |
544 | auto user_enc_uni_wei_layer_memory = memory(user_enc_uni_wei_layer_md, |
545 | cpu_engine, user_enc_uni_wei_layer.data()); |
546 | auto user_enc_uni_wei_iter_memory = memory( |
547 | user_enc_uni_wei_iter_md, cpu_engine, user_enc_uni_wei_iter.data()); |
548 | auto user_enc_uni_bias_memory = memory( |
549 | user_enc_uni_bias_md, cpu_engine, user_enc_uni_bias.data()); |
550 | |
551 | auto enc_uni_wei_layer_md = memory::desc({user_enc_uni_wei_layer_dims}, |
552 | memory::data_type::s8, memory::format_tag::any); |
553 | auto enc_uni_wei_iter_md = memory::desc({user_enc_uni_wei_iter_dims}, |
554 | memory::data_type::s8, memory::format_tag::any); |
555 | auto enc_dst_layer_md = memory::desc({enc_dst_layer_dims}, |
556 | memory::data_type::f32, memory::format_tag::any); |
557 | |
558 | /// |
    /// Encoder : Create unidirectional RNN cell
560 | /// @snippet cpu_rnn_inference_int8.cpp create uni rnn |
561 | /// |
562 | //[create uni rnn] |
563 | |
564 | auto enc_uni_prim_desc = lstm_forward::primitive_desc(cpu_engine, |
565 | prop_kind::forward_inference, |
566 | rnn_direction::unidirectional_left2right, |
567 | enc_uni_first_dst_layer_md, memory::desc(), memory::desc(), |
568 | enc_uni_wei_layer_md, enc_uni_wei_iter_md, user_enc_uni_bias_md, |
569 | enc_dst_layer_md, memory::desc(), memory::desc(), attr); |
570 | //[create uni rnn] |
571 | |
572 | auto enc_uni_wei_layer_memory |
573 | = memory(enc_uni_prim_desc.weights_layer_desc(), cpu_engine); |
574 | auto enc_uni_wei_layer_reorder_pd = reorder::primitive_desc( |
575 | user_enc_uni_wei_layer_memory, enc_uni_wei_layer_memory, attr); |
576 | reorder(enc_uni_wei_layer_reorder_pd) |
577 | .execute( |
578 | s, user_enc_uni_wei_layer_memory, enc_uni_wei_layer_memory); |
579 | |
580 | auto enc_uni_wei_iter_memory |
581 | = memory(enc_uni_prim_desc.weights_iter_desc(), cpu_engine); |
582 | auto enc_uni_wei_iter_reorder_pd = reorder::primitive_desc( |
583 | user_enc_uni_wei_iter_memory, enc_uni_wei_iter_memory, attr); |
584 | reorder(enc_uni_wei_iter_reorder_pd) |
585 | .execute(s, user_enc_uni_wei_iter_memory, enc_uni_wei_iter_memory); |
586 | |
587 | auto enc_dst_layer_memory |
588 | = memory(enc_uni_prim_desc.dst_layer_desc(), cpu_engine); |
589 | |
590 | /// |
591 | /// Encoder : add the unidirectional rnn primitive with related arguments into encoder_net |
592 | /// @snippet cpu_rnn_inference_int8.cpp push uni rnn to encoder net |
593 | /// |
594 | //[push uni rnn to encoder net] |
595 | encoder_net.push_back(lstm_forward(enc_uni_prim_desc)); |
596 | encoder_net_args.push_back( |
597 | {{DNNL_ARG_SRC_LAYER, enc_uni_first_dst_layer_memory}, |
598 | {DNNL_ARG_WEIGHTS_LAYER, enc_uni_wei_layer_memory}, |
599 | {DNNL_ARG_WEIGHTS_ITER, enc_uni_wei_iter_memory}, |
600 | {DNNL_ARG_BIAS, user_enc_uni_bias_memory}, |
601 | {DNNL_ARG_DST_LAYER, enc_dst_layer_memory}}); |
602 | //[push uni rnn to encoder net] |
603 | |
604 | /// |
605 | /// **Decoder with attention mechanism** |
606 | /// |
607 | /// |
608 | /// Decoder : declare memory dimensions |
609 | /// @snippet cpu_rnn_inference_int8.cpp dec mem dim |
610 | /// |
611 | //[dec mem dim] |
612 | std::vector<float> user_dec_wei_layer( |
613 | dec_n_layers * 1 * feature_size * lstm_n_gates * feature_size, |
614 | 0.2f); |
615 | std::vector<float> user_dec_wei_iter(dec_n_layers * 1 |
616 | * (feature_size + feature_size) * lstm_n_gates |
617 | * feature_size, |
618 | 0.3f); |
619 | std::vector<float> user_dec_bias( |
620 | dec_n_layers * 1 * lstm_n_gates * feature_size, 1.0f); |
621 | std::vector<int8_t> user_weights_attention_src_layer( |
622 | feature_size * feature_size, 1); |
623 | float weights_attention_scale = 127.; |
624 | std::vector<float> user_weights_annotation( |
625 | feature_size * feature_size, 1.0f); |
626 | std::vector<float> user_weights_alignments(feature_size, 1.0f); |
627 | // Buffer to store decoder output for all iterations |
628 | std::vector<uint8_t> dec_dst(tgt_seq_length_max * batch * feature_size, 0); |
629 | |
630 | memory::dims user_dec_wei_layer_dims |
631 | = {dec_n_layers, 1, feature_size, lstm_n_gates, feature_size}; |
632 | memory::dims user_dec_wei_iter_dims = {dec_n_layers, 1, |
633 | feature_size + feature_size, lstm_n_gates, feature_size}; |
634 | memory::dims user_dec_bias_dims |
635 | = {dec_n_layers, 1, lstm_n_gates, feature_size}; |
636 | memory::dims dec_src_layer_dims = {1, batch, feature_size}; |
637 | memory::dims dec_dst_layer_dims = {1, batch, feature_size}; |
638 | memory::dims dec_dst_iter_c_dims = {dec_n_layers, 1, batch, feature_size}; |
639 | //[dec mem dim] |
640 | |
641 | // We will use the same memory for dec_src_iter and dec_dst_iter |
642 | // However, dec_src_iter has a context vector but not |
643 | // dec_dst_iter. |
644 | // To resolve this we will create one memory that holds the |
645 | // context vector as well as the both the hidden and cell states. |
646 | // For the dst_iter, we will use a view on this memory. |
647 | // Note that the cell state will be padded by |
648 | // feature_size values. However, we do not compute or |
649 | // access those. |
650 | /// @snippet cpu_rnn_inference_int8.cpp noctx mem dim |
651 | //[noctx mem dim] |
652 | std::vector<float> dec_dst_iter( |
653 | dec_n_layers * batch * 2 * feature_size, 1.0f); |
654 | |
655 | memory::dims dec_dst_iter_dims |
656 | = {dec_n_layers, 1, batch, feature_size + feature_size}; |
657 | memory::dims dec_dst_iter_noctx_dims |
658 | = {dec_n_layers, 1, batch, feature_size}; |
659 | //[noctx mem dim] |
660 | |
661 | /// |
662 | /// Decoder : create memory description |
663 | /// Create memory descriptors for RNN data w/o specified layout |
664 | /// @snippet cpu_rnn_inference_int8.cpp dec mem desc |
665 | /// |
666 | //[dec mem desc] |
667 | auto user_dec_wei_layer_md = memory::desc({user_dec_wei_layer_dims}, |
668 | memory::data_type::f32, memory::format_tag::ldigo); |
669 | auto user_dec_wei_iter_md = memory::desc({user_dec_wei_iter_dims}, |
670 | memory::data_type::f32, memory::format_tag::ldigo); |
671 | auto user_dec_bias_md = memory::desc({user_dec_bias_dims}, |
672 | memory::data_type::f32, memory::format_tag::ldgo); |
673 | auto dec_src_layer_md = memory::desc({dec_src_layer_dims}, |
674 | memory::data_type::u8, memory::format_tag::tnc); |
675 | auto dec_dst_layer_md = memory::desc({dec_dst_layer_dims}, |
676 | memory::data_type::u8, memory::format_tag::tnc); |
677 | auto dec_dst_iter_md = memory::desc({dec_dst_iter_dims}, |
678 | memory::data_type::f32, memory::format_tag::ldnc); |
679 | auto dec_dst_iter_c_md = memory::desc({dec_dst_iter_c_dims}, |
680 | memory::data_type::f32, memory::format_tag::ldnc); |
681 | //[dec mem desc] |
682 | |
683 | /// |
684 | /// Decoder : Create memory |
685 | /// @snippet cpu_rnn_inference_int8.cpp create dec memory |
686 | /// |
687 | //[create dec memory] |
688 | auto user_dec_wei_layer_memory = memory( |
689 | user_dec_wei_layer_md, cpu_engine, user_dec_wei_layer.data()); |
690 | auto user_dec_wei_iter_memory = memory( |
691 | user_dec_wei_iter_md, cpu_engine, user_dec_wei_iter.data()); |
692 | auto user_dec_bias_memory |
693 | = memory(user_dec_bias_md, cpu_engine, user_dec_bias.data()); |
694 | auto dec_src_layer_memory = memory(dec_src_layer_md, cpu_engine); |
695 | auto dec_dst_layer_memory |
696 | = memory(dec_dst_layer_md, cpu_engine, dec_dst.data()); |
697 | auto dec_dst_iter_c_memory = memory(dec_dst_iter_c_md, cpu_engine); |
698 | //[create dec memory] |
699 | |
    // Create memory descriptors for RNN data w/o specified layout
    // (format_tag::any lets the LSTM primitive choose its preferred weights
    // layout; the user weights are reordered into it further below)
    auto dec_wei_layer_md = memory::desc({user_dec_wei_layer_dims},
            memory::data_type::s8, memory::format_tag::any);
    auto dec_wei_iter_md = memory::desc({user_dec_wei_iter_dims},
            memory::data_type::s8, memory::format_tag::any);

    ///
    /// Decoder : As mentioned above, we create a view without context out of the memory with context.
    /// @snippet cpu_rnn_inference_int8.cpp create noctx mem
    ///
    // NOTE(review): submemory_desc requires one offset per dimension of
    // dec_dst_iter_md -- confirm the five zero offsets below match the rank
    // of dec_dst_iter_noctx_dims declared earlier in this file.
    //[create noctx mem]
    auto dec_dst_iter_memory
            = memory(dec_dst_iter_md, cpu_engine, dec_dst_iter.data());
    auto dec_dst_iter_noctx_md = dec_dst_iter_md.submemory_desc(
            dec_dst_iter_noctx_dims, {0, 0, 0, 0, 0});
    //[create noctx mem]

    // src_iter uses the full (context-augmented) state descriptor while
    // dst_iter uses the no-context view of the same buffer, so the LSTM
    // updates the decoder state in place (see the args list below).
    auto dec_ctx_prim_desc = lstm_forward::primitive_desc(cpu_engine,
            prop_kind::forward_inference,
            rnn_direction::unidirectional_left2right, dec_src_layer_md,
            dec_dst_iter_md, dec_dst_iter_c_md, dec_wei_layer_md,
            dec_wei_iter_md, user_dec_bias_md, dec_dst_layer_md,
            dec_dst_iter_noctx_md, dec_dst_iter_c_md, attr);
723 | |
724 | /// |
725 | /// Decoder : Create memory for input data and use reorders to quantize values |
726 | /// to int8 |
727 | /// @snippet cpu_rnn_inference_int8.cpp dec reorder |
728 | /// |
729 | //[dec reorder] |
730 | auto dec_wei_layer_memory |
731 | = memory(dec_ctx_prim_desc.weights_layer_desc(), cpu_engine); |
732 | auto dec_wei_layer_reorder_pd = reorder::primitive_desc( |
733 | user_dec_wei_layer_memory, dec_wei_layer_memory, attr); |
734 | reorder(dec_wei_layer_reorder_pd) |
735 | .execute(s, user_dec_wei_layer_memory, dec_wei_layer_memory); |
736 | //[dec reorder] |
737 | |
738 | auto dec_wei_iter_memory |
739 | = memory(dec_ctx_prim_desc.weights_iter_desc(), cpu_engine); |
740 | auto dec_wei_iter_reorder_pd = reorder::primitive_desc( |
741 | user_dec_wei_iter_memory, dec_wei_iter_memory, attr); |
742 | reorder(dec_wei_iter_reorder_pd) |
743 | .execute(s, user_dec_wei_iter_memory, dec_wei_iter_memory); |
744 | |
745 | decoder_net.push_back(lstm_forward(dec_ctx_prim_desc)); |
746 | decoder_net_args.push_back({{DNNL_ARG_SRC_LAYER, dec_src_layer_memory}, |
747 | {DNNL_ARG_SRC_ITER, dec_dst_iter_memory}, |
748 | {DNNL_ARG_SRC_ITER_C, dec_dst_iter_c_memory}, |
749 | {DNNL_ARG_WEIGHTS_LAYER, dec_wei_layer_memory}, |
750 | {DNNL_ARG_WEIGHTS_ITER, dec_wei_iter_memory}, |
751 | {DNNL_ARG_BIAS, user_dec_bias_memory}, |
752 | {DNNL_ARG_DST_LAYER, dec_dst_layer_memory}, |
753 | {DNNL_ARG_DST_ITER, dec_dst_iter_memory}, |
754 | {DNNL_ARG_DST_ITER_C, dec_dst_iter_c_memory}}); |
755 | |
756 | // Allocating temporary buffers for attention mechanism |
757 | std::vector<float> weighted_annotations( |
758 | src_seq_length_max * batch * feature_size, 1.0f); |
759 | std::vector<int32_t> weights_attention_sum_rows(feature_size, 1); |
760 | |
761 | /// |
762 | /// **Execution** |
763 | /// |
764 | |
    auto execute = [&]() {
        // Full inference pass: run the encoder once, precompute the
        // attention inputs, then iterate the decoder tgt_seq_length_max
        // times, feeding each step's output back as the next step's input.
        assert(encoder_net.size() == encoder_net_args.size()
                && "something is missing" );
        ///
        /// run encoder (1 stream)
        /// @snippet cpu_rnn_inference_int8.cpp run enc
        ///
        //[run enc]
        for (size_t p = 0; p < encoder_net.size(); ++p)
            encoder_net.at(p).execute(s, encoder_net_args.at(p));
        //[run enc]

        // compute the weighted annotations once before the decoder
        ///
        /// we compute the weighted annotations once before the decoder
        /// @snippet cpu_rnn_inference_int8.cpp weight ano
        ///
        //[weight ano]
        compute_weighted_annotations(weighted_annotations.data(),
                src_seq_length_max, batch, feature_size,
                user_weights_annotation.data(),
                (float *)enc_dst_layer_memory.get_data_handle());
        //[weight ano]
        ///
        /// precompute compensation for s8u8s32 gemm in compute attention
        /// @snippet cpu_rnn_inference_int8.cpp s8u8s32
        ///
        //[s8u8s32]
        compute_sum_of_rows(user_weights_attention_src_layer.data(),
                feature_size, feature_size, weights_attention_sum_rows.data());
        //[s8u8s32]

        ///
        /// We initialize src_layer to the embedding of the end of
        /// sequence character, which are assumed to be 0 here
        /// @snippet cpu_rnn_inference_int8.cpp init src_layer
        ///
        //[init src_layer]
        memset(dec_src_layer_memory.get_data_handle(), 0,
                dec_src_layer_memory.get_desc().get_size());
        //[init src_layer]

        ///
        /// From now on, src points to the output of the last iteration
        ///
        for (dim_t i = 0; i < tgt_seq_length_max; i++) {
            // Raw handles for this step: src_att_layer is the u8 decoder
            // input, src_att_iter is the f32 state buffer whose first-layer
            // part receives the attention context below.
            uint8_t *src_att_layer_handle
                    = (uint8_t *)dec_src_layer_memory.get_data_handle();
            float *src_att_iter_handle
                    = (float *)dec_dst_iter_memory.get_data_handle();

            ///
            /// Compute attention context vector into the first layer src_iter
            /// @snippet cpu_rnn_inference_int8.cpp att ctx
            ///
            //[att ctx]
            compute_attention(src_att_iter_handle, src_seq_length_max, batch,
                    feature_size, user_weights_attention_src_layer.data(),
                    weights_attention_scale, weights_attention_sum_rows.data(),
                    src_att_layer_handle, data_scale, data_shift,
                    (uint8_t *)enc_bidir_dst_layer_memory.get_data_handle(),
                    weighted_annotations.data(),
                    user_weights_alignments.data());
            //[att ctx]

            ///
            /// copy the context vectors to all layers of src_iter
            /// @snippet cpu_rnn_inference_int8.cpp cp ctx
            ///
            //[cp ctx]
            copy_context(
                    src_att_iter_handle, dec_n_layers, batch, feature_size);
            //[cp ctx]

            assert(decoder_net.size() == decoder_net_args.size()
                    && "something is missing" );
            ///
            /// run the decoder iteration
            /// @snippet cpu_rnn_inference_int8.cpp run dec iter
            ///
            //[run dec iter]
            for (size_t p = 0; p < decoder_net.size(); ++p)
                decoder_net.at(p).execute(s, decoder_net_args.at(p));
            //[run dec iter]

            ///
            /// Move the handle on the src/dst layer to the next iteration
            /// @snippet cpu_rnn_inference_int8.cpp set handle
            ///
            // Sliding both handles forward by one batch * feature_size slab
            // makes this step's output the next step's input without any
            // copy. NOTE(review): assumes dec_dst holds tgt_seq_length_max
            // such slabs -- confirm its allocation earlier in the file.
            //[set handle]
            auto dst_layer_handle
                    = (uint8_t *)dec_dst_layer_memory.get_data_handle();
            dec_src_layer_memory.set_data_handle(dst_layer_handle);
            dec_dst_layer_memory.set_data_handle(
                    dst_layer_handle + batch * feature_size);
            //[set handle]
        }
    };
863 | |
864 | /// @page cpu_rnn_inference_int8_cpp |
865 | /// |
866 | std::cout << "Parameters:" << std::endl |
867 | << " batch = " << batch << std::endl |
868 | << " feature size = " << feature_size << std::endl |
869 | << " maximum source sequence length = " << src_seq_length_max |
870 | << std::endl |
871 | << " maximum target sequence length = " << tgt_seq_length_max |
872 | << std::endl |
873 | << " number of layers of the bidirectional encoder = " |
874 | << enc_bidir_n_layers << std::endl |
875 | << " number of layers of the unidirectional encoder = " |
876 | << enc_unidir_n_layers << std::endl |
877 | << " number of layers of the decoder = " << dec_n_layers |
878 | << std::endl; |
879 | |
880 | execute(); |
881 | s.wait(); |
882 | } |
883 | |
884 | int main(int argc, char **argv) { |
885 | return handle_example_errors({engine::kind::cpu}, simple_net); |
886 | } |
887 | |