/*******************************************************************************
* Copyright 2018-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example cpu_rnn_inference_int8.cpp
/// @copybrief cpu_rnn_inference_int8_cpp
/// > Annotated version: @ref cpu_rnn_inference_int8_cpp

/// @page cpu_rnn_inference_int8_cpp RNN int8 inference example
/// This C++ API example demonstrates how to build GNMT model inference.
///
/// > Example code: @ref cpu_rnn_inference_int8.cpp
///
/// For the encoder we use:
///  - one primitive for the bidirectional layer of the encoder
///  - one primitive for all remaining unidirectional layers in the encoder
/// For the decoder we use:
///  - one primitive for the first iteration
///  - one primitive for all subsequent iterations in the decoder. Note that
///    in this example, this primitive computes the states in place.
///  - the attention mechanism is implemented separately as there is no support
///    for the context vectors in oneDNN yet
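///
/// The resulting dataflow is roughly:
///   net_src -> reorder (f32 -> u8) -> bidirectional LSTM
///           -> first unidirectional LSTM -> remaining unidirectional LSTMs
///           -> (attention + one-iteration LSTM) repeated tgt_seq_length_max times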

#include <assert.h>

#include <cstring>
#include <iostream>
#include <math.h>
#include <numeric>
#include <string>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

using dim_t = dnnl::memory::dim;

const dim_t batch = 32;
const dim_t src_seq_length_max = 10;
const dim_t tgt_seq_length_max = 10;

const dim_t feature_size = 256;

const dim_t enc_bidir_n_layers = 1;
const dim_t enc_unidir_n_layers = 3;
const dim_t dec_n_layers = 4;

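// an LSTM cell has four gates (input, forget, candidate, and output)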
const int lstm_n_gates = 4;

std::vector<int32_t> weighted_src_layer(batch * feature_size, 1);
std::vector<float> alignment_model(
        src_seq_length_max * batch * feature_size, 1.0f);
std::vector<float> alignments(src_seq_length_max * batch, 1.0f);
std::vector<float> exp_sums(batch, 1.0f);
void compute_weighted_annotations(float *weighted_annotations,
        dim_t src_seq_length_max, dim_t batch, dim_t feature_size,
        float *weights_annot, float *annotations) {
    // annotations(aka enc_dst_layer) is (t, n, 2c)
    // weights_annot is (2c, c)

    dim_t num_weighted_annotations = src_seq_length_max * batch;
    // annotation[i] = GEMM(weights_annot, enc_dst_layer[i]);
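    // dnnl_sgemm is a row-major GEMM: A = annotations viewed as a
    // (src_seq_length_max * batch, feature_size) matrix, B = weights_annot
    // (feature_size, feature_size), and C = weighted_annotations with the
    // same shape as A.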
    dnnl_sgemm('N', 'N', num_weighted_annotations, feature_size, feature_size,
            1.f, annotations, feature_size, weights_annot, feature_size, 0.f,
            weighted_annotations, feature_size);
}

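// Precomputes per-output-channel sums of the s8 attention weights.
// compute_attention() uses them to compensate for the u8 zero-point shift of
// the data: (x - shift) * w = x * w - shift * sum(w).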
void compute_sum_of_rows(
        int8_t *a, dim_t rows, dim_t cols, int32_t *a_reduced) {
    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1)
    for (dim_t i = 0; i < cols; i++) {
        a_reduced[i] = 0;
        for (dim_t j = 0; j < rows; j++) {
            a_reduced[i] += (int32_t)a[i * rows + j];
        }
    }
}

void compute_attention(float *context_vectors, dim_t src_seq_length_max,
        dim_t batch, dim_t feature_size, int8_t *weights_src_layer,
        float weights_src_layer_scale, int32_t *compensation,
        uint8_t *dec_src_layer, float dec_src_layer_scale,
        float dec_src_layer_shift, uint8_t *annotations,
        float *weighted_annotations, float *weights_alignments) {
    // dst_iter : (n, c) matrix
    // src_layer: (n, c) matrix
    // weighted_annotations (t, n, c)

    // weights_yi is (c, c)
    // weights_ai is (c, 1)
    // tmp[i] is (n, c)
    // a[i] is (n, 1)
    // p is (n, 1)

    // first we precompute the weighted_dec_src_layer
    int32_t co = 0;
    dnnl_gemm_u8s8s32('N', 'N', 'F', batch, feature_size, feature_size, 1.f,
            dec_src_layer, feature_size, 0, weights_src_layer, feature_size, 0,
            0.f, weighted_src_layer.data(), feature_size, &co);

    // then we compute the alignment model
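    // The int32 GEMM output is dequantized on the fly below:
    // real value = (int32 value - shift * sum(weights)) / (data scale * weights scale),
    // where the sum-of-rows term compensates for the u8 zero-point shift.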
    float *alignment_model_ptr = alignment_model.data();
    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2)
    for (dim_t i = 0; i < src_seq_length_max; i++) {
        for (dim_t j = 0; j < batch; j++) {
            for (dim_t k = 0; k < feature_size; k++) {
                size_t tnc_offset
                        = i * batch * feature_size + j * feature_size + k;
                alignment_model_ptr[tnc_offset]
                        = tanhf((float)(weighted_src_layer[j * feature_size + k]
                                        - dec_src_layer_shift * compensation[k])
                                        / (dec_src_layer_scale
                                                * weights_src_layer_scale)
                                + weighted_annotations[tnc_offset]);
            }
        }
    }

    // gemv with alignments weights. the resulting alignments are in alignments
    dim_t num_weighted_annotations = src_seq_length_max * batch;
    dnnl_sgemm('N', 'N', num_weighted_annotations, 1, feature_size, 1.f,
            alignment_model_ptr, feature_size, weights_alignments, 1, 0.f,
            alignments.data(), 1);

    // softmax on alignments. the resulting context weights are in alignments
    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1)
    for (dim_t i = 0; i < batch; i++)
        exp_sums[i] = 0.0f;

    // Softmax is computed in a numerically stable way: for each batch j we
    // find t = argmax_i A_i and evaluate
    //     exp(A_i - A_t) / \sum_i exp(A_i - A_t)
    // instead of exp(A_i) / \sum_i exp(A_i), which mitigates overflow in expf.
    std::vector<dim_t> max_idx(batch, 0);
    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1)
    for (dim_t j = 0; j < batch; j++) {
        for (dim_t i = 1; i < src_seq_length_max; i++) {
            if (alignments[i * batch + j] > alignments[max_idx[j] * batch + j])
                max_idx[j] = i;
        }
    }

    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(1)
    for (dim_t j = 0; j < batch; j++) {
        auto max_idx_val = alignments[max_idx[j] * batch + j];
        for (dim_t i = 0; i < src_seq_length_max; i++) {
            alignments[i * batch + j] -= max_idx_val;
            alignments[i * batch + j] = expf(alignments[i * batch + j]);
            exp_sums[j] += alignments[i * batch + j];
        }
    }

    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2)
    for (dim_t i = 0; i < src_seq_length_max; i++)
        for (dim_t j = 0; j < batch; j++)
            alignments[i * batch + j] /= exp_sums[j];

    // then we compute the context vectors
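    // context_vectors is the decoder src_iter buffer laid out as
    // (layers, batch, 2 * feature_size); the attention context is written into
    // the second half of each row of the first layer, while the first half
    // holds the hidden state produced by the previous decoder iteration.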
    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2)
    for (dim_t i = 0; i < batch; i++)
        for (dim_t j = 0; j < feature_size; j++)
            context_vectors[i * (feature_size + feature_size) + feature_size
                    + j]
                    = 0.0f;

    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(2)
    for (dim_t i = 0; i < batch; i++)
        for (dim_t j = 0; j < feature_size; j++)
            for (dim_t k = 0; k < src_seq_length_max; k++)
                context_vectors[i * (feature_size + feature_size)
                        + feature_size + j]
                        += alignments[k * batch + i]
                        * (((float)annotations[j
                                    + feature_size * (i + batch * k)]
                                   - dec_src_layer_shift)
                                / dec_src_layer_scale);
}

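// Copies the attention context computed for the first layer into the context
// half of src_iter for every remaining decoder layer.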
void copy_context(
        float *src_iter, dim_t n_layers, dim_t batch, dim_t feature_size) {
    // we copy the context from the first layer to all other layers
    PRAGMA_OMP_PARALLEL_FOR_COLLAPSE(3)
    for (dim_t k = 1; k < n_layers; k++)
        for (dim_t j = 0; j < batch; j++)
            for (dim_t i = 0; i < feature_size; i++)
                src_iter[(k * batch + j) * (feature_size + feature_size)
                        + feature_size + i]
                        = src_iter[j * (feature_size + feature_size)
                                + feature_size + i];
}

void simple_net() {
    ///
    /// Initialize a CPU engine and stream. The last parameter in the call represents
    /// the index of the engine.
    /// @snippet cpu_rnn_inference_int8.cpp Initialize engine and stream
    ///
    //[Initialize engine and stream]
    auto cpu_engine = engine(engine::kind::cpu, 0);
    stream s(cpu_engine);
    //[Initialize engine and stream]

    ///
    /// Declare encoder net and decoder net
    /// @snippet cpu_rnn_inference_int8.cpp declare net
    ///
    //[declare net]
    std::vector<primitive> encoder_net, decoder_net;
    std::vector<std::unordered_map<int, memory>> encoder_net_args,
            decoder_net_args;

    std::vector<float> net_src(batch * src_seq_length_max * feature_size, 0.1f);
    std::vector<float> net_dst(batch * tgt_seq_length_max * feature_size, 0.1f);
    //[declare net]

    // Quantization factors for f32 data

    ///
    /// Quantization factors for f32 data
    /// @snippet cpu_rnn_inference_int8.cpp quantize
    ///
    //[quantize]
    const float data_shift = 64.;
    const float data_scale = 63.;
    const int weights_scale_mask = 0
            + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
            + (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
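    // with bits 3 (`g`) and 4 (`o`) set in the mask, one scale is expected per
    // (gate, output channel) pair, i.e. lstm_n_gates * feature_size values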
    std::vector<float> weights_scales(lstm_n_gates * feature_size);
    // assign halves of vector with arbitrary values
    const dim_t scales_half = lstm_n_gates * feature_size / 2;
    std::fill(
            weights_scales.begin(), weights_scales.begin() + scales_half, 30.f);
    std::fill(
            weights_scales.begin() + scales_half, weights_scales.end(), 65.5f);
    //[quantize]

    ///
    /// **Encoder**
    ///
    ///
    /// Initialize Encoder Memory
    /// @snippet cpu_rnn_inference_int8.cpp Initialize encoder memory
    ///
    //[Initialize encoder memory]
    memory::dims enc_bidir_src_layer_tz
            = {src_seq_length_max, batch, feature_size};
    memory::dims enc_bidir_weights_layer_tz
            = {enc_bidir_n_layers, 2, feature_size, lstm_n_gates, feature_size};
    memory::dims enc_bidir_weights_iter_tz
            = {enc_bidir_n_layers, 2, feature_size, lstm_n_gates, feature_size};
    memory::dims enc_bidir_bias_tz
            = {enc_bidir_n_layers, 2, lstm_n_gates, feature_size};
    memory::dims enc_bidir_dst_layer_tz
            = {src_seq_length_max, batch, 2 * feature_size};
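    // the destination has 2 * feature_size channels because bidirectional_concat
    // concatenates the outputs of the forward and backward directions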

    //[Initialize encoder memory]

    ///
    ///
    /// Encoder: 1 bidirectional layer and 3 unidirectional layers
    ///

    std::vector<float> user_enc_bidir_wei_layer(
            enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size,
            0.3f);
    std::vector<float> user_enc_bidir_wei_iter(
            enc_bidir_n_layers * 2 * feature_size * lstm_n_gates * feature_size,
            0.2f);
    std::vector<float> user_enc_bidir_bias(
            enc_bidir_n_layers * 2 * lstm_n_gates * feature_size, 1.0f);

    ///
    /// Create the memory for user data
    /// @snippet cpu_rnn_inference_int8.cpp data memory creation
    ///
    //[data memory creation]
    auto user_enc_bidir_src_layer_md = memory::desc({enc_bidir_src_layer_tz},
            memory::data_type::f32, memory::format_tag::tnc);

    auto user_enc_bidir_wei_layer_md
            = memory::desc({enc_bidir_weights_layer_tz}, memory::data_type::f32,
                    memory::format_tag::ldigo);

    auto user_enc_bidir_wei_iter_md = memory::desc({enc_bidir_weights_iter_tz},
            memory::data_type::f32, memory::format_tag::ldigo);

    auto user_enc_bidir_bias_md = memory::desc({enc_bidir_bias_tz},
            memory::data_type::f32, memory::format_tag::ldgo);

    auto user_enc_bidir_src_layer_memory
            = memory(user_enc_bidir_src_layer_md, cpu_engine, net_src.data());
    auto user_enc_bidir_wei_layer_memory = memory(user_enc_bidir_wei_layer_md,
            cpu_engine, user_enc_bidir_wei_layer.data());
    auto user_enc_bidir_wei_iter_memory = memory(user_enc_bidir_wei_iter_md,
            cpu_engine, user_enc_bidir_wei_iter.data());
    auto user_enc_bidir_bias_memory = memory(
            user_enc_bidir_bias_md, cpu_engine, user_enc_bidir_bias.data());
    //[data memory creation]

    ///
    /// Create memory descriptors for RNN data w/o specified layout
    /// @snippet cpu_rnn_inference_int8.cpp memory desc for RNN data
    ///
    //[memory desc for RNN data]
    auto enc_bidir_src_layer_md = memory::desc({enc_bidir_src_layer_tz},
            memory::data_type::u8, memory::format_tag::any);

    auto enc_bidir_wei_layer_md = memory::desc({enc_bidir_weights_layer_tz},
            memory::data_type::s8, memory::format_tag::any);

    auto enc_bidir_wei_iter_md = memory::desc({enc_bidir_weights_iter_tz},
            memory::data_type::s8, memory::format_tag::any);

    auto enc_bidir_dst_layer_md = memory::desc({enc_bidir_dst_layer_tz},
            memory::data_type::u8, memory::format_tag::any);
    //[memory desc for RNN data]

    ///
    /// Create bidirectional RNN

    ///
    /// Define RNN attributes that store quantization parameters
    /// @snippet cpu_rnn_inference_int8.cpp RNN attri
    ///
    //[RNN attri]
    primitive_attr attr;
    attr.set_rnn_data_qparams(data_scale, data_shift);
    attr.set_rnn_weights_qparams(weights_scale_mask, weights_scales);

    // check if int8 LSTM is supported
    lstm_forward::primitive_desc enc_bidir_prim_desc;
    try {
        enc_bidir_prim_desc = lstm_forward::primitive_desc(cpu_engine,
                prop_kind::forward_inference,
                rnn_direction::bidirectional_concat, enc_bidir_src_layer_md,
                memory::desc(), memory::desc(), enc_bidir_wei_layer_md,
                enc_bidir_wei_iter_md, user_enc_bidir_bias_md,
                enc_bidir_dst_layer_md, memory::desc(), memory::desc(), attr);
    } catch (error &e) {
        if (e.status == dnnl_unimplemented)
            throw example_allows_unimplemented {
                    "No int8 LSTM implementation is available for this "
                    "platform.\n"
                    "Please refer to the developer guide for details."};

        // on any other error just re-throw
        throw;
    }

    //[RNN attri]

    ///
    /// Create memory for input data and use reorders to quantize values to int8
    /// NOTE: same attributes are used when creating RNN primitive and reorders
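    /// (with the attributes set above, the data reorder performs the actual
    /// quantization, roughly u8_value = f32_value * data_scale + data_shift)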
    /// @snippet cpu_rnn_inference_int8.cpp reorder input data
    ///
    //[reorder input data]
    auto enc_bidir_src_layer_memory
            = memory(enc_bidir_prim_desc.src_layer_desc(), cpu_engine);
    auto enc_bidir_src_layer_reorder_pd = reorder::primitive_desc(
            user_enc_bidir_src_layer_memory, enc_bidir_src_layer_memory, attr);
    encoder_net.push_back(reorder(enc_bidir_src_layer_reorder_pd));
    encoder_net_args.push_back(
            {{DNNL_ARG_FROM, user_enc_bidir_src_layer_memory},
                    {DNNL_ARG_TO, enc_bidir_src_layer_memory}});
    //[reorder input data]

    auto enc_bidir_wei_layer_memory
            = memory(enc_bidir_prim_desc.weights_layer_desc(), cpu_engine);
    auto enc_bidir_wei_layer_reorder_pd = reorder::primitive_desc(
            user_enc_bidir_wei_layer_memory, enc_bidir_wei_layer_memory, attr);
    reorder(enc_bidir_wei_layer_reorder_pd)
            .execute(s, user_enc_bidir_wei_layer_memory,
                    enc_bidir_wei_layer_memory);

    auto enc_bidir_wei_iter_memory
            = memory(enc_bidir_prim_desc.weights_iter_desc(), cpu_engine);
    auto enc_bidir_wei_iter_reorder_pd = reorder::primitive_desc(
            user_enc_bidir_wei_iter_memory, enc_bidir_wei_iter_memory, attr);
    reorder(enc_bidir_wei_iter_reorder_pd)
            .execute(s, user_enc_bidir_wei_iter_memory,
                    enc_bidir_wei_iter_memory);

    auto enc_bidir_dst_layer_memory
            = memory(enc_bidir_prim_desc.dst_layer_desc(), cpu_engine);

    ///
    /// Encoder : add the bidirectional rnn primitive with related arguments into encoder_net
    /// @snippet cpu_rnn_inference_int8.cpp push bi rnn to encoder net
    ///
    //[push bi rnn to encoder net]
    encoder_net.push_back(lstm_forward(enc_bidir_prim_desc));
    encoder_net_args.push_back(
            {{DNNL_ARG_SRC_LAYER, enc_bidir_src_layer_memory},
                    {DNNL_ARG_WEIGHTS_LAYER, enc_bidir_wei_layer_memory},
                    {DNNL_ARG_WEIGHTS_ITER, enc_bidir_wei_iter_memory},
                    {DNNL_ARG_BIAS, user_enc_bidir_bias_memory},
                    {DNNL_ARG_DST_LAYER, enc_bidir_dst_layer_memory}});
    //[push bi rnn to encoder net]

    ///
    /// Encoder: unidirectional layers
    ///
    ///
    /// The first unidirectional layer maps the 2 * feature_size output of the
    /// bidirectional layer to a feature_size output
    /// @snippet cpu_rnn_inference_int8.cpp first uni layer
    ///
    //[first uni layer]
    std::vector<float> user_enc_uni_first_wei_layer(
            1 * 1 * 2 * feature_size * lstm_n_gates * feature_size, 0.3f);
    std::vector<float> user_enc_uni_first_wei_iter(
            1 * 1 * feature_size * lstm_n_gates * feature_size, 0.2f);
    std::vector<float> user_enc_uni_first_bias(
            1 * 1 * lstm_n_gates * feature_size, 1.0f);
    //[first uni layer]

    memory::dims user_enc_uni_first_wei_layer_dims
            = {1, 1, 2 * feature_size, lstm_n_gates, feature_size};
    memory::dims user_enc_uni_first_wei_iter_dims
            = {1, 1, feature_size, lstm_n_gates, feature_size};
    memory::dims user_enc_uni_first_bias_dims
            = {1, 1, lstm_n_gates, feature_size};
    memory::dims enc_uni_first_dst_layer_dims
            = {src_seq_length_max, batch, feature_size};

    auto user_enc_uni_first_wei_layer_md
            = memory::desc({user_enc_uni_first_wei_layer_dims},
                    memory::data_type::f32, memory::format_tag::ldigo);
    auto user_enc_uni_first_wei_iter_md
            = memory::desc({user_enc_uni_first_wei_iter_dims},
                    memory::data_type::f32, memory::format_tag::ldigo);
    auto user_enc_uni_first_bias_md
            = memory::desc({user_enc_uni_first_bias_dims},
                    memory::data_type::f32, memory::format_tag::ldgo);
    auto user_enc_uni_first_wei_layer_memory
            = memory(user_enc_uni_first_wei_layer_md, cpu_engine,
                    user_enc_uni_first_wei_layer.data());
    auto user_enc_uni_first_wei_iter_memory
            = memory(user_enc_uni_first_wei_iter_md, cpu_engine,
                    user_enc_uni_first_wei_iter.data());
    auto user_enc_uni_first_bias_memory = memory(user_enc_uni_first_bias_md,
            cpu_engine, user_enc_uni_first_bias.data());

    auto enc_uni_first_wei_layer_md
            = memory::desc({user_enc_uni_first_wei_layer_dims},
                    memory::data_type::s8, memory::format_tag::any);
    auto enc_uni_first_wei_iter_md
            = memory::desc({user_enc_uni_first_wei_iter_dims},
                    memory::data_type::s8, memory::format_tag::any);
    auto enc_uni_first_dst_layer_md
            = memory::desc({enc_uni_first_dst_layer_dims},
                    memory::data_type::u8, memory::format_tag::any);
    ///
    /// Encoder : Create unidirectional RNN for the first cell
    /// @snippet cpu_rnn_inference_int8.cpp create uni first
    ///
    //[create uni first]

    auto enc_uni_first_prim_desc = lstm_forward::primitive_desc(cpu_engine,
            prop_kind::forward_inference,
            rnn_direction::unidirectional_left2right, enc_bidir_dst_layer_md,
            memory::desc(), memory::desc(), enc_uni_first_wei_layer_md,
            enc_uni_first_wei_iter_md, user_enc_uni_first_bias_md,
            enc_uni_first_dst_layer_md, memory::desc(), memory::desc(), attr);

    //[create uni first]

    auto enc_uni_first_wei_layer_memory
            = memory(enc_uni_first_prim_desc.weights_layer_desc(), cpu_engine);
    auto enc_uni_first_wei_layer_reorder_pd = reorder::primitive_desc(
            user_enc_uni_first_wei_layer_memory,
            enc_uni_first_wei_layer_memory, attr);
    reorder(enc_uni_first_wei_layer_reorder_pd)
            .execute(s, user_enc_uni_first_wei_layer_memory,
                    enc_uni_first_wei_layer_memory);

    auto enc_uni_first_wei_iter_memory
            = memory(enc_uni_first_prim_desc.weights_iter_desc(), cpu_engine);
    auto enc_uni_first_wei_iter_reorder_pd = reorder::primitive_desc(
            user_enc_uni_first_wei_iter_memory,
            enc_uni_first_wei_iter_memory, attr);
    reorder(enc_uni_first_wei_iter_reorder_pd)
            .execute(s, user_enc_uni_first_wei_iter_memory,
                    enc_uni_first_wei_iter_memory);

    auto enc_uni_first_dst_layer_memory
            = memory(enc_uni_first_prim_desc.dst_layer_desc(), cpu_engine);

    ///
    /// Encoder : add the first unidirectional rnn primitive with related arguments into encoder_net
    /// @snippet cpu_rnn_inference_int8.cpp push first uni rnn to encoder net
    ///
    //[push first uni rnn to encoder net]
    encoder_net.push_back(lstm_forward(enc_uni_first_prim_desc));
    encoder_net_args.push_back(
            {{DNNL_ARG_SRC_LAYER, enc_bidir_dst_layer_memory},
                    {DNNL_ARG_WEIGHTS_LAYER, enc_uni_first_wei_layer_memory},
                    {DNNL_ARG_WEIGHTS_ITER, enc_uni_first_wei_iter_memory},
                    {DNNL_ARG_BIAS, user_enc_uni_first_bias_memory},
                    {DNNL_ARG_DST_LAYER, enc_uni_first_dst_layer_memory}});
    //[push first uni rnn to encoder net]

    ///
    /// Encoder : Remaining unidirectional layers
    /// @snippet cpu_rnn_inference_int8.cpp remaining uni layers
    ///
    //[remaining uni layers]
    std::vector<float> user_enc_uni_wei_layer((enc_unidir_n_layers - 1) * 1
                    * feature_size * lstm_n_gates * feature_size,
            0.3f);
    std::vector<float> user_enc_uni_wei_iter((enc_unidir_n_layers - 1) * 1
                    * feature_size * lstm_n_gates * feature_size,
            0.2f);
    std::vector<float> user_enc_uni_bias(
            (enc_unidir_n_layers - 1) * 1 * lstm_n_gates * feature_size, 1.0f);
    //[remaining uni layers]

    memory::dims user_enc_uni_wei_layer_dims = {(enc_unidir_n_layers - 1), 1,
            feature_size, lstm_n_gates, feature_size};
    memory::dims user_enc_uni_wei_iter_dims = {(enc_unidir_n_layers - 1), 1,
            feature_size, lstm_n_gates, feature_size};
    memory::dims user_enc_uni_bias_dims
            = {(enc_unidir_n_layers - 1), 1, lstm_n_gates, feature_size};
    memory::dims enc_dst_layer_dims = {src_seq_length_max, batch, feature_size};

    auto user_enc_uni_wei_layer_md = memory::desc({user_enc_uni_wei_layer_dims},
            memory::data_type::f32, memory::format_tag::ldigo);
    auto user_enc_uni_wei_iter_md = memory::desc({user_enc_uni_wei_iter_dims},
            memory::data_type::f32, memory::format_tag::ldigo);
    auto user_enc_uni_bias_md = memory::desc({user_enc_uni_bias_dims},
            memory::data_type::f32, memory::format_tag::ldgo);

    auto user_enc_uni_wei_layer_memory = memory(user_enc_uni_wei_layer_md,
            cpu_engine, user_enc_uni_wei_layer.data());
    auto user_enc_uni_wei_iter_memory = memory(
            user_enc_uni_wei_iter_md, cpu_engine, user_enc_uni_wei_iter.data());
    auto user_enc_uni_bias_memory = memory(
            user_enc_uni_bias_md, cpu_engine, user_enc_uni_bias.data());

    auto enc_uni_wei_layer_md = memory::desc({user_enc_uni_wei_layer_dims},
            memory::data_type::s8, memory::format_tag::any);
    auto enc_uni_wei_iter_md = memory::desc({user_enc_uni_wei_iter_dims},
            memory::data_type::s8, memory::format_tag::any);
    auto enc_dst_layer_md = memory::desc({enc_dst_layer_dims},
            memory::data_type::f32, memory::format_tag::any);

    ///
    /// Encoder : Create unidirectional RNN cell
    /// @snippet cpu_rnn_inference_int8.cpp create uni rnn
    ///
    //[create uni rnn]

    auto enc_uni_prim_desc = lstm_forward::primitive_desc(cpu_engine,
            prop_kind::forward_inference,
            rnn_direction::unidirectional_left2right,
            enc_uni_first_dst_layer_md, memory::desc(), memory::desc(),
            enc_uni_wei_layer_md, enc_uni_wei_iter_md, user_enc_uni_bias_md,
            enc_dst_layer_md, memory::desc(), memory::desc(), attr);
    //[create uni rnn]

    auto enc_uni_wei_layer_memory
            = memory(enc_uni_prim_desc.weights_layer_desc(), cpu_engine);
    auto enc_uni_wei_layer_reorder_pd = reorder::primitive_desc(
            user_enc_uni_wei_layer_memory, enc_uni_wei_layer_memory, attr);
    reorder(enc_uni_wei_layer_reorder_pd)
            .execute(
                    s, user_enc_uni_wei_layer_memory, enc_uni_wei_layer_memory);

    auto enc_uni_wei_iter_memory
            = memory(enc_uni_prim_desc.weights_iter_desc(), cpu_engine);
    auto enc_uni_wei_iter_reorder_pd = reorder::primitive_desc(
            user_enc_uni_wei_iter_memory, enc_uni_wei_iter_memory, attr);
    reorder(enc_uni_wei_iter_reorder_pd)
            .execute(s, user_enc_uni_wei_iter_memory, enc_uni_wei_iter_memory);

    auto enc_dst_layer_memory
            = memory(enc_uni_prim_desc.dst_layer_desc(), cpu_engine);

    ///
    /// Encoder : add the unidirectional rnn primitive with related arguments into encoder_net
    /// @snippet cpu_rnn_inference_int8.cpp push uni rnn to encoder net
    ///
    //[push uni rnn to encoder net]
    encoder_net.push_back(lstm_forward(enc_uni_prim_desc));
    encoder_net_args.push_back(
            {{DNNL_ARG_SRC_LAYER, enc_uni_first_dst_layer_memory},
                    {DNNL_ARG_WEIGHTS_LAYER, enc_uni_wei_layer_memory},
                    {DNNL_ARG_WEIGHTS_ITER, enc_uni_wei_iter_memory},
                    {DNNL_ARG_BIAS, user_enc_uni_bias_memory},
                    {DNNL_ARG_DST_LAYER, enc_dst_layer_memory}});
    //[push uni rnn to encoder net]

    ///
    /// **Decoder with attention mechanism**
    ///
    ///
    /// Decoder : declare memory dimensions
    /// @snippet cpu_rnn_inference_int8.cpp dec mem dim
    ///
    //[dec mem dim]
    std::vector<float> user_dec_wei_layer(
            dec_n_layers * 1 * feature_size * lstm_n_gates * feature_size,
            0.2f);
    std::vector<float> user_dec_wei_iter(dec_n_layers * 1
                    * (feature_size + feature_size) * lstm_n_gates
                    * feature_size,
            0.3f);
    std::vector<float> user_dec_bias(
            dec_n_layers * 1 * lstm_n_gates * feature_size, 1.0f);
    std::vector<int8_t> user_weights_attention_src_layer(
            feature_size * feature_size, 1);
    float weights_attention_scale = 127.;
    std::vector<float> user_weights_annotation(
            feature_size * feature_size, 1.0f);
    std::vector<float> user_weights_alignments(feature_size, 1.0f);
    // Buffer to store decoder output for all iterations
    std::vector<uint8_t> dec_dst(tgt_seq_length_max * batch * feature_size, 0);

    memory::dims user_dec_wei_layer_dims
            = {dec_n_layers, 1, feature_size, lstm_n_gates, feature_size};
    memory::dims user_dec_wei_iter_dims = {dec_n_layers, 1,
            feature_size + feature_size, lstm_n_gates, feature_size};
    memory::dims user_dec_bias_dims
            = {dec_n_layers, 1, lstm_n_gates, feature_size};
    memory::dims dec_src_layer_dims = {1, batch, feature_size};
    memory::dims dec_dst_layer_dims = {1, batch, feature_size};
    memory::dims dec_dst_iter_c_dims = {dec_n_layers, 1, batch, feature_size};
    //[dec mem dim]

    // We will use the same memory for dec_src_iter and dec_dst_iter.
    // However, dec_src_iter has a context vector while dec_dst_iter does not.
    // To resolve this we create one memory that holds the context vector as
    // well as both the hidden and cell states; for dst_iter we use a view on
    // this memory. Note that the cell state will be padded by feature_size
    // values, but we do not compute or access those.
    /// @snippet cpu_rnn_inference_int8.cpp noctx mem dim
    //[noctx mem dim]
    std::vector<float> dec_dst_iter(
            dec_n_layers * batch * 2 * feature_size, 1.0f);

    memory::dims dec_dst_iter_dims
            = {dec_n_layers, 1, batch, feature_size + feature_size};
    memory::dims dec_dst_iter_noctx_dims
            = {dec_n_layers, 1, batch, feature_size};
    //[noctx mem dim]

    ///
    /// Decoder : create memory descriptors
    /// Create memory descriptors for RNN data w/o specified layout
    /// @snippet cpu_rnn_inference_int8.cpp dec mem desc
    ///
    //[dec mem desc]
    auto user_dec_wei_layer_md = memory::desc({user_dec_wei_layer_dims},
            memory::data_type::f32, memory::format_tag::ldigo);
    auto user_dec_wei_iter_md = memory::desc({user_dec_wei_iter_dims},
            memory::data_type::f32, memory::format_tag::ldigo);
    auto user_dec_bias_md = memory::desc({user_dec_bias_dims},
            memory::data_type::f32, memory::format_tag::ldgo);
    auto dec_src_layer_md = memory::desc({dec_src_layer_dims},
            memory::data_type::u8, memory::format_tag::tnc);
    auto dec_dst_layer_md = memory::desc({dec_dst_layer_dims},
            memory::data_type::u8, memory::format_tag::tnc);
    auto dec_dst_iter_md = memory::desc({dec_dst_iter_dims},
            memory::data_type::f32, memory::format_tag::ldnc);
    auto dec_dst_iter_c_md = memory::desc({dec_dst_iter_c_dims},
            memory::data_type::f32, memory::format_tag::ldnc);
    //[dec mem desc]

    ///
    /// Decoder : Create memory
    /// @snippet cpu_rnn_inference_int8.cpp create dec memory
    ///
    //[create dec memory]
    auto user_dec_wei_layer_memory = memory(
            user_dec_wei_layer_md, cpu_engine, user_dec_wei_layer.data());
    auto user_dec_wei_iter_memory = memory(
            user_dec_wei_iter_md, cpu_engine, user_dec_wei_iter.data());
    auto user_dec_bias_memory
            = memory(user_dec_bias_md, cpu_engine, user_dec_bias.data());
    auto dec_src_layer_memory = memory(dec_src_layer_md, cpu_engine);
    auto dec_dst_layer_memory
            = memory(dec_dst_layer_md, cpu_engine, dec_dst.data());
    auto dec_dst_iter_c_memory = memory(dec_dst_iter_c_md, cpu_engine);
    //[create dec memory]

    // Create memory descriptors for RNN data w/o specified layout
    auto dec_wei_layer_md = memory::desc({user_dec_wei_layer_dims},
            memory::data_type::s8, memory::format_tag::any);
    auto dec_wei_iter_md = memory::desc({user_dec_wei_iter_dims},
            memory::data_type::s8, memory::format_tag::any);

    ///
    /// Decoder : As mentioned above, we create a view without the context out of the memory that holds it.
    /// @snippet cpu_rnn_inference_int8.cpp create noctx mem
    ///
    //[create noctx mem]
    auto dec_dst_iter_memory
            = memory(dec_dst_iter_md, cpu_engine, dec_dst_iter.data());
    auto dec_dst_iter_noctx_md = dec_dst_iter_md.submemory_desc(
            dec_dst_iter_noctx_dims, {0, 0, 0, 0, 0});
    //[create noctx mem]
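    // The sub-memory view keeps the strides of the full
    // (feature_size + feature_size) buffer, so the primitive writes the hidden
    // state into the first half of each row and leaves the context half intact.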

    auto dec_ctx_prim_desc = lstm_forward::primitive_desc(cpu_engine,
            prop_kind::forward_inference,
            rnn_direction::unidirectional_left2right, dec_src_layer_md,
            dec_dst_iter_md, dec_dst_iter_c_md, dec_wei_layer_md,
            dec_wei_iter_md, user_dec_bias_md, dec_dst_layer_md,
            dec_dst_iter_noctx_md, dec_dst_iter_c_md, attr);

    ///
    /// Decoder : Create memory for input data and use reorders to quantize values
    /// to int8
    /// @snippet cpu_rnn_inference_int8.cpp dec reorder
    ///
    //[dec reorder]
    auto dec_wei_layer_memory
            = memory(dec_ctx_prim_desc.weights_layer_desc(), cpu_engine);
    auto dec_wei_layer_reorder_pd = reorder::primitive_desc(
            user_dec_wei_layer_memory, dec_wei_layer_memory, attr);
    reorder(dec_wei_layer_reorder_pd)
            .execute(s, user_dec_wei_layer_memory, dec_wei_layer_memory);
    //[dec reorder]

    auto dec_wei_iter_memory
            = memory(dec_ctx_prim_desc.weights_iter_desc(), cpu_engine);
    auto dec_wei_iter_reorder_pd = reorder::primitive_desc(
            user_dec_wei_iter_memory, dec_wei_iter_memory, attr);
    reorder(dec_wei_iter_reorder_pd)
            .execute(s, user_dec_wei_iter_memory, dec_wei_iter_memory);

    decoder_net.push_back(lstm_forward(dec_ctx_prim_desc));
    decoder_net_args.push_back({{DNNL_ARG_SRC_LAYER, dec_src_layer_memory},
            {DNNL_ARG_SRC_ITER, dec_dst_iter_memory},
            {DNNL_ARG_SRC_ITER_C, dec_dst_iter_c_memory},
            {DNNL_ARG_WEIGHTS_LAYER, dec_wei_layer_memory},
            {DNNL_ARG_WEIGHTS_ITER, dec_wei_iter_memory},
            {DNNL_ARG_BIAS, user_dec_bias_memory},
            {DNNL_ARG_DST_LAYER, dec_dst_layer_memory},
            {DNNL_ARG_DST_ITER, dec_dst_iter_memory},
            {DNNL_ARG_DST_ITER_C, dec_dst_iter_c_memory}});

    // Allocating temporary buffers for attention mechanism
    std::vector<float> weighted_annotations(
            src_seq_length_max * batch * feature_size, 1.0f);
    std::vector<int32_t> weights_attention_sum_rows(feature_size, 1);

    ///
    /// **Execution**
    ///

    auto execute = [&]() {
        assert(encoder_net.size() == encoder_net_args.size()
                && "something is missing");
        ///
        /// run encoder (1 stream)
        /// @snippet cpu_rnn_inference_int8.cpp run enc
        ///
        //[run enc]
        for (size_t p = 0; p < encoder_net.size(); ++p)
            encoder_net.at(p).execute(s, encoder_net_args.at(p));
        //[run enc]

        // compute the weighted annotations once before the decoder
        ///
        /// we compute the weighted annotations once before the decoder
        /// @snippet cpu_rnn_inference_int8.cpp weight ano
        ///
        //[weight ano]
        compute_weighted_annotations(weighted_annotations.data(),
                src_seq_length_max, batch, feature_size,
                user_weights_annotation.data(),
                (float *)enc_dst_layer_memory.get_data_handle());
        //[weight ano]
        ///
        /// precompute compensation for s8u8s32 gemm in compute attention
        /// @snippet cpu_rnn_inference_int8.cpp s8u8s32
        ///
        //[s8u8s32]
        compute_sum_of_rows(user_weights_attention_src_layer.data(),
                feature_size, feature_size, weights_attention_sum_rows.data());
        //[s8u8s32]

        ///
        /// We initialize src_layer to the embedding of the end of
        /// sequence character, which is assumed to be 0 here
        /// @snippet cpu_rnn_inference_int8.cpp init src_layer
        ///
        //[init src_layer]
        memset(dec_src_layer_memory.get_data_handle(), 0,
                dec_src_layer_memory.get_desc().get_size());
        //[init src_layer]

        ///
        /// From now on, src points to the output of the last iteration
        ///
        for (dim_t i = 0; i < tgt_seq_length_max; i++) {
            uint8_t *src_att_layer_handle
                    = (uint8_t *)dec_src_layer_memory.get_data_handle();
            float *src_att_iter_handle
                    = (float *)dec_dst_iter_memory.get_data_handle();

            ///
            /// Compute attention context vector into the first layer src_iter
            /// @snippet cpu_rnn_inference_int8.cpp att ctx
            ///
            //[att ctx]
            compute_attention(src_att_iter_handle, src_seq_length_max, batch,
                    feature_size, user_weights_attention_src_layer.data(),
                    weights_attention_scale, weights_attention_sum_rows.data(),
                    src_att_layer_handle, data_scale, data_shift,
                    (uint8_t *)enc_bidir_dst_layer_memory.get_data_handle(),
                    weighted_annotations.data(),
                    user_weights_alignments.data());
            //[att ctx]

            ///
            /// copy the context vectors to all layers of src_iter
            /// @snippet cpu_rnn_inference_int8.cpp cp ctx
            ///
            //[cp ctx]
            copy_context(
                    src_att_iter_handle, dec_n_layers, batch, feature_size);
            //[cp ctx]

            assert(decoder_net.size() == decoder_net_args.size()
                    && "something is missing");
            ///
            /// run the decoder iteration
            /// @snippet cpu_rnn_inference_int8.cpp run dec iter
            ///
            //[run dec iter]
            for (size_t p = 0; p < decoder_net.size(); ++p)
                decoder_net.at(p).execute(s, decoder_net_args.at(p));
            //[run dec iter]

            ///
            /// Move the handle on the src/dst layer to the next iteration
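            /// (dec_dst holds the outputs of all iterations back to back, so
            /// the next src is the current dst and the next dst starts
            /// batch * feature_size elements further)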
            /// @snippet cpu_rnn_inference_int8.cpp set handle
            ///
            //[set handle]
            auto dst_layer_handle
                    = (uint8_t *)dec_dst_layer_memory.get_data_handle();
            dec_src_layer_memory.set_data_handle(dst_layer_handle);
            dec_dst_layer_memory.set_data_handle(
                    dst_layer_handle + batch * feature_size);
            //[set handle]
        }
    };

    /// @page cpu_rnn_inference_int8_cpp
    ///
    std::cout << "Parameters:" << std::endl
              << " batch = " << batch << std::endl
              << " feature size = " << feature_size << std::endl
              << " maximum source sequence length = " << src_seq_length_max
              << std::endl
              << " maximum target sequence length = " << tgt_seq_length_max
              << std::endl
              << " number of layers of the bidirectional encoder = "
              << enc_bidir_n_layers << std::endl
              << " number of layers of the unidirectional encoder = "
              << enc_unidir_n_layers << std::endl
              << " number of layers of the decoder = " << dec_n_layers
              << std::endl;

    execute();
    s.wait();
}

int main(int argc, char **argv) {
    return handle_example_errors({engine::kind::cpu}, simple_net);
}