1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example inference_int8_matmul.cpp |
18 | /// > Annotated version: @ref inference_int8_matmul_cpp |
19 | /// |
20 | /// @page inference_int8_matmul_cpp_short |
21 | /// C++ API example demonstrating how one can use |
22 | /// [MatMul](@ref dev_guide_matmul) fused with ReLU in INT8 inference. |
23 | /// |
24 | /// Concepts: |
25 | /// - Asymmetric quantization |
26 | /// - Scales: dnnl::primitive_attr::set_scales_mask() |
27 | /// - Zero points: dnnl::primitive_attr::set_zero_points_mask() |
28 | /// - [Operation fusion](@ref dev_guide_attributes_post_ops) |
29 | /// - Create primitive once, use multiple times |
30 | /// - Run-time tensor shapes: #DNNL_RUNTIME_DIM_VAL |
31 | /// - Weights pre-packing: use #dnnl::memory::format_tag::any |
32 | /// |
33 | /// @page inference_int8_matmul_cpp MatMul Tutorial: INT8 Inference |
34 | /// @copydetails inference_int8_matmul_cpp_short |
35 | /// |
36 | /// Assumptions: |
37 | /// 1. The shape of the weights (matrix \f$B(K, N)\f$) is known in advance, the |
38 | /// data type is `int8_t` and centered around 0 (i.e. the zero point is 0). |
39 | /// 2. The shapes of the source matrix \f$A\f$ and destination matrix \f$C\f$ |
40 | /// are partially unknown. Both matrices use `uint8_t` data type and might |
41 | /// have arbitrary zero points (specified at execution time only). |
/// 3. The scaling (re-quantization) factors are specified at run-time only.
43 | /// |
44 | /// Since the shape of weights is known in advance, the MatMul weights can be |
45 | /// created with format tag #dnnl::memory::format_tag::any to enable the library |
46 | /// to choose the most appropriate layout for best performance. |
47 | /// |
48 | /// @warning |
49 | /// The format tag #dnnl::memory::format_tag::any doesn't work for memory |
50 | /// descriptors that have one or more unknown dimensions and/or strides. |
51 | /// |
52 | /// @include inference_int8_matmul.cpp |
53 | |
54 | #include <cassert> |
55 | #include <cctype> |
56 | #include <cmath> |
57 | #include <cstdio> |
58 | #include <iostream> |
59 | #include <random> |
60 | #include <stdexcept> |
61 | #include <vector> |
62 | |
63 | #include "oneapi/dnnl/dnnl.hpp" |
64 | |
65 | #include "example_utils.hpp" |
66 | |
67 | using namespace dnnl; |
68 | |
namespace {

// Fills `v` with pseudo-random values drawn uniformly from [0, 1).
// The generator is default-seeded on purpose so the example is reproducible.
void init_vector(std::vector<float> &v) {
    std::mt19937 rng;
    std::uniform_real_distribution<float> dist(0.f, 1.f);
    for (float &elem : v)
        elem = dist(rng);
}

// Fills `v` with pseudo-random bytes spanning the full uint8_t range.
// Default-seeded for reproducibility, same as the float overload.
void init_vector(std::vector<uint8_t> &v) {
    std::mt19937 rng;
    std::uniform_int_distribution<unsigned int> dist(0, 255);
    for (uint8_t &elem : v)
        elem = static_cast<uint8_t>(dist(rng));
}

} // namespace
86 | |
87 | int number_of_runs = 1; |
88 | |
89 | // Create a MatMul primitive descriptor for the following op: |
90 | // C_u8 = ReLU(sc_A * sc_B[:] * (A_u8 - zp_A) * B_s8) / sc_C + zp_C |
91 | // |
92 | // Here: |
93 | // - Matrices A and C are known to be non-transposed but their M dimension is |
94 | // not known. They can be activation matrices in an MLP topology and the M |
95 | // dimension can be the mini-batch dimension. |
96 | // - zp_A and zp_C are zero points for matrices A and C which are stored as |
97 | // uint8_t. These are run-time parameters that are not known at the primitive |
98 | // creation time. |
99 | // - The B matrix is stored as int8_t, its zero point is 0, and all its |
100 | // dimensions are known. This matrix can be a matrix of weights in an MLP |
101 | // topology. |
102 | // - The scaling values are not known at the primitive creation time. |
103 | matmul::primitive_desc matmul_pd_create( |
104 | int64_t K, int64_t N, const engine &eng) { |
105 | const int64_t M = DNNL_RUNTIME_DIM_VAL; |
106 | |
107 | memory::desc a_md({M, K}, memory::data_type::u8, {K, 1}); // M x K layout |
108 | memory::desc b_md({K, N}, memory::data_type::s8, memory::format_tag::any); |
109 | memory::desc c_md({M, N}, memory::data_type::u8, {N, 1}); // M x N layout |
110 | |
111 | // Create attributes and indicate that the alpha and zero points are |
112 | // runtime parameters |
113 | primitive_attr attr; |
114 | attr.set_scales_mask(DNNL_ARG_SRC, /* mask */ 0); |
115 | attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask */ 1 << 1); |
116 | attr.set_scales_mask(DNNL_ARG_DST, /* mask */ 0); |
117 | attr.set_zero_points_mask(DNNL_ARG_SRC, /* mask */ 0); |
118 | attr.set_zero_points_mask(DNNL_ARG_DST, /* mask */ 0); |
119 | post_ops po; |
120 | po.append_eltwise(algorithm::eltwise_relu, 0.f, 0.f); |
121 | attr.set_post_ops(po); |
122 | |
123 | // Create a MatMul primitive descriptor |
124 | return matmul::primitive_desc(eng, a_md, b_md, c_md, attr); |
125 | } |
126 | |
127 | void prepare_input(memory &A_u8_mem, memory &sc_A_mem, memory &sc_B_mem, |
128 | memory &sc_C_mem, memory &zp_A_mem, memory &zp_C_mem) { |
129 | int64_t M = A_u8_mem.get_desc().get_dims()[0]; |
130 | int64_t N = sc_B_mem.get_desc().get_dims()[0]; |
131 | int64_t K = A_u8_mem.get_desc().get_dims()[1]; |
132 | |
133 | std::vector<uint8_t> A_u8(M * K); |
134 | init_vector(A_u8); |
135 | |
136 | std::vector<float> sc_B(N); |
137 | init_vector(sc_B); |
138 | |
139 | float sc_A = 0.5f; |
140 | float sc_C = 0.25f; |
141 | int32_t zp_A = 128, zp_C = 40; |
142 | |
143 | write_to_dnnl_memory(A_u8.data(), A_u8_mem); |
144 | write_to_dnnl_memory(&zp_A, zp_A_mem); |
145 | write_to_dnnl_memory(&zp_C, zp_C_mem); |
146 | write_to_dnnl_memory(&sc_A, sc_A_mem); |
147 | write_to_dnnl_memory(sc_B.data(), sc_B_mem); |
148 | write_to_dnnl_memory(&sc_C, sc_C_mem); |
149 | } |
150 | |
151 | void sanity_check(memory &C_u8_mem, memory &zp_C_mem) { |
152 | int64_t M = C_u8_mem.get_desc().get_dims()[0]; |
153 | int64_t N = C_u8_mem.get_desc().get_dims()[1]; |
154 | int32_t zp_C = 0; |
155 | std::vector<uint8_t> C_u8(M * N); |
156 | |
157 | read_from_dnnl_memory(C_u8.data(), C_u8_mem); |
158 | read_from_dnnl_memory(&zp_C, zp_C_mem); |
159 | |
160 | // simple check: C_u8 >= zp_C |
161 | for (int64_t i = 0; i < M * N; ++i) |
162 | if (C_u8[i] < zp_C) |
163 | throw std::logic_error( |
164 | "Smoke check failed." |
165 | "\n\tQuantized value is smaller than the zero point," |
166 | "\n\twhich should not happen since ReLU was applied." ); |
167 | } |
168 | |
169 | void infer(const matmul &matmul_p, int64_t M, int64_t N, int64_t K, |
170 | const memory &B_s8_mem, const engine &eng) { |
171 | // inputs of the current layer / operation |
172 | memory A_u8_mem({{M, K}, memory::data_type::u8, {K, 1}}, eng); |
173 | memory zp_A_mem({{1}, memory::data_type::s32, {1}}, eng); |
174 | memory zp_C_mem({{1}, memory::data_type::s32, {1}}, eng); |
175 | memory sc_A_mem({{1}, memory::data_type::f32, {1}}, eng); |
176 | memory sc_B_mem({{N}, memory::data_type::f32, {1}}, eng); |
177 | memory sc_C_mem({{1}, memory::data_type::f32, {1}}, eng); |
178 | |
179 | // the function below fills dnnl::memory with some values |
180 | // these memories, typically, come from the previous layers / operations |
181 | // with meaningful data inside |
182 | prepare_input(A_u8_mem, sc_A_mem, sc_B_mem, sc_C_mem, zp_A_mem, zp_C_mem); |
183 | |
184 | // output - no initialization required |
185 | memory C_u8_mem({{M, N}, memory::data_type::u8, {N, 1}}, eng); |
186 | |
187 | stream s(eng); |
188 | for (int run = 0; run < number_of_runs; ++run) |
189 | matmul_p.execute(s, |
190 | {{DNNL_ARG_SRC, A_u8_mem}, {DNNL_ARG_WEIGHTS, B_s8_mem}, |
191 | {DNNL_ARG_DST, C_u8_mem}, |
192 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, sc_A_mem}, |
193 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, sc_B_mem}, |
194 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, sc_C_mem}, |
195 | {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, zp_A_mem}, |
196 | {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, zp_C_mem}}); |
197 | s.wait(); |
198 | |
199 | // a sanity check for the correctness of the output |
200 | sanity_check(C_u8_mem, zp_C_mem); |
201 | } |
202 | |
203 | void inference_int8_matmul(engine::kind engine_kind) { |
204 | engine eng(engine_kind, 0); |
205 | |
206 | const int64_t K = 96; |
207 | const int64_t N = 1000; |
208 | auto matmul_pd = matmul_pd_create(K, N, eng); |
209 | |
210 | // Original weights stored as float in a known format |
211 | std::vector<float> B_f32(K * N); |
212 | init_vector(B_f32); |
213 | |
214 | // Pre-packed weights stored as int8_t |
215 | memory B_s8_mem(matmul_pd.weights_desc(), eng); |
216 | { |
217 | stream s(eng); |
218 | memory B_f32_mem( |
219 | {{K, N}, memory::data_type::f32, memory::format_tag::ab}, eng); |
220 | write_to_dnnl_memory(B_f32.data(), B_f32_mem); |
221 | reorder(B_f32_mem, B_s8_mem).execute(s, B_f32_mem, B_s8_mem); |
222 | s.wait(); |
223 | } |
224 | |
225 | matmul matmul_p(matmul_pd); |
226 | |
227 | for (int64_t M : {1, 100}) |
228 | infer(matmul_p, M, N, K, B_s8_mem, eng); |
229 | } |
230 | |
231 | int main(int argc, char **argv) { |
232 | engine::kind engine_kind = parse_engine_kind(argc, argv); |
233 | return handle_example_errors(inference_int8_matmul, engine_kind); |
234 | } |
235 | |