/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example cpu_matmul_quantization.cpp
/// > Annotated version: @ref cpu_matmul_quantization_cpp
///
/// @page cpu_matmul_quantization_cpp_short
/// C++ API example demonstrating how one can perform reduced precision
/// matrix-matrix multiplication using [MatMul](@ref dev_guide_matmul) and how
/// the accuracy of the result compares to floating point computations.
///
/// Concepts:
/// - **Static** and **dynamic** quantization
/// - Asymmetric quantization
///   - Scales: dnnl::primitive_attr::set_scales_mask()
///   - Zero points: dnnl::primitive_attr::set_zero_points_mask()
///
/// @page cpu_matmul_quantization_cpp MatMul Tutorial: Quantization
/// @copydetails cpu_matmul_quantization_cpp_short
///
/// The example focuses on the following computation:
/// \f[
///     C = A \times B
/// \f]
///
/// First, we produce the reference result, with the original matrices
/// \f$A\f$ and \f$B\f$ in the #dnnl::memory::data_type::f32 data type.
///
/// For reduced precision computations, the matrices \f$A\f$ and \f$C\f$ will
/// use the #dnnl::memory::data_type::u8 data type and will have the
/// appropriate zero points. For the matrix \f$B\f$, we will use the
/// #dnnl::memory::data_type::s8 data type, assuming that the data is centered
/// around zero (hence, the zero point is simply 0).
///
/// The quantization formula is:
/// \f[
///     X_{f32}(:) := scale\_X \cdot (X_{int8}(:) - zp\_X),
/// \f]
///
/// where:
/// - \f$X_{f32}(:)\f$ -- original matrix;
///
/// - \f$X_{int8}(:)\f$ -- quantized matrix, where `int8` is either `u8`
///   (`uint8_t`) for the matrices \f$A\f$ and \f$C\f$, or `s8` (`int8_t`)
///   for the matrix \f$B\f$;
///
/// - \f$scale\_X\f$ -- `f32` scaling factor. For simplicity, we will use a
///   single scale factor for each matrix, though for better accuracy it might
///   be a good idea to use a per-N-dimension scaling factor for the matrix
///   \f$B\f$.
///
/// - \f$zp\_X\f$ -- integer quantization parameter "zero point" (essentially,
///   the representation of the real 0 in the quantized data type).
///
/// For a given matrix \f$X_{f32}\f$ and `int8` data type (`u8` or `s8`), the
/// process of finding the proper \f$scale\_X\f$ and \f$zp\_X\f$ is a research
/// problem and can differ depending on the domain. For the purposes of this
/// example, we use the simplest approach: map the maximum (minimum)
/// \f$X_{f32}\f$ elements to the maximum (minimum) number in the corresponding
/// integer data type, using the following formulas:
///
/// 1. Since:
///    - \f$max(X_{f32}(:)) = scale\_X \cdot (max_{int8} - zp\_X)\f$
///    - \f$min(X_{f32}(:)) = scale\_X \cdot (min_{int8} - zp\_X)\f$
///
/// 2. Hence:
///    - \f$scale\_X =
///      \frac{max(X_{f32}(:)) - min(X_{f32}(:))}{max_{int8} - min_{int8}}\f$
///    - \f$zp\_X = max_{int8} - \frac{max(X_{f32}(:))}{scale\_X}\f$
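///
/// For illustration only (the code below additionally shrinks the integer
/// range slightly for accuracy and hardware reasons), assume that a matrix
/// quantized to `u8` (so \f$min_{int8} = 0\f$ and \f$max_{int8} = 255\f$) has
/// values spanning \f$[-2.0, 1.4]\f$. Then:
/// \f[
///     scale\_X = \frac{1.4 - (-2.0)}{255 - 0} \approx 0.0133, \qquad
///     zp\_X = 255 - \frac{1.4}{0.0133} \approx 150,
/// \f]
/// so the real value \f$0\f$ maps to the integer \f$150\f$, \f$-2.0\f$ maps
/// to \f$0\f$, and \f$1.4\f$ maps to \f$255\f$.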
///
/// It is worth noting that the quantization parameters are not always computed
/// at actual run-time. For example, if we perform the MatMul operation on
/// _similar_ matrices (in the sense that the data distribution is similar
/// between runs), we can simply _guess_ the proper quantization parameters by
/// collecting some statistics during the early runs. This approach is called
/// **static** quantization. It gives good performance (since no cycles are
/// spent on computing those parameters) and is typically used in reduced
/// precision CNN inference. However, **static** quantization has an obvious
/// disadvantage -- the _guessed_ parameters might not work well for some
/// particular matrices. For example, that would most likely be the case if we
/// could not guarantee the similarity of the input matrices. In this case,
/// **dynamic** quantization is used, i.e. the parameters are (re-)computed at
/// runtime. This gives slightly worse performance, but that might be
/// inevitable due to accuracy considerations.
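///
/// In code, the only difference between the two schemes is where the
/// quantization parameters come from. A minimal sketch (here
/// `calibrated_scale_A` and `calibrated_zp_A` are hypothetical values
/// estimated offline, while compute_q10n_params() is the helper defined
/// below):
///
///     // Dynamic quantization: recompute the parameters on every run.
///     compute_q10n_params<uint8_t>("A", A_f32, scale_A, zp_A);
///
///     // Static quantization: reuse parameters estimated ahead of time.
///     scale_A = calibrated_scale_A;
///     zp_A = calibrated_zp_A;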
///
/// Only the dynamic approach is demonstrated in this example.
///
/// Other details:
/// - For simplicity, all matrices will be stored in row-major format (see the
///   short sketch right after this list).
/// - The shapes of the matrices are assumed to be known at creation time.
///   However, for dynamic quantization we consider the q10n parameters
///   (\f$scale\_X\f$ and \f$zp\_X\f$) to be known at run-time only. By
///   contrast, for static quantization these parameters would be known at
///   creation time as well.
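///
/// As a sketch of the first point (mirroring the memory descriptors created
/// later in this example), a row-major \f$M \times K\f$ `f32` matrix is
/// described by its dimensions plus row-major strides:
///
///     memory::desc a_md({M, K}, memory::data_type::f32, {K, 1});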
///
/// @include cpu_matmul_quantization.cpp

#include <algorithm>
#include <cassert>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>
#include <type_traits>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

namespace {

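// Fill a vector with random real numbers uniformly distributed in
// [min_value, max_value].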
void init_vector(std::vector<float> &v, float min_value, float max_value) {
    std::mt19937 gen;
    std::uniform_real_distribution<float> u(min_value, max_value);

    for (auto &e : v)
        e = u(gen);
}

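// Find the minimum and maximum elements of a vector (returned as floats).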
template <typename T>
void find_min_max(const std::vector<T> &v, float &min_value, float &max_value) {
    min_value = max_value = v[0];
    for (auto &e : v) {
        min_value = std::min<float>(min_value, e);
        max_value = std::max<float>(max_value, e);
    }
}

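// Compute the quantization parameters (scale and zero point) that map the
// range of the data in `v` onto the range of the integer type T, following
// the formulas from the tutorial text above.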
template <typename T>
void compute_q10n_params(const char *message, const std::vector<float> &v,
        float &scale, int32_t &zp) {
    // Find the properties of the integer type T.
    // Simple trick to improve accuracy: shrink the range a little bit.
    float max_int = (float)std::numeric_limits<T>::max() - 1;
    float min_int = (float)std::numeric_limits<T>::lowest() + 1;

#ifndef OMIT_WORKAROUND_FOR_SKX
    // Read more in CPU / Section 1 here:
    // https://oneapi-src.github.io/oneDNN/dev_guide_int8_computations.html
    if (std::is_same<T, uint8_t>::value) max_int /= 2;
#endif

    // Find the min and max values in the array
    float min_val = v[0], max_val = v[0];
    find_min_max(v, min_val, max_val);

    // Compute the appropriate scale
    scale = (max_val - min_val) / (max_int - min_int);

    // Compute the appropriate offset (zero point)
    if (std::is_same<T, int8_t>::value)
        zp = 0;
    else
        zp = (int32_t)(max_int - max_val / scale);
    printf("\tComputing q10n params for %s\n"
           "\t\tData type: %s\n"
           "\t\tScale:%.3g (inverse scale:%.3g)\n"
           "\t\tZero point:%d\n\n",
            message, std::is_same<T, int8_t>::value ? "int8_t" : "uint8_t",
            scale, 1 / scale, zp);
}

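// Compare an f32 reference vector with a quantized u8 vector by dequantizing
// the latter and checking the relative L2 error against the given threshold.
// Returns 0 on success and 1 on failure.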
int compare_vectors(const std::vector<float> &v1,
        const std::vector<uint8_t> &v2, float scale_v2, int32_t zp_v2,
        float threshold) {
    double v1_l2 = 0, diff_l2 = 0;
    for (size_t n = 0; n < v1.size(); ++n) {
        float v2_n = scale_v2 * (v2[n] - zp_v2); // deq10n v2
        float diff = v1[n] - v2_n;
        v1_l2 += v1[n] * v1[n];
        diff_l2 += diff * diff;
    }

    v1_l2 = std::sqrt(v1_l2);
    diff_l2 = std::sqrt(diff_l2);
    bool ok = diff_l2 <= threshold * v1_l2;

    printf("\tComparison (using l2-norms)\n"
           "\t\tReference matrix:%g\n\t\tError:%g\n\t\tRelative error:%g\n"
           "\nAccuracy check: %s\n\n",
            v1_l2, diff_l2, diff_l2 / v1_l2, ok ? "OK" : "FAILED");

    return ok ? 0 : 1;
}

} // namespace

engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity

// Quantize float data into the X_int_m oneDNN memory using the q10n parameters
//
// Inputs:
// - X_f32 -- source f32 matrix
// - scale_X, zp_X -- quantization parameters
// Outputs:
// - X_int_m -- prepared oneDNN memory that will hold the quantized values
void quantize(const std::vector<float> &X_f32, float scale_X, int32_t zp_X,
        memory &X_int_m) {
    using dt = memory::data_type;

    stream s(eng);

    memory::desc x_int_md = X_int_m.get_desc();
    const auto &dims = x_int_md.get_dims();

    memory::desc x_f32_md({dims[0], dims[1]}, dt::f32, {dims[1], 1});
    memory X_f32_m(x_f32_md, eng, (void *)X_f32.data());

    primitive_attr q10n_attr;
    q10n_attr.set_scales_mask(DNNL_ARG_DST, /* mask */ 0);
    q10n_attr.set_zero_points_mask(DNNL_ARG_DST, /* mask */ 0);

    reorder::primitive_desc q10n_pd(eng, x_f32_md, eng, x_int_md, q10n_attr);
    memory dst_scale_X_m({{1}, dt::f32, {1}}, eng, &scale_X);
    memory zp_X_m({{1}, dt::s32, {1}}, eng, &zp_X);
    reorder(q10n_pd).execute(s,
            {{DNNL_ARG_SRC, X_f32_m}, {DNNL_ARG_DST, X_int_m},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scale_X_m},
                    {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, zp_X_m}});

    s.wait();
}

// Floating point MatMul
// Inputs:
// - Shape: M, N, K
// - Matrices A and B
// Outputs:
// - Matrix C
void f32_matmul_compute(int64_t M, int64_t N, int64_t K,
        const std::vector<float> &A_f32, const std::vector<float> &B_f32,
        std::vector<float> &C_f32) {
    // Initialize memory descriptors that describe matrices in row-major format
    memory::desc a_md({M, K}, memory::data_type::f32, {K, 1});
    memory::desc b_md({K, N}, memory::data_type::f32, {N, 1});
    memory::desc c_md({M, N}, memory::data_type::f32, {N, 1});

    // Wrap raw pointers into oneDNN memory objects
    memory A_f32_m(a_md, eng, (void *)A_f32.data());
    memory B_f32_m(b_md, eng, (void *)B_f32.data());
    memory C_f32_m(c_md, eng, (void *)C_f32.data());

    // Create a MatMul primitive
    matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md);
    matmul matmul_p(matmul_pd);

    stream s(eng);
    matmul_p.execute(s,
            {{DNNL_ARG_SRC, A_f32_m}, {DNNL_ARG_WEIGHTS, B_f32_m},
                    {DNNL_ARG_DST, C_f32_m}});
    s.wait();
}

// Reduced precision MatMul with **dynamic** quantization
// Inputs:
// - Shape: M, N, K
// - Matrices A and B in float (will be quantized inside the function)
// Outputs:
// - Matrix C in uint8_t
// - Quantization parameters: scale_C and zp_C
void dynamic_q10n_matmul(int64_t M, int64_t N, int64_t K,
        const std::vector<float> &A_f32, const std::vector<float> &B_f32,
        std::vector<uint8_t> &C_u8, float &scale_C, int32_t &zp_C) {
    stream s(eng);

    float scale_A, scale_B;
    int32_t zp_A, zp_B;

    // We compute the q10n parameters here, but in real-world applications
    // these parameters for the inputs are typically passed in from the
    // previous layers
    compute_q10n_params<uint8_t>("A", A_f32, scale_A, zp_A);
    compute_q10n_params<int8_t>("B", B_f32, scale_B, zp_B);
    assert(zp_B == 0 && "for int8 q10n we assume zero point = 0");

    // Quantize matrix A into A_u8 using the reorder primitive
    std::vector<uint8_t> A_u8(M * K, 0);
    memory::desc a_u8_md({M, K}, memory::data_type::u8, {K, 1});
    memory A_u8_m(a_u8_md, eng, (void *)A_u8.data());
    quantize(A_f32, scale_A, zp_A, A_u8_m);

    // Quantize matrix B into B_s8 using the reorder primitive
    std::vector<int8_t> B_s8(K * N, 0);
    memory::desc b_s8_md({K, N}, memory::data_type::s8, {N, 1});
    memory B_s8_m(b_s8_md, eng, (void *)B_s8.data());
    quantize(B_f32, scale_B, 0, B_s8_m);

    // Compute C_f32. We cannot directly compute C_u8 since we don't know the
    // appropriate quantization parameters.
    //
    // Note: typically the computed data type in this case is int32_t and not
    //       float. But for brevity we are going to embed the scale_A and
    //       scale_B directly in this quantized MatMul, and hence will get the
    //       intermediate computation in floating point anyway, so there is
    //       no point in converting the result to int32_t.
    //       In theory, we could postpone using the scale_A and scale_B,
    //       compute the exact C_s32 := (A_u8 - zp_A) * B_s8, and then find
    //       the appropriate quantization parameters for matrix C.
    //       Let it be an exercise :)

    std::vector<float> C_f32(M * N, 0);
    memory::desc c_f32_md({M, N}, memory::data_type::f32, {N, 1});
    memory C_f32_m(c_f32_md, eng, (void *)C_f32.data());

    // Create and compute a reduced precision MatMul primitive
    {
        primitive_attr matmul_attr;
        matmul_attr.set_scales_mask(DNNL_ARG_SRC, /* mask */ 0);
        matmul_attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask */ 0);
        matmul_attr.set_zero_points_mask(DNNL_ARG_SRC, /* mask */ 0);

        matmul::primitive_desc matmul_pd(
                eng, a_u8_md, b_s8_md, c_f32_md, matmul_attr);
        matmul matmul_p(matmul_pd);

        memory scales_A_m({{1}, memory::data_type::f32, {1}}, eng, &scale_A);
        memory scales_B_m({{1}, memory::data_type::f32, {1}}, eng, &scale_B);
        memory zp_A_m({{1}, memory::data_type::s32, {1}}, eng, &zp_A);

        matmul_p.execute(s,
                {{DNNL_ARG_SRC, A_u8_m}, {DNNL_ARG_WEIGHTS, B_s8_m},
                        {DNNL_ARG_DST, C_f32_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scales_A_m},
                        {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, scales_B_m},
                        {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, zp_A_m}});
    }

    // Find quantization parameters for matrix C
    compute_q10n_params<uint8_t>("C", C_f32, scale_C, zp_C);

    // Finally quantize the matrix C
    memory::desc c_u8_md({M, N}, memory::data_type::u8, {N, 1});
    memory C_u8_m(c_u8_md, eng, (void *)C_u8.data());
    quantize(C_f32, scale_C, zp_C, C_u8_m);
}

void compare_f32_and_quantized_matmuls() {
    // MatMul parameters
    const int64_t M = 10, N = 20, K = 30;

    // Data distribution for matrices A and B
    const float param_A_min_val = -2.f;
    const float param_A_max_val = 1.4f;

    const float param_B_min_val = -1.f;
    const float param_B_max_val = -param_B_min_val; // B is centered around 0

    // Thresholds
    //
    const float threshold_dynamic_q10n = 3 * 1e-2f;

    // Prepare matrices
    std::vector<float> A_f32(M * K), B_f32(K * N), C_f32(M * N, 0);
    init_vector(A_f32, param_A_min_val, param_A_max_val);
    init_vector(B_f32, param_B_min_val, param_B_max_val);

    // Compute _true_ f32 result
    f32_matmul_compute(M, N, K, A_f32, B_f32, C_f32);

    std::vector<uint8_t> C_u8_dynamic_q10n(M * N, 0);

    float scale_C_dynamic_q10n; // Q10n parameters we don't know yet
    int32_t zp_C_dynamic_q10n;

    dynamic_q10n_matmul(M, N, K, A_f32, B_f32, C_u8_dynamic_q10n,
            scale_C_dynamic_q10n, zp_C_dynamic_q10n);

    // Compare _true_ f32 result with dynamic q10n
    int rc = compare_vectors(C_f32, C_u8_dynamic_q10n, scale_C_dynamic_q10n,
            zp_C_dynamic_q10n, threshold_dynamic_q10n);
    if (rc) throw std::logic_error("Dynamic quantization accuracy failed.");
}

int main(int argc, char **argv) {
    return handle_example_errors(
            {engine::kind::cpu}, compare_f32_and_quantized_matmuls);
}