1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cpu_sgemm_and_matmul.cpp |
18 | /// > Annotated version: @ref cpu_sgemm_and_matmul_cpp |
19 | /// |
20 | /// @page cpu_sgemm_and_matmul_cpp_short |
21 | /// C++ API example demonstrating [MatMul](@ref dev_guide_matmul) |
22 | /// as a replacement for SGEMM functions. |
23 | /// |
24 | /// Concepts: |
25 | /// - Create primitive once, use multiple times |
26 | /// - Run-time tensor shapes: #DNNL_RUNTIME_DIM_VAL |
27 | /// - Scales: dnnl::primitive_attr::set_scales_mask() |
28 | /// |
29 | /// @page cpu_sgemm_and_matmul_cpp MatMul Tutorial: Comparison with SGEMM |
30 | /// @copydetails cpu_sgemm_and_matmul_cpp_short |
31 | /// |
32 | /// We will show two modes for the MatMul primitive: |
33 | /// 1. The shapes of the input and output matrices are passed at execution time. |
34 | /// This enables you to create a primitive only once and use it for different |
35 | /// matrices, just like normal SGEMM (though with a handle -- oneDNN |
36 | /// primitive). |
37 | /// To indicate the unknown dimensions and floating point values, you should |
38 | /// use #DNNL_RUNTIME_DIM_VAL and #DNNL_RUNTIME_F32_VAL respectively. |
39 | /// 2. The shapes of the input and output matrices are passed at creation time, |
/// as in the oneDNN programming model.
41 | /// This enables creating a highly specialized kernel for the given problem |
42 | /// sizes with the loss of generality. |
43 | /// |
44 | /// Users are free to choose between these two options, as well as any |
45 | /// intermediate ones (e.g., specifying some of the parameters at creation time |
46 | /// while leaving the others until execution time). This enables balancing |
47 | /// between flexibility and performance. |
48 | /// |
49 | /// @note |
50 | /// The more you specify at creation time, the better performance is. |
51 | /// |
52 | /// @include cpu_sgemm_and_matmul.cpp |
53 | |
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>
62 | |
63 | #include "oneapi/dnnl/dnnl.hpp" |
64 | |
65 | #include "example_utils.hpp" |
66 | |
67 | using namespace dnnl; |
68 | |
69 | namespace { |
70 | |
71 | void init_vector(std::vector<float> &v) { |
72 | std::mt19937 gen; |
73 | std::uniform_real_distribution<float> u(-1, 1); |
74 | |
75 | for (auto &e : v) |
76 | e = u(gen); |
77 | } |
78 | |
// Compare two result vectors using the relative L2 error and print a short
// report. Returns 0 when the vectors agree within the tolerance, 1 otherwise.
//
// @param v1      reference values
// @param v2      values under test; must have the same length as v1
// @param K       reduction (inner) dimension of the GEMM that produced the
//                data; each output element accumulates ~K rounding errors
// @param message header line for the printed report
int compare_vectors(const std::vector<float> &v1, const std::vector<float> &v2,
        int64_t K, const char *message) {
    // Comparing vectors of different lengths is a programming error: the
    // original loop would read past the end of the shorter vector.
    assert(v1.size() == v2.size());

    double v1_l2 = 0, diff_l2 = 0;
    for (size_t n = 0; n < v1.size(); ++n) {
        float diff = v1[n] - v2[n];
        v1_l2 += v1[n] * v1[n];
        diff_l2 += diff * diff;
    }

    v1_l2 = std::sqrt(v1_l2);
    diff_l2 = std::sqrt(diff_l2);

    // Finding a reasonable (tight and accurate) threshold is quite a
    // difficult problem. The implementation testing might also use special
    // data filling to alleviate issues related to the finite precision
    // arithmetic. However, in simple cases the machine epsilon multiplied by
    // log(K) should work reasonably well.
    const double threshold = std::numeric_limits<float>::epsilon()
            * std::log(std::max(2., (double)K));
    bool ok = diff_l2 <= threshold * v1_l2;

    // Guard against 0/0 when the reference norm is exactly zero so the
    // report never prints NaN (the pass/fail decision above is unaffected).
    const double rel_err = v1_l2 > 0 ? diff_l2 / v1_l2 : diff_l2;

    printf("%s\n\tL2 Norms"
            "\n\t\tReference matrix:%g\n\t\tError:%g\n\t\tRelative_error:%g\n"
            "\tAccuracy check: %s\n",
            message, v1_l2, diff_l2, rel_err, ok ? "OK" : "FAILED");

    return ok ? 0 : 1;
}
108 | |
109 | } // namespace |
110 | |
// How many times each GEMM/MatMul variant is executed (bump for timing runs).
int number_of_runs = 1;
// The dynamic MatMul primitive bakes beta in at creation time (see
// dynamic_matmul_create()); this is the value it is created with.
float fixed_beta = 0.f;

engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity
115 | |
116 | // Create a _dynamic_ MatMul primitive that can work with arbitrary shapes |
117 | // and alpha parameters. |
118 | // Warning: current limitation is that beta parameter should be known in |
119 | // advance (use fixed_beta). |
120 | matmul dynamic_matmul_create() { |
121 | // We assume that beta is known at the primitive creation time |
122 | float beta = fixed_beta; |
123 | |
124 | memory::dims a_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
125 | memory::dims b_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
126 | memory::dims c_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
127 | |
128 | memory::dims a_strides = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
129 | memory::dims b_strides = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
130 | memory::dims c_strides = {DNNL_RUNTIME_DIM_VAL, 1}; |
131 | |
132 | memory::desc a_md(a_shape, memory::data_type::f32, a_strides); |
133 | memory::desc b_md(b_shape, memory::data_type::f32, b_strides); |
134 | memory::desc c_md(c_shape, memory::data_type::f32, c_strides); |
135 | |
136 | // Create attributes (to handle alpha dynamically and beta if necessary) |
137 | primitive_attr attr; |
138 | attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask */ 0); |
139 | if (beta != 0.f) { |
140 | post_ops po; |
141 | po.append_sum(beta); |
142 | attr.set_post_ops(po); |
143 | } |
144 | |
145 | // Create a MatMul primitive |
146 | matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md, attr); |
147 | return matmul(matmul_pd); |
148 | } |
149 | |
150 | // Execute a _dynamic_ MatMul primitive created earlier. All the parameters are |
151 | // passed at a run-time (except for beta which has to be specified at the |
152 | // primitive creation time due to the current limitation). |
153 | void dynamic_matmul_execute(matmul &matmul_p, char transA, char transB, |
154 | int64_t M, int64_t N, int64_t K, float alpha, const float *A, |
155 | int64_t lda, const float *B, int64_t ldb, float beta, float *C, |
156 | int64_t ldc) { |
157 | using dims = memory::dims; |
158 | |
159 | if (beta != fixed_beta) |
160 | throw std::logic_error("Run-time beta is not yet supported." ); |
161 | |
162 | // Translate transA and transB |
163 | dims a_strides = tolower(transA) == 'n' ? dims {lda, 1} : dims {1, lda}; |
164 | dims b_strides = tolower(transB) == 'n' ? dims {ldb, 1} : dims {1, ldb}; |
165 | |
166 | // Wrap raw pointers into oneDNN memories (with proper shapes) |
167 | memory A_m({{M, K}, memory::data_type::f32, a_strides}, eng, (void *)A); |
168 | memory B_m({{K, N}, memory::data_type::f32, b_strides}, eng, (void *)B); |
169 | memory C_m({{M, N}, memory::data_type::f32, {ldc, 1}}, eng, (void *)C); |
170 | |
171 | // Prepare oneDNN memory for alpha |
172 | memory alpha_m({{1}, memory::data_type::f32, {1}}, eng, &alpha); |
173 | |
174 | // Execute the MatMul primitive |
175 | stream s(eng); |
176 | matmul_p.execute(s, |
177 | {{DNNL_ARG_SRC, A_m}, {DNNL_ARG_WEIGHTS, B_m}, {DNNL_ARG_DST, C_m}, |
178 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, alpha_m}}); |
179 | s.wait(); |
180 | } |
181 | |
182 | void sgemm_and_matmul_with_params(char transA, char transB, int64_t M, |
183 | int64_t N, int64_t K, float alpha, float beta) { |
184 | if (beta != fixed_beta) |
185 | throw std::logic_error("Run-time beta is not yet supported." ); |
186 | |
187 | // Allocate and initialize matrices |
188 | std::vector<float> A(M * K); |
189 | init_vector(A); |
190 | |
191 | std::vector<float> B(K * N); |
192 | init_vector(B); |
193 | |
194 | std::vector<float> C_sgemm(M * N); |
195 | init_vector(C_sgemm); |
196 | |
197 | std::vector<float> C_dynamic_matmul = C_sgemm; |
198 | |
199 | // Prepare leading dimensions |
200 | int64_t lda = tolower(transA) == 'n' ? K : M; |
201 | int64_t ldb = tolower(transB) == 'n' ? N : K; |
202 | int64_t ldc = N; |
203 | |
204 | // 1. Execute sgemm |
205 | for (int run = 0; run < number_of_runs; ++run) |
206 | dnnl_sgemm(transA, transB, M, N, K, alpha, A.data(), lda, B.data(), ldb, |
207 | beta, C_sgemm.data(), ldc); |
208 | |
209 | // 2.a Create dynamic MatMul |
210 | auto dynamic_matmul = dynamic_matmul_create(); |
211 | // 2.b Execute |
212 | for (int run = 0; run < number_of_runs; ++run) |
213 | dynamic_matmul_execute(dynamic_matmul, transA, transB, M, N, K, alpha, |
214 | A.data(), lda, B.data(), ldb, beta, C_dynamic_matmul.data(), |
215 | ldc); |
216 | |
217 | int rc = 0; |
218 | rc |= compare_vectors( |
219 | C_sgemm, C_dynamic_matmul, K, "Compare SGEMM vs dynamic MatMul" ); |
220 | if (rc) throw std::logic_error("The resulting matrices diverged too much." ); |
221 | } |
222 | |
223 | void sgemm_and_matmul() { |
224 | sgemm_and_matmul_with_params('N', 'T', 10, 20, 30, 1.1f, fixed_beta); |
225 | } |
226 | |
int main(int argc, char **argv) {
    // handle_example_errors() (from example_utils.hpp) runs the example and
    // converts exceptions into an appropriate process exit status.
    return handle_example_errors({engine::kind::cpu}, sgemm_and_matmul);
}
230 | |