/*******************************************************************************
* Copyright 2019-2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

/// @example cpu_sgemm_and_matmul.cpp
/// > Annotated version: @ref cpu_sgemm_and_matmul_cpp
///
/// @page cpu_sgemm_and_matmul_cpp_short
/// C++ API example demonstrating [MatMul](@ref dev_guide_matmul)
/// as a replacement for SGEMM functions.
///
/// Concepts:
/// - Create primitive once, use multiple times
/// - Run-time tensor shapes: #DNNL_RUNTIME_DIM_VAL
/// - Scales: dnnl::primitive_attr::set_scales_mask()
///
/// @page cpu_sgemm_and_matmul_cpp MatMul Tutorial: Comparison with SGEMM
/// @copydetails cpu_sgemm_and_matmul_cpp_short
///
/// We will show two modes for the MatMul primitive:
/// 1. The shapes of the input and output matrices are passed at execution
///    time. This enables you to create a primitive only once and use it for
///    different matrices, much like a regular SGEMM call (though through a
///    handle to a oneDNN primitive). To indicate the dimensions and
///    floating-point values that are unknown at creation time, use
///    #DNNL_RUNTIME_DIM_VAL and #DNNL_RUNTIME_F32_VAL respectively.
/// 2. The shapes of the input and output matrices are passed at creation
///    time, as in the regular oneDNN programming model. This enables the
///    library to create a kernel highly specialized for the given problem
///    sizes, at the cost of generality (a minimal sketch of this mode
///    appears after dynamic_matmul_create() below).
///
/// Users are free to choose between these two options, as well as any
/// intermediate ones (e.g., specifying some of the parameters at creation
/// time while leaving the others until execution time). This enables
/// balancing between flexibility and performance.
///
/// @note
///     The more parameters are specified at creation time, the better the
///     performance.
///
/// @include cpu_sgemm_and_matmul.cpp

#include <cassert>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>

#include "oneapi/dnnl/dnnl.hpp"

#include "example_utils.hpp"

using namespace dnnl;

namespace {

void init_vector(std::vector<float> &v) {
    // A default-constructed mt19937 uses a fixed seed, so the generated
    // data is reproducible across runs.
    std::mt19937 gen;
    std::uniform_real_distribution<float> u(-1, 1);

    for (auto &e : v)
        e = u(gen);
}
int compare_vectors(const std::vector<float> &v1, const std::vector<float> &v2,
        int64_t K, const char *message) {
    double v1_l2 = 0, diff_l2 = 0;
    for (size_t n = 0; n < v1.size(); ++n) {
        float diff = v1[n] - v2[n];
        v1_l2 += v1[n] * v1[n];
        diff_l2 += diff * diff;
    }

    v1_l2 = std::sqrt(v1_l2);
    diff_l2 = std::sqrt(diff_l2);

    // Finding a reasonable (tight and accurate) threshold is quite a
    // difficult problem. Implementation testing might also use special data
    // filling to alleviate the issues of finite-precision arithmetic.
    // However, in simple cases the machine epsilon multiplied by log(K)
    // works reasonably well.
    const double threshold = std::numeric_limits<float>::epsilon()
            * std::log(std::max(2., (double)K));
    bool ok = diff_l2 <= threshold * v1_l2;

    printf("%s\n\tL2 Norms"
           "\n\t\tReference matrix:%g\n\t\tError:%g\n\t\tRelative_error:%g\n"
           "\tAccuracy check: %s\n",
            message, v1_l2, diff_l2, diff_l2 / v1_l2, ok ? "OK" : "FAILED");

    return ok ? 0 : 1;
}

} // namespace

int number_of_runs = 1;
float fixed_beta = 0.f;

engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity

// Create a _dynamic_ MatMul primitive that can work with arbitrary shapes
// and alpha parameters.
// Warning: the current limitation is that the beta parameter must be known
// in advance (hence fixed_beta).
matmul dynamic_matmul_create() {
    // We assume that beta is known at the primitive creation time
    float beta = fixed_beta;

    memory::dims a_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL};
    memory::dims b_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL};
    memory::dims c_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL};

    memory::dims a_strides = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL};
    memory::dims b_strides = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL};
    memory::dims c_strides = {DNNL_RUNTIME_DIM_VAL, 1};

    memory::desc a_md(a_shape, memory::data_type::f32, a_strides);
    memory::desc b_md(b_shape, memory::data_type::f32, b_strides);
    memory::desc c_md(c_shape, memory::data_type::f32, c_strides);

    // Create attributes (to handle alpha dynamically and beta if necessary)
    primitive_attr attr;
    attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask */ 0);
    if (beta != 0.f) {
        post_ops po;
        po.append_sum(beta);
        attr.set_post_ops(po);
    }

    // Create a MatMul primitive
    matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md, attr);
    return matmul(matmul_pd);
}
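
// For comparison, a minimal sketch of the _static_ mode mentioned above: all
// shapes are fixed at creation time, so the library can generate a kernel
// specialized for this exact problem. This function is an illustration only
// (it is not called from the benchmark flow below); row-major layouts and
// the fixed sizes M, N, and K are assumptions of this sketch.
matmul static_matmul_create(int64_t M, int64_t N, int64_t K) {
    // No run-time placeholders: dimensions and strides are fully specified
    memory::desc a_md({M, K}, memory::data_type::f32, {K, 1});
    memory::desc b_md({K, N}, memory::data_type::f32, {N, 1});
    memory::desc c_md({M, N}, memory::data_type::f32, {N, 1});

    matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md);
    return matmul(matmul_pd);
}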

// Execute a _dynamic_ MatMul primitive created earlier. All the parameters
// are passed at run time (except for beta, which has to be specified at
// primitive creation time due to the current limitation).
void dynamic_matmul_execute(matmul &matmul_p, char transA, char transB,
        int64_t M, int64_t N, int64_t K, float alpha, const float *A,
        int64_t lda, const float *B, int64_t ldb, float beta, float *C,
        int64_t ldc) {
    using dims = memory::dims;

    if (beta != fixed_beta)
        throw std::logic_error("Run-time beta is not yet supported.");

    // Translate transA and transB
    dims a_strides = tolower(transA) == 'n' ? dims {lda, 1} : dims {1, lda};
    dims b_strides = tolower(transB) == 'n' ? dims {ldb, 1} : dims {1, ldb};

    // Wrap raw pointers into oneDNN memories (with proper shapes)
    memory A_m({{M, K}, memory::data_type::f32, a_strides}, eng, (void *)A);
    memory B_m({{K, N}, memory::data_type::f32, b_strides}, eng, (void *)B);
    memory C_m({{M, N}, memory::data_type::f32, {ldc, 1}}, eng, (void *)C);

    // Prepare oneDNN memory for alpha
    memory alpha_m({{1}, memory::data_type::f32, {1}}, eng, &alpha);

    // Execute the MatMul primitive
    stream s(eng);
    matmul_p.execute(s,
            {{DNNL_ARG_SRC, A_m}, {DNNL_ARG_WEIGHTS, B_m}, {DNNL_ARG_DST, C_m},
                    {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, alpha_m}});
    s.wait();
}

void sgemm_and_matmul_with_params(char transA, char transB, int64_t M,
        int64_t N, int64_t K, float alpha, float beta) {
    if (beta != fixed_beta)
        throw std::logic_error("Run-time beta is not yet supported.");

    // Allocate and initialize matrices
    std::vector<float> A(M * K);
    init_vector(A);

    std::vector<float> B(K * N);
    init_vector(B);

    std::vector<float> C_sgemm(M * N);
    init_vector(C_sgemm);

    std::vector<float> C_dynamic_matmul = C_sgemm;

    // Prepare leading dimensions
    int64_t lda = tolower(transA) == 'n' ? K : M;
    int64_t ldb = tolower(transB) == 'n' ? N : K;
    int64_t ldc = N;

    // 1. Execute sgemm
    for (int run = 0; run < number_of_runs; ++run)
        dnnl_sgemm(transA, transB, M, N, K, alpha, A.data(), lda, B.data(), ldb,
                beta, C_sgemm.data(), ldc);

    // 2.a Create dynamic MatMul
    auto dynamic_matmul = dynamic_matmul_create();
    // 2.b Execute
    for (int run = 0; run < number_of_runs; ++run)
        dynamic_matmul_execute(dynamic_matmul, transA, transB, M, N, K, alpha,
                A.data(), lda, B.data(), ldb, beta, C_dynamic_matmul.data(),
                ldc);

    int rc = 0;
    rc |= compare_vectors(
            C_sgemm, C_dynamic_matmul, K, "Compare SGEMM vs dynamic MatMul");
    if (rc) throw std::logic_error("The resulting matrices diverged too much.");
}

void sgemm_and_matmul() {
    sgemm_and_matmul_with_params('N', 'T', 10, 20, 30, 1.1f, fixed_beta);
}
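
// A minimal sketch of the "create primitive once, use multiple times"
// concept listed at the top: one dynamic MatMul primitive executes two
// problems with different shapes and alpha values. This function and its
// sizes are hypothetical illustrations and are not called from main().
void reuse_dynamic_matmul_sketch() {
    auto mm = dynamic_matmul_create();

    // Buffers sized for the larger of the two problems below
    std::vector<float> A(50 * 60), B(60 * 70), C(50 * 70);
    init_vector(A);
    init_vector(B);

    // C(10x30) = 1.0f * A(10x20) * B(20x30)
    dynamic_matmul_execute(mm, 'N', 'N', 10, 30, 20, 1.0f, A.data(), 20,
            B.data(), 30, fixed_beta, C.data(), 30);

    // Same primitive, different shapes and alpha:
    // C(50x70) = 2.5f * A(50x60) * B(60x70)
    dynamic_matmul_execute(mm, 'N', 'N', 50, 70, 60, 2.5f, A.data(), 60,
            B.data(), 70, fixed_beta, C.data(), 70);
}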

int main(int argc, char **argv) {
    return handle_example_errors({engine::kind::cpu}, sgemm_and_matmul);
}