1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /// @example cpu_sgemm_and_matmul.cpp |
18 | /// > Annotated version: @ref cpu_sgemm_and_matmul_cpp |
19 | /// |
20 | /// @page cpu_sgemm_and_matmul_cpp_short |
21 | /// C++ API example demonstrating [MatMul](@ref dev_guide_matmul) |
22 | /// as a replacement for SGEMM functions. |
23 | /// |
24 | /// Concepts: |
25 | /// - Create primitive once, use multiple times |
26 | /// - Run-time tensor shapes: #DNNL_RUNTIME_DIM_VAL |
27 | /// - Scales: dnnl::primitive_attr::set_scales_mask() |
28 | /// |
29 | /// @page cpu_sgemm_and_matmul_cpp MatMul Tutorial: Comparison with SGEMM |
30 | /// @copydetails cpu_sgemm_and_matmul_cpp_short |
31 | /// |
32 | /// We will show two modes for the MatMul primitive: |
33 | /// 1. The shapes of the input and output matrices are passed at execution time. |
34 | /// This enables you to create a primitive only once and use it for different |
35 | /// matrices, just like normal SGEMM (though with a handle -- oneDNN |
36 | /// primitive). |
37 | /// To indicate the unknown dimensions and floating point values, you should |
38 | /// use #DNNL_RUNTIME_DIM_VAL and #DNNL_RUNTIME_F32_VAL respectively. |
39 | /// 2. The shapes of the input and output matrices are passed at creation time, |
/// as in the oneDNN programming model.
41 | /// This enables creating a highly specialized kernel for the given problem |
42 | /// sizes with the loss of generality. |
43 | /// |
44 | /// Users are free to choose between these two options, as well as any |
45 | /// intermediate ones (e.g., specifying some of the parameters at creation time |
46 | /// while leaving the others until execution time). This enables balancing |
47 | /// between flexibility and performance. |
48 | /// |
49 | /// @note |
50 | /// The more you specify at creation time, the better performance is. |
51 | /// |
52 | /// @include cpu_sgemm_and_matmul.cpp |
53 | |
#include <algorithm>
#include <cassert>
#include <cctype>
#include <cmath>
#include <cstdio>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <vector>
62 | |
63 | #include "oneapi/dnnl/dnnl.hpp" |
64 | |
65 | #include "example_utils.hpp" |
66 | |
67 | using namespace dnnl; |
68 | |
69 | namespace { |
70 | |
71 | void init_vector(std::vector<float> &v) { |
72 | std::mt19937 gen; |
73 | std::uniform_real_distribution<float> u(-1, 1); |
74 | |
75 | for (auto &e : v) |
76 | e = u(gen); |
77 | } |
78 | |
// Compare two result vectors using the relative L2 error and print a short
// report. Returns 0 when the vectors agree within the tolerance, 1 otherwise.
//
// @param v1      reference values
// @param v2      values under test; must have the same length as v1
// @param K       reduction (inner) dimension of the GEMM that produced the
//                data; each output element accumulates ~K rounding errors
// @param message header line for the printed report
int compare_vectors(const std::vector<float> &v1, const std::vector<float> &v2,
        int64_t K, const char *message) {
    // Comparing vectors of different lengths is a programming error: the
    // original loop would read past the end of the shorter vector.
    assert(v1.size() == v2.size());

    double v1_l2 = 0, diff_l2 = 0;
    for (size_t n = 0; n < v1.size(); ++n) {
        float diff = v1[n] - v2[n];
        v1_l2 += v1[n] * v1[n];
        diff_l2 += diff * diff;
    }

    v1_l2 = std::sqrt(v1_l2);
    diff_l2 = std::sqrt(diff_l2);

    // Finding a reasonable (tight and accurate) threshold is quite a
    // difficult problem. The implementation testing might also use special
    // data filling to alleviate issues related to the finite precision
    // arithmetic. However, in simple cases the machine epsilon multiplied by
    // log(K) should work reasonably well.
    const double threshold = std::numeric_limits<float>::epsilon()
            * std::log(std::max(2., (double)K));
    bool ok = diff_l2 <= threshold * v1_l2;

    // Guard against 0/0 when the reference norm is exactly zero so the
    // report never prints NaN (the pass/fail decision above is unaffected).
    const double rel_err = v1_l2 > 0 ? diff_l2 / v1_l2 : diff_l2;

    printf("%s\n\tL2 Norms"
            "\n\t\tReference matrix:%g\n\t\tError:%g\n\t\tRelative_error:%g\n"
            "\tAccuracy check: %s\n",
            message, v1_l2, diff_l2, rel_err, ok ? "OK" : "FAILED");

    return ok ? 0 : 1;
}
108 | |
109 | } // namespace |
110 | |
// How many times each GEMM/MatMul variant is executed (bump for timing runs).
int number_of_runs = 1;
// The dynamic MatMul primitive bakes beta in at creation time (see
// dynamic_matmul_create()); this is the value it is created with.
float fixed_beta = 0.f;

engine eng(engine::kind::cpu, 0); // We create a global engine for simplicity
115 | |
116 | // Create a _dynamic_ MatMul primitive that can work with arbitrary shapes |
117 | // and alpha parameters. |
118 | // Warning: current limitation is that beta parameter should be known in |
119 | // advance (use fixed_beta). |
120 | matmul dynamic_matmul_create() { |
121 | // We assume that beta is known at the primitive creation time |
122 | float beta = fixed_beta; |
123 | |
124 | memory::dims a_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
125 | memory::dims b_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
126 | memory::dims c_shape = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
127 | |
128 | memory::dims a_strides = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
129 | memory::dims b_strides = {DNNL_RUNTIME_DIM_VAL, DNNL_RUNTIME_DIM_VAL}; |
130 | memory::dims c_strides = {DNNL_RUNTIME_DIM_VAL, 1}; |
131 | |
132 | memory::desc a_md(a_shape, memory::data_type::f32, a_strides); |
133 | memory::desc b_md(b_shape, memory::data_type::f32, b_strides); |
134 | memory::desc c_md(c_shape, memory::data_type::f32, c_strides); |
135 | |
136 | // Create attributes (to handle alpha dynamically and beta if necessary) |
137 | primitive_attr attr; |
138 | attr.set_scales_mask(DNNL_ARG_WEIGHTS, /* mask */ 0); |
139 | if (beta != 0.f) { |
140 | post_ops po; |
141 | po.append_sum(beta); |
142 | attr.set_post_ops(po); |
143 | } |
144 | |
145 | // Create a MatMul primitive |
146 | matmul::primitive_desc matmul_pd(eng, a_md, b_md, c_md, attr); |
147 | return matmul(matmul_pd); |
148 | } |
149 | |
150 | // Execute a _dynamic_ MatMul primitive created earlier. All the parameters are |
151 | // passed at a run-time (except for beta which has to be specified at the |
152 | // primitive creation time due to the current limitation). |
153 | void dynamic_matmul_execute(matmul &matmul_p, char transA, char transB, |
154 | int64_t M, int64_t N, int64_t K, float alpha, const float *A, |
155 | int64_t lda, const float *B, int64_t ldb, float beta, float *C, |
156 | int64_t ldc) { |
157 | using dims = memory::dims; |
158 | |
159 | if (beta != fixed_beta) |
160 | throw std::logic_error("Run-time beta is not yet supported." ); |
161 | |
162 | // Translate transA and transB |
163 | dims a_strides = tolower(transA) == 'n' ? dims {lda, 1} : dims {1, lda}; |
164 | dims b_strides = tolower(transB) == 'n' ? dims {ldb, 1} : dims {1, ldb}; |
165 | |
166 | // Wrap raw pointers into oneDNN memories (with proper shapes) |
167 | memory A_m({{M, K}, memory::data_type::f32, a_strides}, eng, (void *)A); |
168 | memory B_m({{K, N}, memory::data_type::f32, b_strides}, eng, (void *)B); |
169 | memory C_m({{M, N}, memory::data_type::f32, {ldc, 1}}, eng, (void *)C); |
170 | |
171 | // Prepare oneDNN memory for alpha |
172 | memory alpha_m({{1}, memory::data_type::f32, {1}}, eng, &alpha); |
173 | |
174 | // Execute the MatMul primitive |
175 | stream s(eng); |
176 | matmul_p.execute(s, |
177 | {{DNNL_ARG_SRC, A_m}, {DNNL_ARG_WEIGHTS, B_m}, {DNNL_ARG_DST, C_m}, |
178 | {DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, alpha_m}}); |
179 | s.wait(); |
180 | } |
181 | |
182 | void sgemm_and_matmul_with_params(char transA, char transB, int64_t M, |
183 | int64_t N, int64_t K, float alpha, float beta) { |
184 | if (beta != fixed_beta) |
185 | throw std::logic_error("Run-time beta is not yet supported." ); |
186 | |
187 | // Allocate and initialize matrices |
188 | std::vector<float> A(M * K); |
189 | init_vector(A); |
190 | |
191 | std::vector<float> B(K * N); |
192 | init_vector(B); |
193 | |
194 | std::vector<float> C_sgemm(M * N); |
195 | init_vector(C_sgemm); |
196 | |
197 | std::vector<float> C_dynamic_matmul = C_sgemm; |
198 | |
199 | // Prepare leading dimensions |
200 | int64_t lda = tolower(transA) == 'n' ? K : M; |
201 | int64_t ldb = tolower(transB) == 'n' ? N : K; |
202 | int64_t ldc = N; |
203 | |
204 | // 1. Execute sgemm |
205 | for (int run = 0; run < number_of_runs; ++run) |
206 | dnnl_sgemm(transA, transB, M, N, K, alpha, A.data(), lda, B.data(), ldb, |
207 | beta, C_sgemm.data(), ldc); |
208 | |
209 | // 2.a Create dynamic MatMul |
210 | auto dynamic_matmul = dynamic_matmul_create(); |
211 | // 2.b Execute |
212 | for (int run = 0; run < number_of_runs; ++run) |
213 | dynamic_matmul_execute(dynamic_matmul, transA, transB, M, N, K, alpha, |
214 | A.data(), lda, B.data(), ldb, beta, C_dynamic_matmul.data(), |
215 | ldc); |
216 | |
217 | int rc = 0; |
218 | rc |= compare_vectors( |
219 | C_sgemm, C_dynamic_matmul, K, "Compare SGEMM vs dynamic MatMul" ); |
220 | if (rc) throw std::logic_error("The resulting matrices diverged too much." ); |
221 | } |
222 | |
223 | void sgemm_and_matmul() { |
224 | sgemm_and_matmul_with_params('N', 'T', 10, 20, 30, 1.1f, fixed_beta); |
225 | } |
226 | |
int main(int argc, char **argv) {
    // handle_example_errors() (from example_utils.hpp) runs the example and
    // converts exceptions into an appropriate process exit status.
    return handle_example_errors({engine::kind::cpu}, sgemm_and_matmul);
}
230 | |