/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

#include "example_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"

using namespace dnnl;

using tag = memory::format_tag;
using dt = memory::data_type;

struct gemm_dims_t {
    memory::dim m, n, k;
};

static const int min_runs = 4;

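// Return a printable name for the supported data types; unrecognized types
// are reported as "unknown".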
const char *get_type_string(dt type) {
    const char *type_string = "unknown";

#define TYPE_CASE(T) \
    if (type == dt::T) type_string = #T;
    TYPE_CASE(f16);
    TYPE_CASE(f32);
    TYPE_CASE(f64);
    TYPE_CASE(bf16);
    TYPE_CASE(s8);
    TYPE_CASE(u8);
#undef TYPE_CASE

    return type_string;
}

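// Print the data type and problem dimensions for the current test case.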
void print_test_case(dt type, gemm_dims_t dims) {
    std::cout << '[' << std::setw(4) << get_type_string(type);
    if (dims.m == dims.n && dims.m == dims.k)
        std::cout << " m = n = k = " << dims.m;
    else
        std::cout << " m = " << dims.m << ", n = " << dims.n
                  << ", k = " << dims.k;
    std::cout << "] " << std::flush;
}

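// Fill a buffer by tiling a small cached block of random values;
// integer-valued data is generated for the s8/u8 cases, real-valued data
// otherwise.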
void fill_random(std::vector<float> &out, bool is_integer) {
    static std::vector<float> random_data_i, random_data_f;
    constexpr size_t nrand = 1037;

    if (random_data_i.empty() || random_data_f.empty()) {
        std::mt19937 generator;
        std::uniform_int_distribution<int> dist_i(-16, 15);
        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);

        random_data_i.resize(nrand);
        for (auto &d : random_data_i)
            d = static_cast<float>(dist_i(generator));

        random_data_f.resize(nrand);
        for (auto &d : random_data_f)
            d = dist_f(generator);
    }

    auto &rd = is_integer ? random_data_i : random_data_f;

    for (size_t i = 0; i < out.size(); i += nrand) {
        size_t chunk = std::min(nrand, out.size() - i);
        std::memcpy(&out[i], rd.data(), chunk * sizeof(float));
    }
}

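// Build and execute a single matmul case. Returns the time for a single
// execution when time_limit is 0 (quick test), otherwise the average time
// per run, both in seconds.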
double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims,
        double time_limit = 0.) {
    bool is_integer = (type == dt::s8 || type == dt::u8);
    bool quick_test = (time_limit == 0.);

    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Source (A), weights (B), and destination (C) matrix dimensions.
    memory::dims a_dims = {dims.m, dims.k};
    memory::dims b_dims = {dims.k, dims.n};
    memory::dims c_dims = {dims.m, dims.n};

    // Allocate buffers and random-initialize A/B
    std::vector<float> a_data(product(a_dims));
    std::vector<float> b_data(product(b_dims));
    std::vector<float> c_data(product(c_dims));

    fill_random(a_data, is_integer);
    fill_random(b_data, is_integer);

    // Create memory descriptors and memory objects for the source (A),
    // weights (B), and destination (C) matrices.
    auto a_md = memory::desc(a_dims, type, tag::any);
    auto b_md = memory::desc(b_dims, type, tag::any);
    auto c_md = memory::desc(c_dims, type, tag::any);

    auto a_in_md = memory::desc(a_dims, dt::f32, tag::ab);
    auto b_in_md = memory::desc(b_dims, dt::f32, tag::ab);

    auto a_in_mem = memory(a_in_md, engine);
    auto b_in_mem = memory(b_in_md, engine);

    // Write data to the memory objects' handles.
    write_to_dnnl_memory(a_data.data(), a_in_mem);
    write_to_dnnl_memory(b_data.data(), b_in_mem);

    // Create primitive descriptor.
    auto matmul_pd = matmul::primitive_desc(engine, a_md, b_md, c_md);

    // Repack and convert input data.
    auto a_mem = memory(matmul_pd.src_desc(), engine);
    reorder(a_in_mem, a_mem).execute(engine_stream, a_in_mem, a_mem);

    auto b_mem = memory(matmul_pd.weights_desc(), engine);
    reorder(b_in_mem, b_mem).execute(engine_stream, b_in_mem, b_mem);

    auto c_mem = memory(matmul_pd.dst_desc(), engine);

    // Create the primitive.
    auto matmul_prim = matmul(matmul_pd);

    // Start output.
    if (!quick_test) print_test_case(type, dims);

    // Primitive arguments.
    std::unordered_map<int, memory> matmul_args;
    matmul_args.insert({DNNL_ARG_SRC, a_mem});
    matmul_args.insert({DNNL_ARG_WEIGHTS, b_mem});
    matmul_args.insert({DNNL_ARG_DST, c_mem});

    // Warm-up execution.
    matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();

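    // Time a single execution; this serves both as the quick-test result and
    // as calibration for the number of timing runs below.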
    auto start_first = std::chrono::steady_clock::now();
    matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();
    auto end_first = std::chrono::steady_clock::now();

    std::chrono::duration<double> dur_first = end_first - start_first;

    if (quick_test) return dur_first.count();

    int runs = std::max(min_runs, int(time_limit / dur_first.count()));

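    // The loop below performs runs + 1 executions; the single-run time
    // measured above is subtracted when computing the average so that it
    // reflects approximately `runs` steady-state runs.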
    // Timing runs.
    auto start = std::chrono::steady_clock::now();

    for (int i = 0; i <= runs; i++)
        matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();

    auto end = std::chrono::steady_clock::now();

    std::chrono::duration<double> duration = end - start;

    // Display the result.
    double avg_time = (duration.count() - dur_first.count()) / runs;
    double total_ops = double(dims.m) * double(dims.n) * double(dims.k) * 2;
    double perf = (total_ops / avg_time) * 1e-9;

    auto scale_string = "G";
    auto unit_string = is_integer ? "Op/s" : "Flop/s";

    if (perf >= 1000) {
        perf /= 1000;
        scale_string = "T";
    }

    std::cout << perf << ' ' << scale_string << unit_string << std::endl;

    return avg_time;
}

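// Benchmark one data type: use the user-specified problem size if given,
// otherwise grow a square problem until min_runs executions approximately
// fill the time limit, then report performance for that size.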
void run(engine::kind engine_kind, dt type, gemm_dims_t dims,
        double time_limit) {
    try {
        if (dims.m * dims.n != 0) {
            // Dimensions manually specified by user.
            run_case(engine_kind, type, dims, time_limit);
        } else {
            // Automatically choose dimensions to fit time limit.
            int mnk = 128;
            const int max_mnk = 8192;

            while (mnk < max_mnk) {
                dims.m = dims.n = dims.k = mnk;
                double time1 = run_case(engine_kind, type, dims);
                double nruns_est = std::max(1., time_limit / time1);
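                // Matmul time grows roughly as mnk^3, so scale the problem
                // size by the cube root of the available run budget relative
                // to min_runs (rounded in log2 space).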
                double mnk_expand = std::exp2(
                        std::round(std::log2(nruns_est / min_runs) / 3.));
                if (mnk_expand <= 1) break;
                mnk = static_cast<int>(
                        std::min<double>(max_mnk, mnk * mnk_expand));
            }

            dims.m = dims.n = dims.k = mnk;
            run_case(engine_kind, type, dims, time_limit);
        }
    } catch (dnnl::error &e) {
        // Catch and report unimplemented cases.
        if (e.status == dnnl_unimplemented) {
            print_test_case(type, dims);
            std::cout << "unsupported" << std::endl;
        } else
            throw;
    }
}

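// Print usage information and throw to report invalid command-line arguments.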
void bad_args() {
    std::cerr << "Usage: matmul-perf-cpp [cpu|gpu]\n"
                 "       matmul-perf-cpp [cpu|gpu] <size>\n"
                 "       matmul-perf-cpp [cpu|gpu] <m> <n> <k>\n"
                 "If a single <size> is specified, it is used for all three "
                 "dimensions (m/n/k).\n";
    throw std::invalid_argument("Incorrect input arguments.");
}

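// Parse optional problem dimensions from the command line and benchmark
// matmul for several data types.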
void matmul_perf(engine::kind engine_kind, int argc, char **argv) {
    gemm_dims_t dims = {0, 0, 0};

    if (argc > 2) {
        if (argc == 3)
            dims.m = dims.n = dims.k = std::atoi(argv[2]);
        else if (argc == 5) {
            dims.m = std::atoi(argv[2]);
            dims.n = std::atoi(argv[3]);
            dims.k = std::atoi(argv[4]);
        } else
            bad_args();

        if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args();
    }

    run(engine_kind, dt::f32, dims, 2.0);
    run(engine_kind, dt::f16, dims, 2.0);
    run(engine_kind, dt::bf16, dims, 2.0);
    run(engine_kind, dt::s8, dims, 2.0);
}

int main(int argc, char **argv) {
    return handle_example_errors(
            matmul_perf, parse_engine_kind(argc, argv, 3), argc, argv);
}