/*******************************************************************************
* Copyright 2022 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
*     http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/

#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstdlib>
#include <cstring>
#include <iomanip>
#include <iostream>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

#include "example_utils.hpp"
#include "oneapi/dnnl/dnnl.hpp"

using namespace dnnl;

using tag = memory::format_tag;
using dt = memory::data_type;

struct gemm_dims_t {
    memory::dim m, n, k;
};

static const int min_runs = 4;

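// Return a printable name for the supported data types; unrecognized types
// are reported as "unknown".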
const char *get_type_string(dt type) {
    const char *type_string = "unknown";

#define TYPE_CASE(T) \
    if (type == dt::T) type_string = #T;
    TYPE_CASE(f16);
    TYPE_CASE(f32);
    TYPE_CASE(f64);
    TYPE_CASE(bf16);
    TYPE_CASE(s8);
    TYPE_CASE(u8);
#undef TYPE_CASE

    return type_string;
}

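// Print the data type and problem dimensions for the current test case.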
void print_test_case(dt type, gemm_dims_t dims) {
    std::cout << '[' << std::setw(4) << get_type_string(type);
    if (dims.m == dims.n && dims.m == dims.k)
        std::cout << " m = n = k = " << dims.m;
    else
        std::cout << " m = " << dims.m << ", n = " << dims.n
                  << ", k = " << dims.k;
    std::cout << "] " << std::flush;
}

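// Fill a buffer by tiling a small cached block of random values;
// integer-valued data is generated for the s8/u8 cases, real-valued data
// otherwise.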
void fill_random(std::vector<float> &out, bool is_integer) {
    static std::vector<float> random_data_i, random_data_f;
    constexpr size_t nrand = 1037;

    if (random_data_i.empty() || random_data_f.empty()) {
        std::mt19937 generator;
        std::uniform_int_distribution<int> dist_i(-16, 15);
        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);

        random_data_i.resize(nrand);
        for (auto &d : random_data_i)
            d = static_cast<float>(dist_i(generator));

        random_data_f.resize(nrand);
        for (auto &d : random_data_f)
            d = dist_f(generator);
    }

    auto &rd = is_integer ? random_data_i : random_data_f;

    for (size_t i = 0; i < out.size(); i += nrand) {
        size_t chunk = std::min(nrand, out.size() - i);
        std::memcpy(&out[i], rd.data(), chunk * sizeof(float));
    }
}

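// Build and execute a single matmul case. Returns the time for a single
// execution when time_limit is 0 (quick test), otherwise the average time
// per run, both in seconds.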
double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims,
        double time_limit = 0.) {
    bool is_integer = (type == dt::s8 || type == dt::u8);
    bool quick_test = (time_limit == 0.);

    // Create execution dnnl::engine.
    dnnl::engine engine(engine_kind, 0);

    // Create dnnl::stream.
    dnnl::stream engine_stream(engine);

    // Source (A), weights (B), and destination (C) matrix dimensions.
    memory::dims a_dims = {dims.m, dims.k};
    memory::dims b_dims = {dims.k, dims.n};
    memory::dims c_dims = {dims.m, dims.n};

    // Allocate buffers and random-initialize A/B
    std::vector<float> a_data(product(a_dims));
    std::vector<float> b_data(product(b_dims));
    std::vector<float> c_data(product(c_dims));

    fill_random(a_data, is_integer);
    fill_random(b_data, is_integer);

    // Create memory descriptors and memory objects for the source (A),
    // weights (B), and destination (C) matrices.
    auto a_md = memory::desc(a_dims, type, tag::any);
    auto b_md = memory::desc(b_dims, type, tag::any);
    auto c_md = memory::desc(c_dims, type, tag::any);

    auto a_in_md = memory::desc(a_dims, dt::f32, tag::ab);
    auto b_in_md = memory::desc(b_dims, dt::f32, tag::ab);

    auto a_in_mem = memory(a_in_md, engine);
    auto b_in_mem = memory(b_in_md, engine);

    // Write data to the memory objects' handles.
    write_to_dnnl_memory(a_data.data(), a_in_mem);
    write_to_dnnl_memory(b_data.data(), b_in_mem);

    // Create primitive descriptor.
    auto matmul_pd = matmul::primitive_desc(engine, a_md, b_md, c_md);

    // Repack and convert input data.
    auto a_mem = memory(matmul_pd.src_desc(), engine);
    reorder(a_in_mem, a_mem).execute(engine_stream, a_in_mem, a_mem);

    auto b_mem = memory(matmul_pd.weights_desc(), engine);
    reorder(b_in_mem, b_mem).execute(engine_stream, b_in_mem, b_mem);

    auto c_mem = memory(matmul_pd.dst_desc(), engine);

    // Create the primitive.
    auto matmul_prim = matmul(matmul_pd);

    // Start output.
    if (!quick_test) print_test_case(type, dims);

    // Primitive arguments.
    std::unordered_map<int, memory> matmul_args;
    matmul_args.insert({DNNL_ARG_SRC, a_mem});
    matmul_args.insert({DNNL_ARG_WEIGHTS, b_mem});
    matmul_args.insert({DNNL_ARG_DST, c_mem});

    // Warm-up execution.
    matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();

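    // Time a single execution; this serves both as the quick-test result and
    // as calibration for the number of timing runs below.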
    auto start_first = std::chrono::steady_clock::now();
    matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();
    auto end_first = std::chrono::steady_clock::now();

    std::chrono::duration<double> dur_first = end_first - start_first;

    if (quick_test) return dur_first.count();

    int runs = std::max(min_runs, int(time_limit / dur_first.count()));

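    // The loop below performs runs + 1 executions; the single-run time
    // measured above is subtracted when computing the average so that it
    // reflects approximately `runs` steady-state runs.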
    // Timing runs.
    auto start = std::chrono::steady_clock::now();

    for (int i = 0; i <= runs; i++)
        matmul_prim.execute(engine_stream, matmul_args);
    engine_stream.wait();

    auto end = std::chrono::steady_clock::now();

    std::chrono::duration<double> duration = end - start;

    // Display the result.
    double avg_time = (duration.count() - dur_first.count()) / runs;
    double total_ops = double(dims.m) * double(dims.n) * double(dims.k) * 2;
    double perf = (total_ops / avg_time) * 1e-9;

    auto scale_string = "G";
    auto unit_string = is_integer ? "Op/s" : "Flop/s";

    if (perf >= 1000) {
        perf /= 1000;
        scale_string = "T";
    }

    std::cout << perf << ' ' << scale_string << unit_string << std::endl;

    return avg_time;
}

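// Benchmark one data type: use the user-specified problem size if given,
// otherwise grow a square problem until min_runs executions approximately
// fill the time limit, then report performance for that size.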
void run(engine::kind engine_kind, dt type, gemm_dims_t dims,
        double time_limit) {
    try {
        if (dims.m * dims.n != 0) {
            // Dimensions manually specified by user.
            run_case(engine_kind, type, dims, time_limit);
        } else {
            // Automatically choose dimensions to fit time limit.
            int mnk = 128;
            const int max_mnk = 8192;

            while (mnk < max_mnk) {
                dims.m = dims.n = dims.k = mnk;
                double time1 = run_case(engine_kind, type, dims);
                double nruns_est = std::max(1., time_limit / time1);
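                // Matmul time grows roughly as mnk^3, so scale the problem
                // size by the cube root of the available run budget relative
                // to min_runs (rounded in log2 space).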
                double mnk_expand = std::exp2(
                        std::round(std::log2(nruns_est / min_runs) / 3.));
                if (mnk_expand <= 1) break;
                mnk = static_cast<int>(
                        std::min<double>(max_mnk, mnk * mnk_expand));
            }

            dims.m = dims.n = dims.k = mnk;
            run_case(engine_kind, type, dims, time_limit);
        }
    } catch (dnnl::error &e) {
        // Catch and report unimplemented cases.
        if (e.status == dnnl_unimplemented) {
            print_test_case(type, dims);
            std::cout << "unsupported" << std::endl;
        } else
            throw;
    }
}

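// Print usage information and throw to report invalid command-line arguments.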
void bad_args() {
    std::cerr << "Usage: matmul-perf-cpp [cpu|gpu]\n"
                 "       matmul-perf-cpp [cpu|gpu] <size>\n"
                 "       matmul-perf-cpp [cpu|gpu] <m> <n> <k>\n"
                 "If a single <size> is specified, it is used for all three "
                 "dimensions (m/n/k).\n";
    throw std::invalid_argument("Incorrect input arguments.");
}

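// Parse optional problem dimensions from the command line and benchmark
// matmul for several data types.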
void matmul_perf(engine::kind engine_kind, int argc, char **argv) {
    gemm_dims_t dims = {0, 0, 0};

    if (argc > 2) {
        if (argc == 3)
            dims.m = dims.n = dims.k = std::atoi(argv[2]);
        else if (argc == 5) {
            dims.m = std::atoi(argv[2]);
            dims.n = std::atoi(argv[3]);
            dims.k = std::atoi(argv[4]);
        } else
            bad_args();

        if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args();
    }

    run(engine_kind, dt::f32, dims, 2.0);
    run(engine_kind, dt::f16, dims, 2.0);
    run(engine_kind, dt::bf16, dims, 2.0);
    run(engine_kind, dt::s8, dims, 2.0);
}

int main(int argc, char **argv) {
    return handle_example_errors(
            matmul_perf, parse_engine_kind(argc, argv, 3), argc, argv);
}