1 | /******************************************************************************* |
2 | * Copyright 2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
#include <algorithm>
#include <chrono>
#include <cmath>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <random>
#include <stdexcept>
#include <string>
#include <unordered_map>
#include <vector>
25 | |
26 | #include "example_utils.hpp" |
27 | #include "oneapi/dnnl/dnnl.hpp" |
28 | |
29 | using namespace dnnl; |
30 | |
31 | using tag = memory::format_tag; |
32 | using dt = memory::data_type; |
33 | |
34 | struct gemm_dims_t { |
35 | memory::dim m, n, k; |
36 | }; |
37 | |
// Lower bound on the number of timed executions of a test case. Also used
// when estimating how far the problem size can grow within the time limit.
static constexpr int min_runs = 4;
39 | |
40 | const char *get_type_string(dt type) { |
41 | const char *type_string = "unknown" ; |
42 | |
43 | #define TYPE_CASE(T) \ |
44 | if (type == dt::T) type_string = #T; |
45 | TYPE_CASE(f16); |
46 | TYPE_CASE(f32); |
47 | TYPE_CASE(f64); |
48 | TYPE_CASE(bf16); |
49 | TYPE_CASE(s8); |
50 | TYPE_CASE(u8); |
51 | #undef TYPE_CASE |
52 | |
53 | return type_string; |
54 | } |
55 | |
56 | void print_test_case(dt type, gemm_dims_t dims) { |
57 | std::cout << '[' << std::setw(4) << get_type_string(type); |
58 | if (dims.m == dims.n && dims.m == dims.k) |
59 | std::cout << " m = n = k = " << dims.m; |
60 | else |
61 | std::cout << " m = " << dims.m << ", n = " << dims.n |
62 | << ", k = " << dims.k; |
63 | std::cout << "] " << std::flush; |
64 | } |
65 | |
// Fills `out` with pseudo-random values by tiling a small cached table.
//
// Two tables of 1037 floats are generated once, on the first call, from a
// default-constructed (hence fixed-seed) std::mt19937, and reused afterwards:
//   - integer table: whole numbers drawn from [-16, 15], stored as float;
//   - real table:    values drawn from uniform_real_distribution(-1, 1).
// `is_integer` selects which table is copied. The table is repeated
// back-to-back over `out`, with a partial copy at the tail. An empty `out`
// is left untouched.
void fill_random(std::vector<float> &out, bool is_integer) {
    static std::vector<float> random_data_i, random_data_f;
    constexpr std::size_t nrand = 1037;

    if (random_data_i.empty() || random_data_f.empty()) {
        std::mt19937 generator;
        std::uniform_int_distribution<int> dist_i(-16, 15);
        std::uniform_real_distribution<float> dist_f(-1.0f, 1.0f);

        // The integer table is drawn first, then the real table, from the
        // same generator; this order determines the table contents.
        random_data_i.resize(nrand);
        for (auto &d : random_data_i)
            d = static_cast<float>(dist_i(generator));

        random_data_f.resize(nrand);
        for (auto &d : random_data_f)
            d = dist_f(generator);
    }

    const std::vector<float> &table
            = is_integer ? random_data_i : random_data_f;

    // std::copy_n comes from <algorithm>, which this file includes, so no
    // additional header is needed for the copy.
    for (std::size_t offset = 0; offset < out.size(); offset += nrand) {
        const std::size_t chunk = std::min(nrand, out.size() - offset);
        std::copy_n(table.begin(), chunk, out.begin() + offset);
    }
}
91 | |
92 | double run_case(engine::kind engine_kind, dt type, gemm_dims_t dims, |
93 | double time_limit = 0.) { |
94 | bool is_integer = (type == dt::s8 || type == dt::u8); |
95 | bool quick_test = (time_limit == 0.); |
96 | |
97 | // Create execution dnnl::engine. |
98 | dnnl::engine engine(engine_kind, 0); |
99 | |
100 | // Create dnnl::stream. |
101 | dnnl::stream engine_stream(engine); |
102 | |
103 | // Source (A), weights (B), and destination (C) matrix dimensions. |
104 | memory::dims a_dims = {dims.m, dims.k}; |
105 | memory::dims b_dims = {dims.k, dims.n}; |
106 | memory::dims c_dims = {dims.m, dims.n}; |
107 | |
108 | // Allocate buffers and random-initialize A/B |
109 | std::vector<float> a_data(product(a_dims)); |
110 | std::vector<float> b_data(product(b_dims)); |
111 | std::vector<float> c_data(product(c_dims)); |
112 | |
113 | fill_random(a_data, is_integer); |
114 | fill_random(b_data, is_integer); |
115 | |
116 | // Create memory descriptors and memory objects for src, weights, bias, and |
117 | // dst. |
118 | auto a_md = memory::desc(a_dims, type, tag::any); |
119 | auto b_md = memory::desc(b_dims, type, tag::any); |
120 | auto c_md = memory::desc(c_dims, type, tag::any); |
121 | |
122 | auto a_in_md = memory::desc(a_dims, dt::f32, tag::ab); |
123 | auto b_in_md = memory::desc(b_dims, dt::f32, tag::ab); |
124 | |
125 | auto a_in_mem = memory(a_in_md, engine); |
126 | auto b_in_mem = memory(b_in_md, engine); |
127 | |
128 | // Write data to memory object's handles. |
129 | write_to_dnnl_memory(a_data.data(), a_in_mem); |
130 | write_to_dnnl_memory(b_data.data(), b_in_mem); |
131 | |
132 | // Create primitive descriptor. |
133 | auto matmul_pd = matmul::primitive_desc(engine, a_md, b_md, c_md); |
134 | |
135 | // Repack and convert input data. |
136 | auto a_mem = memory(matmul_pd.src_desc(), engine); |
137 | reorder(a_in_mem, a_mem).execute(engine_stream, a_in_mem, a_mem); |
138 | |
139 | auto b_mem = memory(matmul_pd.weights_desc(), engine); |
140 | reorder(b_in_mem, b_mem).execute(engine_stream, b_in_mem, b_mem); |
141 | |
142 | auto c_mem = memory(matmul_pd.dst_desc(), engine); |
143 | |
144 | // Create the primitive. |
145 | auto matmul_prim = matmul(matmul_pd); |
146 | |
147 | // Start output. |
148 | if (!quick_test) print_test_case(type, dims); |
149 | |
150 | // Primitive arguments. |
151 | std::unordered_map<int, memory> matmul_args; |
152 | matmul_args.insert({DNNL_ARG_SRC, a_mem}); |
153 | matmul_args.insert({DNNL_ARG_WEIGHTS, b_mem}); |
154 | matmul_args.insert({DNNL_ARG_DST, c_mem}); |
155 | |
156 | // Warmup executions. |
157 | matmul_prim.execute(engine_stream, matmul_args); |
158 | engine_stream.wait(); |
159 | |
160 | auto start_first = std::chrono::steady_clock::now(); |
161 | matmul_prim.execute(engine_stream, matmul_args); |
162 | engine_stream.wait(); |
163 | auto end_first = std::chrono::steady_clock::now(); |
164 | |
165 | std::chrono::duration<double> dur_first = end_first - start_first; |
166 | |
167 | if (quick_test) return dur_first.count(); |
168 | |
169 | int runs = std::max(min_runs, int(time_limit / dur_first.count())); |
170 | |
171 | // Timing runs. |
172 | auto start = std::chrono::steady_clock::now(); |
173 | |
174 | for (int i = 0; i <= runs; i++) |
175 | matmul_prim.execute(engine_stream, matmul_args); |
176 | engine_stream.wait(); |
177 | |
178 | auto end = std::chrono::steady_clock::now(); |
179 | |
180 | std::chrono::duration<double> duration = end - start; |
181 | |
182 | // Display the result. |
183 | double avg_time = (duration.count() - dur_first.count()) / runs; |
184 | double total_ops = double(dims.m) * double(dims.n) * double(dims.k) * 2; |
185 | double perf = (total_ops / avg_time) * 1e-9; |
186 | |
187 | auto scale_string = "G" ; |
188 | auto unit_string = is_integer ? "Op/s" : "Flop/s" ; |
189 | |
190 | if (perf >= 1000) { |
191 | perf /= 1000; |
192 | scale_string = "T" ; |
193 | } |
194 | |
195 | std::cout << perf << ' ' << scale_string << unit_string << std::endl; |
196 | |
197 | return avg_time; |
198 | } |
199 | |
200 | void run(engine::kind engine_kind, dt type, gemm_dims_t dims, |
201 | double time_limit) { |
202 | try { |
203 | if (dims.m * dims.n != 0) { |
204 | // Dimensions manually specified by user. |
205 | run_case(engine_kind, type, dims, time_limit); |
206 | } else { |
207 | // Automatically choose dimensions to fit time limit. |
208 | int mnk = 128; |
209 | const int max_mnk = 8192; |
210 | |
211 | while (mnk < max_mnk) { |
212 | dims.m = dims.n = dims.k = mnk; |
213 | double time1 = run_case(engine_kind, type, dims); |
214 | double nruns_est = std::max(1., time_limit / time1); |
215 | double mnk_expand = std::exp2( |
216 | std::round(std::log2(nruns_est / min_runs) / 3.)); |
217 | if (mnk_expand <= 1) break; |
218 | mnk = static_cast<int>( |
219 | std::min<double>(max_mnk, mnk * mnk_expand)); |
220 | } |
221 | |
222 | dims.m = dims.n = dims.k = mnk; |
223 | run_case(engine_kind, type, dims, time_limit); |
224 | } |
225 | } catch (dnnl::error &e) { |
226 | // Catch and report unimplemented cases. |
227 | if (e.status == dnnl_unimplemented) { |
228 | print_test_case(type, dims); |
229 | std::cout << "unsupported" << std::endl; |
230 | } else |
231 | throw; |
232 | } |
233 | } |
234 | |
// Prints the command-line usage text to std::cerr and then always throws
// std::invalid_argument; this function never returns normally.
void bad_args() {
    std::cerr << "Usage: matmul-perf-cpp [cpu|gpu]\n";
    std::cerr << " matmul-perf-cpp [cpu|gpu] <size>\n";
    std::cerr << " matmul-perf-cpp [cpu|gpu] <m> <n> <k>\n";
    std::cerr << "If a single <size> is specified, it is used for all three "
              << "dimensions (m/n/k).\n";

    throw std::invalid_argument("Incorrect input arguments.");
}
243 | |
244 | void matmul_perf(engine::kind engine_kind, int argc, char **argv) { |
245 | gemm_dims_t dims = {0, 0, 0}; |
246 | |
247 | if (argc > 2) { |
248 | if (argc == 3) |
249 | dims.m = dims.n = dims.k = std::atoi(argv[2]); |
250 | else if (argc == 5) { |
251 | dims.m = std::atoi(argv[2]); |
252 | dims.n = std::atoi(argv[3]); |
253 | dims.k = std::atoi(argv[4]); |
254 | } else |
255 | bad_args(); |
256 | |
257 | if (dims.m <= 0 || dims.n <= 0 || dims.k <= 0) bad_args(); |
258 | } |
259 | |
260 | run(engine_kind, dt::f32, dims, 2.0); |
261 | run(engine_kind, dt::f16, dims, 2.0); |
262 | run(engine_kind, dt::bf16, dims, 2.0); |
263 | run(engine_kind, dt::s8, dims, 2.0); |
264 | } |
265 | |
266 | int main(int argc, char **argv) { |
267 | return handle_example_errors( |
268 | matmul_perf, parse_engine_kind(argc, argv, 3), argc, argv); |
269 | } |
270 | |