1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #include <array> |
17 | #include <cstdlib> |
18 | #include <fstream> |
19 | #include <future> |
20 | #include <random> |
21 | #include <string> |
22 | |
23 | #include "Bench.h" |
24 | |
25 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
26 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
27 | |
28 | #include "llvm/Support/CommandLine.h" |
29 | #include "llvm/Support/FileSystem.h" |
30 | #include "llvm/Support/Signals.h" |
31 | |
32 | #include "tests/unittests/BackendTestUtils.h" |
33 | |
34 | using namespace glow; |
35 | |
36 | /* |
37 | * This class implements a Int8 Quantized GEMM/FC microbenchmark. There are a |
38 | * set of (m x k) * (k x n) = (m x n) matrix multiplications, chained together |
39 | * in multiple layers. |
40 | * |
41 | * Microbenchmarks are generally useful for understanding performance |
 * through targeted experimentation and are not representative of
43 | * end-to-end workloads. |
44 | */ |
// TODO: Move all the args passed by command line to LLVM options.
// Category grouping this benchmark's LLVM command-line options in -help.
llvm::cl::OptionCategory int8GemmBenchCat("Int8GemmBench Category" );
// When set (-check-results), the benchmark builds a second copy of the
// network on the Interpreter backend and compares its output tensor against
// the device-under-test after every run.
llvm::cl::opt<bool> checkCorrectness(
    "check-results" ,
    llvm::cl::desc("Check the correctness of the results against the reference "
                   "backend (Interpreter)" ),
    llvm::cl::Optional, llvm::cl::init(false), llvm::cl::cat(int8GemmBenchCat));
52 | |
/// Parameters for one run of the chained Int8 GEMM/FC benchmark:
/// (m x k) * (k x n) = (m x n) matmuls stacked numLayers_ deep.
struct Int8GemmParam {
  dim_t m_;                // Rows of the activation matrix.
  dim_t n_;                // Output features of each FC layer.
  dim_t k_;                // Input features of each FC layer.
  dim_t numLayers_;        // Number of chained FC layers.
  dim_t numReps_;          // Number of timed benchmark repetitions.
  dim_t numAsyncLaunches_; // Concurrent inference requests per repetition.
  dim_t numSplits_;        // Parallelization factor for FC/(De)Quantize nodes.
  std::string backendStr_; // Backend name for the device under test.
  std::string devId_;      // Optional device id ("" means unspecified).
};
64 | |
65 | class Int8GemmBench : public Benchmark { |
66 | Int8GemmParam param_; |
67 | ExecutionContext context_; |
68 | PlaceholderBindings &bindings_; |
69 | std::unique_ptr<runtime::HostManager> hostManager_; |
70 | |
71 | // Refernce bindings and network: |
72 | ExecutionContext refContext_; |
73 | PlaceholderBindings &refBindings_; |
74 | std::unique_ptr<runtime::HostManager> refHostManager_; |
75 | |
76 | public: |
77 | explicit Int8GemmBench(Int8GemmParam param_) |
78 | : param_(param_), bindings_(*context_.getPlaceholderBindings()), |
79 | refBindings_(*refContext_.getPlaceholderBindings()) {} |
80 | |
81 | void addInt8GemmNode(std::unique_ptr<Module> &mod, Function *fn, |
82 | Int8GemmParam param, bool isRef) { |
83 | |
84 | PlaceholderBindings &bindings = isRef ? refBindings_ : bindings_; |
85 | auto *input = mod->createPlaceholder(ElemKind::Float16Ty, |
86 | {param.m_, param.k_}, "input" , false); |
87 | bindings.allocate(input)->getHandle<float16>().randomize(-5.f, 5.f, |
88 | mod->getPRNG()); |
89 | auto *output = mod->createPlaceholder( |
90 | ElemKind::Float16Ty, {param.m_, param.n_}, "output" , false); |
91 | auto *q_input = fn->createQuantize( |
92 | "int8_quantize" , input, |
93 | mod->uniqueType(ElemKind::Int8QTy, {param.m_, param.k_}, 1.0, 0)); |
94 | Node *cur = q_input; |
95 | |
96 | Placeholder *ones; |
97 | if (param.k_ > param.n_) { |
98 | ones = mod->createPlaceholder(ElemKind::Int8QTy, |
99 | {param.m_ * (param.k_ - param.n_)}, 1.0, 0, |
100 | "ones" , false); |
101 | bindings.allocate(ones)->getHandle<int8_t>().clear(1); |
102 | } |
103 | |
104 | Placeholder *weights; |
105 | Placeholder *bias; |
106 | |
107 | // Create multiple layers of FC nodes |
108 | for (size_t layer = 0; layer < param.numLayers_; layer++) { |
109 | weights = |
110 | mod->createPlaceholder(ElemKind::Int8QTy, {param.k_, param.n_}, 1.0, |
111 | 0, "weights" + std::to_string(layer), false); |
112 | bias = mod->createPlaceholder(ElemKind::Int32QTy, {param.n_}, 1.0, 0, |
113 | "bias" + std::to_string(layer), false); |
114 | |
115 | bindings.allocate(weights)->getHandle<int8_t>().randomize(-128, 127, |
116 | mod->getPRNG()); |
117 | bindings.allocate(bias)->getHandle<int32_t>().randomize(-128, 127, |
118 | mod->getPRNG()); |
119 | |
120 | Node *fc; |
121 | fc = fn->createFullyConnected("fc_" + std::to_string(layer), cur, weights, |
122 | bias); |
123 | cur = fc; |
124 | |
125 | // Handle non-square cases |
126 | if (param.k_ > param.n_ && layer < (param.numLayers_ - 1)) { |
127 | Node *reshape1 = fn->createReshape("reshape1_" + std::to_string(layer), |
128 | fc, {param.m_ * param.n_}); |
129 | Node *concat = fn->createConcat("concat_" + std::to_string(layer), |
130 | {reshape1, ones}, 0); |
131 | Node *reshape2 = fn->createReshape("reshape2_" + std::to_string(layer), |
132 | concat, {param.m_, param.k_}); |
133 | cur = reshape2; |
134 | } else if (param.k_ < param.n_ && layer < (param.numLayers_ - 1)) { |
135 | Node *slice = fn->createSlice("slice_" + std::to_string(layer), fc, |
136 | {0, 0}, {param.m_, param.k_}); |
137 | cur = slice; |
138 | } |
139 | } |
140 | auto *dequantized_fc = fn->createDequantize( |
141 | "int8_dequantize" , cur, |
142 | mod->uniqueType(ElemKind::Float16Ty, {param.m_, param.n_})); |
143 | cur = dequantized_fc; |
144 | fn->createSave("save1" , cur, output); |
145 | bindings.allocate(output); |
146 | ::glow::convertPlaceholdersToConstants(fn, bindings, {input, output}); |
147 | } |
148 | |
149 | void parallelize(Function *fn) { |
150 | // Model parallelize FCs |
151 | llvm::DenseMap<Node *, size_t> numOfChunks; |
152 | llvm::DenseMap<Node *, ParallelTransformKind> parOpts; |
153 | for (auto &N : fn->getNodes()) { |
154 | if (N.getKind() == Kinded::Kind::FullyConnectedNodeKind) { |
155 | numOfChunks[&N] = param_.numSplits_; |
156 | parOpts[&N] = ParallelTransformKind::Model; |
157 | } |
158 | } |
159 | |
160 | // Parallelize Quantize/Dequantize |
161 | for (auto &N : fn->getNodes()) { |
162 | if (N.getKind() == Kinded::Kind::QuantizeNodeKind || |
163 | N.getKind() == Kinded::Kind::DequantizeNodeKind) { |
164 | numOfChunks[&N] = param_.numSplits_; |
165 | parOpts[&N] = ParallelTransformKind::Data; |
166 | } |
167 | } |
168 | EXIT_ON_ERR(parallelizeOps(fn, numOfChunks, parOpts, 1)); |
169 | } |
170 | |
171 | void setup_internal(bool isRef) { |
172 | // Setup host manager |
173 | std::string backendStr = isRef ? "Interpreter" : param_.backendStr_.c_str(); |
174 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
175 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr); |
176 | if (param_.devId_ != "" ) { |
177 | config->parameters["DeviceID" ] = param_.devId_.c_str(); |
178 | } |
179 | configs.push_back(std::move(config)); |
180 | if (isRef) { |
181 | refHostManager_ = |
182 | glow::make_unique<runtime::HostManager>(std::move(configs)); |
183 | } else { |
184 | hostManager_ = |
185 | glow::make_unique<runtime::HostManager>(std::move(configs)); |
186 | } |
187 | |
188 | std::unique_ptr<Module> mod(new Module); |
189 | auto fn = mod->createFunction("singleNode" ); |
190 | |
191 | addInt8GemmNode(mod, fn, param_, isRef); |
192 | parallelize(fn); |
193 | optimize(fn, CompilationMode::Infer); |
194 | |
195 | CompilationContext ctx; |
196 | ctx.dumpFinalGraph = true; |
197 | if (isRef) { |
198 | EXIT_ON_ERR(refHostManager_->addNetwork(std::move(mod), ctx)); |
199 | } else { |
200 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
201 | } |
202 | } |
203 | |
204 | void setup() override { |
205 | if (checkCorrectness) { |
206 | setup_internal(/* isRef */ true); |
207 | } |
208 | setup_internal(/* isRef */ false); |
209 | } |
210 | |
211 | void checkOutput() { |
212 | // First run on the reference backend |
213 | dispatchInference("singleNode" , refHostManager_.get(), refContext_, |
214 | param_.numAsyncLaunches_, |
215 | /*useNewExecutionContext*/ true); |
216 | Tensor *refTensor = |
217 | refBindings_.get(refBindings_.getPlaceholderByNameSlow("output" )); |
218 | CHECK(refTensor) << "Reference Tensor not found" ; |
219 | |
220 | Tensor *noRefTensor = |
221 | bindings_.get(bindings_.getPlaceholderByNameSlow("output" )); |
222 | CHECK(noRefTensor) << "non-reference Tensor not found" ; |
223 | |
224 | // Compare the tensors |
225 | if (!noRefTensor->isEqual(*refTensor)) { |
226 | noRefTensor->dump(); |
227 | refTensor->dump(); |
228 | LOG(FATAL) << "Tensors don't match\n" ; |
229 | } else { |
230 | LOG(INFO) << "Tensors match\n" ; |
231 | } |
232 | } |
233 | |
234 | void run() override { |
235 | dispatchInference("singleNode" , hostManager_.get(), context_, |
236 | param_.numAsyncLaunches_, |
237 | /*useNewExecutionContext*/ true); |
238 | if (checkCorrectness) { |
239 | checkOutput(); |
240 | } |
241 | } |
242 | |
243 | void teardown() override {} |
244 | |
245 | double gops() const { |
246 | return 2.0 * param_.m_ * param_.n_ * param_.k_ * param_.numLayers_ / 1e9; |
247 | } |
248 | }; |
249 | |
250 | #define DEVICE_ID 9 |
251 | |
252 | Int8GemmParam parseArgs(int argc, char *argv[]) { |
253 | Int8GemmParam param; |
254 | |
255 | param.m_ = atoi(argv[1]); |
256 | param.n_ = atoi(argv[2]); |
257 | param.k_ = atoi(argv[3]); |
258 | param.numLayers_ = atoi(argv[4]); |
259 | param.numReps_ = atoi(argv[5]); |
260 | param.numAsyncLaunches_ = atoi(argv[6]); |
261 | param.numSplits_ = atoi(argv[7]); |
262 | param.backendStr_ = std::string(argv[8]); |
263 | |
264 | printf("m %zu\n" , (size_t)param.m_); |
265 | printf("n %zu\n" , (size_t)param.n_); |
266 | printf("k %zu\n" , (size_t)param.k_); |
267 | printf("numLayers %zu\n" , (size_t)param.numLayers_); |
268 | printf("numReps %zu\n" , (size_t)param.numReps_); |
269 | printf("numAsyncLaunches %zu\n" , (size_t)param.numAsyncLaunches_); |
270 | printf("numSplits %zu\n" , (size_t)param.numSplits_); |
271 | printf("backendStr %s\n" , param.backendStr_.c_str()); |
272 | |
273 | if (argc > DEVICE_ID) { |
274 | printf("devId %s\n" , argv[DEVICE_ID]); |
275 | param.devId_ = std::string(argv[DEVICE_ID]); |
276 | } else { |
277 | param.devId_ = std::string("" ); |
278 | } |
279 | printf("\n\n" ); |
280 | return param; |
281 | } |
282 | |
283 | int main(int argc, char *argv[]) { |
284 | printf("GEMM Microbenchmark\n" ); |
285 | printf("Usage: GemmBench m(Int) n(Int) k(Int) numLayers(Int) numReps(Int) " |
286 | "numAsyncLaunches(Int) numSplits(Int) backendStr(String) " |
287 | "dev_id(Int)\n" ); |
288 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
289 | "environment variable\n" ); |
290 | benchParseGlowOpts(argc, argv); |
291 | |
292 | std::vector<Int8GemmParam> params; |
293 | std::string ; |
294 | std::string runPrefix; |
295 | |
296 | // Using a config file |
297 | if (argc == 2) { |
298 | auto fname = std::string(argv[1]); |
299 | std::ifstream fin(fname.c_str()); |
300 | if (!fin) { |
301 | std::cout << "Could not open file: " << fname << std::endl; |
302 | exit(0); |
303 | } |
304 | std::string line; |
305 | while (getline(fin, line)) { |
306 | std::array<char, 1024> buf; |
307 | char *saveptr = nullptr; |
308 | std::vector<char *> argVec; |
309 | strcpy(buf.data(), line.c_str()); |
310 | char *ptr = strtok_r(buf.data(), " " , &saveptr); |
311 | while (ptr != nullptr) { |
312 | argVec.push_back(ptr); |
313 | ptr = strtok_r(nullptr, " " , &saveptr); |
314 | } |
315 | Int8GemmParam param = parseArgs(argVec.size(), argVec.data()); |
316 | params.push_back(param); |
317 | runHeader = std::string("_,benchName,_,filename" ); |
318 | runPrefix = std::string(strFormat("GemmBench,SW,%s" , fname.c_str())); |
319 | } |
320 | } else if (argc == 9 || argc == 10) { |
321 | Int8GemmParam param = parseArgs(argc, argv); |
322 | params.push_back(param); |
323 | runHeader = std::string( |
324 | "_,benchName,_,m,n,k,numLayers,numReps,numAsyncLaunches,numSplits," |
325 | "backendStr\n" ); |
326 | runPrefix = std::string(strFormat( |
327 | "GemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s" , (size_t)param.m_, |
328 | (size_t)param.n_, (size_t)param.k_, (size_t)param.numLayers_, |
329 | (size_t)param.numReps_, (size_t)param.numAsyncLaunches_, |
330 | (size_t)param.numSplits_, argv[8])); |
331 | } else { |
332 | llvm_unreachable("Invalid command line" ); |
333 | } |
334 | |
335 | for (auto param : params) { |
336 | Int8GemmBench b(param); |
337 | auto times = bench(&b, param.numReps_); |
338 | |
339 | printf("%s,runtime,gflopPerSec\n" , runHeader.c_str()); |
340 | for (auto t : times) { |
341 | printf("BenchResult,%s,%f,%f\n" , runPrefix.c_str(), |
342 | t / param.numAsyncLaunches_, |
343 | b.gops() * param.numAsyncLaunches_ / t); |
344 | } |
345 | double min = *(std::min_element(times.begin(), times.end())); |
346 | dim_t midElt = times.size() / 2; |
347 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
348 | double median = times[midElt]; |
349 | double medianRuntime = median / ((double)param.numAsyncLaunches_); |
350 | double minRuntime = min / ((double)param.numAsyncLaunches_); |
351 | printf("%s,medianRuntime,minRuntime,medianGflopPerSec,maxGflopPerSec\n" , |
352 | runHeader.c_str()); |
353 | printf("BenchSummary,%s,%f,%f,%f,%f\n" , runPrefix.c_str(), medianRuntime, |
354 | minRuntime, b.gops() / medianRuntime, b.gops() / minRuntime); |
355 | } |
356 | } |
357 | |