/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

/*
 * This class implements a performance proxy for a single layer of
 * the BERT network.
 */
class BERTProxyLayerBench : public Benchmark {
  dim_t maxSequenceLength_;
  dim_t batchSize_;
  dim_t hiddenSize_;
  dim_t numHeads_;
  dim_t numCores_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  std::vector<std::unique_ptr<ExecutionContext>> contexts_;
  dim_t asyncLaunchSize_;
  const char *backendStr_;
  ElemKind dtype_;
  ElemKind FCWeightType_;
  ElemKind FCBiasType_;
  float FCWeightScale_;
  int32_t FCWeightOffset_;
  bool quantize;

public:
  BERTProxyLayerBench(dim_t maxSequenceLength_, dim_t batchSize_,
                      dim_t hiddenSize_, dim_t numHeads_, dim_t numCores_,
                      dim_t asyncLaunchSize_, const char *backendStr_,
                      const char *dtypeStr_, const char *useInt8FCs)
      : maxSequenceLength_(maxSequenceLength_), batchSize_(batchSize_),
        hiddenSize_(hiddenSize_), numHeads_(numHeads_), numCores_(numCores_),
        asyncLaunchSize_(asyncLaunchSize_), backendStr_(backendStr_) {

    // Default to Float16 everywhere. This also covers an unrecognized
    // dtypeStr_, so the FC weight/bias types are never left uninitialized.
    dtype_ = ElemKind::Float16Ty;
    FCWeightType_ = ElemKind::Float16Ty;
    FCBiasType_ = ElemKind::Float16Ty;
    FCWeightScale_ = 1.0f;
    FCWeightOffset_ = 0;
    quantize = false;
    if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      FCWeightType_ = ElemKind::FloatTy;
      FCBiasType_ = ElemKind::FloatTy;
    }
    // If quantization is requested then use Int8 weights and Int32 biases.
    if (std::string(useInt8FCs) == "True") {
      FCWeightType_ = ElemKind::Int8QTy;
      FCBiasType_ = ElemKind::Int32QTy;
      quantize = true;
      FCWeightScale_ = 1.0f;
      FCWeightOffset_ = 128;
    }
  }

  // Handle different tensor types. The RNG is taken by reference so its
  // state advances across calls and each tensor gets distinct values.
  void randomizeTensor(Tensor *tn, PseudoRNG &rng) {
    if (tn->getElementType() == ElemKind::FloatTy) {
      tn->getHandle<float_t>().randomize(0.0f, 1.0f, rng);
    } else if (tn->getElementType() == ElemKind::Float16Ty) {
      tn->getHandle<float16_t>().randomize(0.0f, 1.0f, rng);
    } else if (tn->getElementType() == ElemKind::Int8QTy) {
      tn->getHandle<int8_t>().randomize(-127, 127, rng);
    } else if (tn->getElementType() == ElemKind::Int32QTy) {
      tn->getHandle<int32_t>().randomize(-128, 128, rng);
    }
  }

  // Handle different tensor types
  void setTensor(Tensor *tn, float val) {
    if (tn->getElementType() == ElemKind::FloatTy) {
      tn->getHandle<float_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Float16Ty) {
      tn->getHandle<float16_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Int8QTy) {
      tn->getHandle<int8_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Int32QTy) {
      tn->getHandle<int32_t>().clear(val);
    }
  }

  Node *createFC(Function *fn, std::unique_ptr<Module> &mod,
                 const std::string &name, Node *In, Constant *W, Constant *b) {
    // Optionally add nodes for quantization of FCs
    if (quantize) {
      TypeRef InQTy = mod->uniqueType(FCWeightType_, In->dims(0), 2.0f, -128);
      auto *InQ = fn->createQuantize(name, In, InQTy);
      auto *FCQ = fn->createFullyConnected(name, InQ, W, b);
      TypeRef FCQTy = mod->uniqueType(dtype_, FCQ->dims(0));
      Node *FCO = fn->createDequantize(name, FCQ, FCQTy);
      return FCO;
    } else {
      return fn->createFullyConnected(name, In, W, b);
    }
  }
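
  // Sketch of what createFC builds in quantized mode: the FC is wrapped as
  //   In (dtype_) -> Quantize (Int8) -> FullyConnected (Int8 weights,
  //   Int32 bias) -> Dequantize (dtype_)
  // so the surrounding graph always sees floating-point tensors; only the
  // GEMM itself runs in int8.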

  void setup() override {

    // Create execution contexts here
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      std::unique_ptr<ExecutionContext> context(new ExecutionContext);
      contexts_.push_back(std::move(context));
    }

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

    // Input Placeholder ((maxSequenceLength*batchSize) x hiddenSize) (split)
    Placeholder *input = mod->createPlaceholder(
        dtype_, {maxSequenceLength_ * batchSize_, hiddenSize_}, "input", false);

    // For each context, add input bindings.
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      randomizeTensor(contexts_[i]->getPlaceholderBindings()->allocate(input),
                      mod->getPRNG());
    }

    // Weights/bias constants for QKV GEMM
    Tensor W_QKV_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, 3 * hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, 3 * hiddenSize_});
    randomizeTensor(&W_QKV_Tensor, mod->getPRNG());
    Constant *W_QKV = mod->createConstant("W_QKV", W_QKV_Tensor);
    Tensor b_QKV_Tensor = (quantize) ? Tensor(FCBiasType_, {3 * hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {3 * hiddenSize_});
    setTensor(&b_QKV_Tensor, 0.0f);
    Constant *b_QKV = mod->createConstant("b_QKV", b_QKV_Tensor);

    // Weights/bias constants for ZxWo FC
    Tensor W_ZWO_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, hiddenSize_});
    randomizeTensor(&W_ZWO_Tensor, mod->getPRNG());
    Constant *W_ZWO = mod->createConstant("W_ZWO", W_ZWO_Tensor);
    Tensor b_ZWO_Tensor = (quantize) ? Tensor(FCBiasType_, {hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {hiddenSize_});
    randomizeTensor(&b_ZWO_Tensor, mod->getPRNG());
    Constant *b_ZWO = mod->createConstant("b_ZWO", b_ZWO_Tensor);

    // Constant scaling factor
    float sqrt_dk_flt =
        (float)(1.0 / std::sqrt(((double)hiddenSize_) / ((double)numHeads_)));
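
    // For example, with a hypothetical hiddenSize_ = 1024 and numHeads_ = 16,
    // each head has d_k = 64, so sqrt_dk_flt = 1 / sqrt(64) = 0.125.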

    // Softmax expected output; not needed for inference.
    Tensor expected_Tensor(ElemKind::Int64ITy,
                           {maxSequenceLength_ * batchSize_, 1});
    Constant *expected = mod->createConstant("expected", expected_Tensor);

    // Weights/bias constants for FC1
    Tensor W_FC1_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, 4 * hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, 4 * hiddenSize_});
    randomizeTensor(&W_FC1_Tensor, mod->getPRNG());
    Constant *W_FC1 = mod->createConstant("W_FC1", W_FC1_Tensor);
    Tensor b_FC1_Tensor = (quantize) ? Tensor(FCBiasType_, {4 * hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {4 * hiddenSize_});
    randomizeTensor(&b_FC1_Tensor, mod->getPRNG());
    Constant *b_FC1 = mod->createConstant("b_FC1", b_FC1_Tensor);

    // Weights/bias constants for FC2
    Tensor W_FC2_Tensor =
        (quantize) ? Tensor(FCWeightType_, {4 * hiddenSize_, hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {4 * hiddenSize_, hiddenSize_});
    randomizeTensor(&W_FC2_Tensor, mod->getPRNG());
    Constant *W_FC2 = mod->createConstant("W_FC2", W_FC2_Tensor);
    Tensor b_FC2_Tensor = (quantize) ? Tensor(FCBiasType_, {hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {hiddenSize_});
    randomizeTensor(&b_FC2_Tensor, mod->getPRNG());
    Constant *b_FC2 = mod->createConstant("b_FC2", b_FC2_Tensor);

    // batchSizePerCore is the number of sentences assigned to each
    // core (each data-parallel chunk)
    auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);

    // rowSizePerCore is the number of tokens assigned to each
    // core (each data-parallel chunk)
    dim_t numNonzeroCores = 0;
    std::vector<dim_t> rowSizePerCore;
    for (dim_t i = 0; i < batchSizePerCore.size(); i++) {
      if (batchSizePerCore[i] > 0) {
        rowSizePerCore.push_back(batchSizePerCore[i] * maxSequenceLength_);
        numNonzeroCores++;
      }
    }
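
    // Example (assuming getBatchSizePerCore spreads the remainder over the
    // first cores): batchSize_ = 4, numCores_ = 3 gives batchSizePerCore =
    // {2, 1, 1}; with maxSequenceLength_ = 128 that yields rowSizePerCore =
    // {256, 128, 128} and numNonzeroCores = 3.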

    // Split the batch across cores in a data-parallel fashion
    std::vector<SliceNode *> inputs(numNonzeroCores);
    std::vector<SaveNode *> S(numNonzeroCores);

    // Split the input into one data-parallel chunk per active core.
    fn->createSplit("DPsplit", input, numNonzeroCores, 0, rowSizePerCore,
                    inputs);

    // For each core (sub-batch), create a network which does one layer
    for (int core = 0; core < int(numNonzeroCores); core++) {

      // Layer Norm 1 bias and scale
      Tensor LN1_scale_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN1_scale_Tensor, mod->getPRNG());
      Constant *LN1_scale = mod->createConstant("LN1_scale", LN1_scale_Tensor);
      Tensor LN1_bias_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN1_bias_Tensor, mod->getPRNG());
      Constant *LN1_bias = mod->createConstant("LN1_bias", LN1_bias_Tensor);

      // Layer Norm 2 bias and scale
      Tensor LN2_scale_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN2_scale_Tensor, mod->getPRNG());
      Constant *LN2_scale = mod->createConstant("LN2_scale", LN2_scale_Tensor);
      Tensor LN2_bias_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN2_bias_Tensor, mod->getPRNG());
      Constant *LN2_bias = mod->createConstant("LN2_bias", LN2_bias_Tensor);

      // QKV GEMM
      auto *QKV = createFC(fn, mod, strFormat("Gemm_QKV_core%d", core),
                           inputs[core], W_QKV, b_QKV);

      // Split into Q, K, V
      std::vector<SliceNode *> outputs(3);
      fn->createSplit(strFormat("split_core%d", core), QKV, 3, 1, {}, outputs);
      SliceNode *Q = outputs[0];
      SliceNode *K = outputs[1];
      SliceNode *V = outputs[2];

      // Multi-headed attention split
      std::vector<SliceNode *> Qsplits(numHeads_); // maxSequenceLength x 64
      std::vector<SliceNode *> Ksplits(numHeads_); // maxSequenceLength x 64
      std::vector<SliceNode *> Vsplits(numHeads_); // maxSequenceLength x 64
      std::vector<NodeValue> Zsplits(numHeads_);   // maxSequenceLength x 64
      fn->createSplit(strFormat("splitQ_core%d", core), Q, numHeads_, 1, {},
                      Qsplits);
      fn->createSplit(strFormat("splitK_core%d", core), K, numHeads_, 1, {},
                      Ksplits);
      fn->createSplit(strFormat("splitV_core%d", core), V, numHeads_, 1, {},
                      Vsplits);
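
      // Per head i and sentence b, the loop below computes standard scaled
      // dot-product attention:
      //   Z[i][b] = softmax((Q[i][b] * K[i][b]^T) * sqrt_dk_flt) * V[i][b]
      // The per-sentence results are concatenated back along the batch
      // dimension, then across heads along the hidden dimension.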

      for (int i = 0; i < int(numHeads_); i++) {
        // Split the subbatch into individual sentences for the
        // batch matmul
        std::vector<SliceNode *> QBatchSplits(batchSizePerCore[core]);
        std::vector<SliceNode *> KBatchSplits(batchSizePerCore[core]);
        std::vector<SliceNode *> VBatchSplits(batchSizePerCore[core]);
        std::vector<NodeValue> ZBatchSplits(batchSizePerCore[core]);

        fn->createSplit(strFormat("splitBatchQ_core%d", core), Qsplits[i],
                        batchSizePerCore[core], 0, {}, QBatchSplits);
        fn->createSplit(strFormat("splitBatchK_core%d", core), Ksplits[i],
                        batchSizePerCore[core], 0, {}, KBatchSplits);
        fn->createSplit(strFormat("splitBatchV_core%d", core), Vsplits[i],
                        batchSizePerCore[core], 0, {}, VBatchSplits);

        // BatchMatMul
        for (int b = 0; b < int(batchSizePerCore[core]); b++) {

          auto *Kt =
              fn->createTranspose(strFormat("transpose_core%d_%d", core, i),
                                  KBatchSplits[b], {1, 0});
          // Tmp = Q * K^T
          auto *tmp =
              fn->createMatMul(strFormat("matmul_Q_KT_core%d_%d", core, i),
                               QBatchSplits[b], Kt->getResult());

          // Softmax_output = softmax(Tmp / sqrt(dk))
          auto *sqrt_dk_splat =
              fn->createSplat(strFormat("sqrt_dk_core%d_%d", core, i),
                              tmp->getResult().getType(), sqrt_dk_flt);
          auto *tmp_div = fn->createMul(strFormat("div_core%d_%d", core, i),
                                        tmp, sqrt_dk_splat);
          auto *softmax_output = fn->createSoftMax(
              strFormat("softmax_core%d_%d", core, i), tmp_div, expected);

          ZBatchSplits[b] =
              fn->createMatMul(strFormat("matmul_tmp_v_core%d_%d", core, i),
                               softmax_output, VBatchSplits[b]);
        }

        // Concatenate all the Z matrices for the whole subbatch
        Zsplits[i] =
            fn->createConcat(strFormat("concat_core%d", core), ZBatchSplits, 0);
      }

      // Concatenate all the Z matrices that we previously split on the hidden
      // dimension
      auto *Z = fn->createConcat(strFormat("concat_core%d", core), Zsplits, 1);

      // Z x W_o
      auto *ZWO = createFC(fn, mod, strFormat("Gemm_ZWO_core%d", core), Z,
                           W_ZWO, b_ZWO);

      // Layer norm
      auto *ZWO_norm = fn->createLayerNormalization(
          strFormat("LayerNorm1_core%d", core), ZWO->getNthResult(0).getType(),
          ZWO, LN1_scale, LN1_bias, 1e-5);

      // FC1
      auto *FC1 = createFC(fn, mod, strFormat("Gemm_FC1_core%d", core),
                           ZWO_norm, W_FC1, b_FC1);

      // Create gelu
      auto *FC1_gelu = fn->createGELU(strFormat("GELU_FC1_core%d", core), FC1);

      // FC2
      auto *FC2 = createFC(fn, mod, strFormat("Gemm_FC2_core%d", core),
                           FC1_gelu, W_FC2, b_FC2);

      // Layer norm
      auto *FC2_norm = fn->createLayerNormalization(
          strFormat("LayerNorm2_core%d", core), FC2->getNthResult(0).getType(),
          FC2, LN2_scale, LN2_bias, 1e-5);

      // Save result
      S[core] = fn->createSave(strFormat("save_core%d", core), FC2_norm);
      for (int i = 0; i < int(asyncLaunchSize_); i++) {
        contexts_[i]->getPlaceholderBindings()->allocate(
            S[core]->getPlaceholder());
      }
    } // For each core

    // Special case for batch-1: use model parallelism for the FCs.
    if ((batchSize_ == 1) && (numCores_ > 1)) {
      executeVerticalFCWeightsSplit(fn, numCores_, hiddenSize_);
    }
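
    // Note: executeVerticalFCWeightsSplit slices each FC's weight matrix
    // column-wise into numCores_ chunks so a single sentence can still
    // occupy every core (model parallelism rather than data parallelism);
    // the third argument appears to set a minimum size threshold below
    // which an FC is left unsplit.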

    CompilationContext ctx;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
    fn->dumpDAG(std::string("BERT.dot"));
  }

  void run() override {
    std::vector<std::unique_ptr<ExecutionContext>> localContexts(
        asyncLaunchSize_);
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch a number of parallel requests
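    // Each of the asyncLaunchSize_ contexts is dispatched concurrently, so
    // bench() times the whole group; main() divides the measured time by
    // numAsyncLaunches to report per-request numbers.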
    int i = 0;
    for (auto &promise : promises) {
      futures.push_back(promise.get_future());
      hostManager_->runNetwork(
          "singleNode", std::move(contexts_[i]),
          [&localContexts, &promise,
           i](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
            EXIT_ON_ERR(std::move(err));
            localContexts[i] = std::move(contextPtr);
            promise.set_value();
          });
      i++;
    }
    for (auto &fut : futures) {
      fut.wait();
    }
    for (dim_t j = 0; j < asyncLaunchSize_; j++) {
      contexts_[j] = std::move(localContexts[j]);
    }
  }

  void teardown() override {}

  // Only counting GEMMs
  double gflops() const {
    double num_flops = 0.0;

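    // Worked example with hypothetical values maxSequenceLength_ = 128,
    // hiddenSize_ = 1024, numHeads_ = 16, batchSize_ = 1: QKV ~0.81 +
    // BMM ~0.07 + ZWO ~0.27 + FC1 ~1.07 + FC2 ~1.07, roughly 3.3 GFLOPs.
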
    // QKV
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 3 * hiddenSize_;

    // BMM
    num_flops += 2.0 * hiddenSize_ * maxSequenceLength_ * maxSequenceLength_;
    num_flops += 2.0 * hiddenSize_ * maxSequenceLength_ * maxSequenceLength_;

    // ZWO
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * hiddenSize_;

    // FC1
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 4 * hiddenSize_;

    // FC2
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 4 * hiddenSize_;

    return batchSize_ * num_flops / 1e9;
  }
};

int main(int argc, char *argv[]) {
  printf(
      "Usage: BERTLayerBench maxSequenceLength batchSize hiddenSize numHeads "
      "numCores "
      "numReps numAsyncLaunches backendStr dtypeStr useInt8FCs\n");
  printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
         "environment variable\n");
  benchParseGlowOpts(argc, argv);
  assert(argc == 11);
  size_t maxSequenceLength = atoi(argv[1]);
  size_t batchSize = atoi(argv[2]);
  size_t hiddenSize = atoi(argv[3]);
  size_t numHeads = atoi(argv[4]);
  size_t numCores = atoi(argv[5]);
  size_t numReps = atoi(argv[6]);
  size_t numAsyncLaunches = atoi(argv[7]);
  const char *backendStr = argv[8];
  const char *dtypeStr = argv[9];
  const char *useInt8FCs = argv[10];
  assert(numReps > 0);

  BERTProxyLayerBench b(maxSequenceLength, batchSize, hiddenSize, numHeads,
                        numCores, numAsyncLaunches, backendStr, dtypeStr,
                        useInt8FCs);

  auto times = bench(&b, numReps);
  printf("_,benchName,maxSequenceLength,batchSize,hiddenSize,numHeads,numCores,"
         "numReps,numAsyncLaunches,backendStr,dtypeStr,useInt8FCs,averageTime,"
         "averageGFLOP\n");
  for (auto t : times) {
    printf("BenchResult,BERTProxyLayerBench,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,"
           "%s,%f,%f\n",
           maxSequenceLength, batchSize, hiddenSize, numHeads, numCores,
           numReps, numAsyncLaunches, backendStr, dtypeStr, useInt8FCs,
           t / numAsyncLaunches, b.gflops() * numAsyncLaunches / t);
  }
  double min = *(std::min_element(times.begin(), times.end()));
  size_t midElt = times.size() / 2;
  std::nth_element(times.begin(), times.begin() + midElt, times.end());
  double median = times[midElt];
  double median_runtime = median / ((double)numAsyncLaunches);
  double min_runtime = min / ((double)numAsyncLaunches);
  printf("Total gflop: %f\n", b.gflops());
  printf("_,benchName,maxSequenceLength,batchSize,hiddenSize,numHeads,numCores,"
         "numReps,numAsyncLaunches,backendStr,dtypeStr,useInt8FCs,"
         "medianRuntime,minRuntime,medianGFLOPS,minGFLOPS\n");
  printf("BenchSummary,BERTProxyLayerBench,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,"
         "%s,%f,%f,%f,%f\n",
         maxSequenceLength, batchSize, hiddenSize, numHeads, numCores, numReps,
         numAsyncLaunches, backendStr, dtypeStr, useInt8FCs, median_runtime,
         min_runtime, b.gflops() / median_runtime, b.gflops() / min_runtime);
}