/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

/*
 * This class implements a performance proxy for a single layer of
 * the BERT network.
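 *
 * Per core, the proxy builds a QKV projection, multi-headed scaled
 * dot-product attention, an output projection, and a two-layer feed-forward
 * block with GELU, with layer normalization applied after the attention and
 * FFN blocks. Unlike a full BERT encoder layer, the residual (skip)
 * connections appear to be omitted, so this is a performance proxy rather
 * than a numerical reference.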
 */
class BERTProxyLayerBench : public Benchmark {
  dim_t maxSequenceLength_;
  dim_t batchSize_;
  dim_t hiddenSize_;
  dim_t numHeads_;
  dim_t numCores_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  std::vector<std::unique_ptr<ExecutionContext>> contexts_;
  dim_t asyncLaunchSize_;
  const char *backendStr_;
  ElemKind dtype_;
  ElemKind FCWeightType_;
  ElemKind FCBiasType_;
  float FCWeightScale_;
  int32_t FCWeightOffset_;
  bool quantize;

public:
  BERTProxyLayerBench(dim_t maxSequenceLength_, dim_t batchSize_,
                      dim_t hiddenSize_, dim_t numHeads_, dim_t numCores_,
                      dim_t asyncLaunchSize_, const char *backendStr_,
                      const char *dtypeStr_, const char *useInt8FCs)
      : maxSequenceLength_(maxSequenceLength_), batchSize_(batchSize_),
        hiddenSize_(hiddenSize_), numHeads_(numHeads_), numCores_(numCores_),
        asyncLaunchSize_(asyncLaunchSize_), backendStr_(backendStr_) {

    // Default to Float16 for the activations and the FC weights/bias; any
    // unrecognized dtype string falls back to these defaults.
    dtype_ = ElemKind::Float16Ty;
    FCWeightType_ = ElemKind::Float16Ty;
    FCBiasType_ = ElemKind::Float16Ty;
    FCWeightScale_ = 1.0f;
    FCWeightOffset_ = 0;
    quantize = false;
    if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      FCWeightType_ = ElemKind::FloatTy;
      FCBiasType_ = ElemKind::FloatTy;
    }
    // If quantization is requested, use Int8 weights and Int32 biases for the
    // FCs; createFC() below inserts the matching Quantize/Dequantize nodes.
    if (std::string(useInt8FCs) == "True") {
      FCWeightType_ = ElemKind::Int8QTy;
      FCBiasType_ = ElemKind::Int32QTy;
      quantize = true;
      FCWeightScale_ = 1.0f;
      FCWeightOffset_ = 128;
    }
  }

  // Fill \p tn with random values appropriate for its element type. The PRNG
  // is taken by reference so each call advances the module's RNG state.
  void randomizeTensor(Tensor *tn, PseudoRNG &rng) {
    if (tn->getElementType() == ElemKind::FloatTy) {
      tn->getHandle<float_t>().randomize(0.0f, 1.0f, rng);
    } else if (tn->getElementType() == ElemKind::Float16Ty) {
      tn->getHandle<float16_t>().randomize(0.0f, 1.0f, rng);
    } else if (tn->getElementType() == ElemKind::Int8QTy) {
      tn->getHandle<int8_t>().randomize(-127, 127, rng);
    } else if (tn->getElementType() == ElemKind::Int32QTy) {
      tn->getHandle<int32_t>().randomize(-128, 128, rng);
    }
  }

  // Set every element of \p tn to \p val, for any supported element type.
  void setTensor(Tensor *tn, float val) {
    if (tn->getElementType() == ElemKind::FloatTy) {
      tn->getHandle<float_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Float16Ty) {
      tn->getHandle<float16_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Int8QTy) {
      tn->getHandle<int8_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Int32QTy) {
      tn->getHandle<int32_t>().clear(val);
    }
  }

  Node *createFC(Function *fn, std::unique_ptr<Module> &mod, std::string name,
                 Node *In, Constant *W, Constant *b) {
    // When quantization is requested, quantize the FC input with a fixed
    // benchmark-only scale/offset, run the FC in Int8, and dequantize the
    // result back to dtype_.
    if (quantize) {
      TypeRef InQTy = mod->uniqueType(FCWeightType_, In->dims(0), 2.0, -128.0);
      auto *InQ = fn->createQuantize(name, In, InQTy);
      auto *FCQ = fn->createFullyConnected(name, InQ, W, b);
      TypeRef FCQTy = mod->uniqueType(dtype_, FCQ->dims(0));
      Node *FCO = fn->createDequantize(name, FCQ, FCQTy);
      return FCO;
    } else {
      return fn->createFullyConnected(name, In, W, b);
    }
  }

  void setup() override {

    // Create execution contexts here
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      std::unique_ptr<ExecutionContext> context(new ExecutionContext);
      contexts_.push_back(std::move(context));
    }

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
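    // Note: only a single device is configured here; numCores_ is modeled
    // below by splitting the graph into per-core data-parallel chunks rather
    // than by adding more devices.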

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

    // Input Placeholder ((maxSequenceLength*batchSize) x hiddenSize) (split)
    Placeholder *input = mod->createPlaceholder(
        dtype_, {maxSequenceLength_ * batchSize_, hiddenSize_}, "input", false);

    // For each context, allocate and randomize the input tensor
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      randomizeTensor(contexts_[i]->getPlaceholderBindings()->allocate(input),
                      mod->getPRNG());
    }

    // Weights/bias constants for QKV GEMM
    Tensor W_QKV_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, 3 * hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, 3 * hiddenSize_});
    randomizeTensor(&W_QKV_Tensor, mod->getPRNG());
    Constant *W_QKV = mod->createConstant("W_QKV", W_QKV_Tensor);
    Tensor b_QKV_Tensor = (quantize) ? Tensor(FCBiasType_, {3 * hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {3 * hiddenSize_});
    setTensor(&b_QKV_Tensor, 0.0f);
    Constant *b_QKV = mod->createConstant("b_QKV", b_QKV_Tensor);

    // Weights/bias constants for ZxWo FC
    Tensor W_ZWO_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, hiddenSize_});
    randomizeTensor(&W_ZWO_Tensor, mod->getPRNG());
    Constant *W_ZWO = mod->createConstant("W_ZWO", W_ZWO_Tensor);
    Tensor b_ZWO_Tensor = (quantize) ? Tensor(FCBiasType_, {hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {hiddenSize_});
    randomizeTensor(&b_ZWO_Tensor, mod->getPRNG());
    Constant *b_ZWO = mod->createConstant("b_ZWO", b_ZWO_Tensor);

    // Constant scaling factor
    float sqrt_dk_flt =
        (float)(1.0 / std::sqrt(((double)hiddenSize_) / ((double)numHeads_)));
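    // Illustrative example: with hiddenSize_ = 1024 and numHeads_ = 16
    // (BERT-large-like sizes), the per-head dimension dk is 64, so the
    // attention scores are scaled by 1 / sqrt(64) = 0.125.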

    // Softmax expected output. Not needed for inference
    Tensor expected_Tensor(ElemKind::Int64ITy,
                           {maxSequenceLength_ * batchSize_, 1});
    Constant *expected = mod->createConstant("expected", expected_Tensor);

    // Weights/bias constants for FC1
    Tensor W_FC1_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, 4 * hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, 4 * hiddenSize_});
    randomizeTensor(&W_FC1_Tensor, mod->getPRNG());
    Constant *W_FC1 = mod->createConstant("W_FC1", W_FC1_Tensor);
    Tensor b_FC1_Tensor = (quantize) ? Tensor(FCBiasType_, {4 * hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {4 * hiddenSize_});
    randomizeTensor(&b_FC1_Tensor, mod->getPRNG());
    Constant *b_FC1 = mod->createConstant("b_FC1", b_FC1_Tensor);

    // Weights/bias constants for FC2
    Tensor W_FC2_Tensor =
        (quantize) ? Tensor(FCWeightType_, {4 * hiddenSize_, hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {4 * hiddenSize_, hiddenSize_});
    randomizeTensor(&W_FC2_Tensor, mod->getPRNG());
    Constant *W_FC2 = mod->createConstant("W_FC2", W_FC2_Tensor);
    Tensor b_FC2_Tensor = (quantize) ? Tensor(FCBiasType_, {hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {hiddenSize_});
    randomizeTensor(&b_FC2_Tensor, mod->getPRNG());
    Constant *b_FC2 = mod->createConstant("b_FC2", b_FC2_Tensor);

    // batchSizePerCore is the number of sentences assigned to each
    // core (each data-parallel chunk)
    auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);
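    // For example, a batch of 3 sentences on 2 cores might be split as
    // {2, 1}; cores that receive zero sentences are skipped below.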

    // rowSizePerCore is the number of tokens assigned to each
    // core (each data-parallel chunk)
    dim_t numNonzeroCores = 0;
    std::vector<dim_t> rowSizePerCore;
    for (dim_t i = 0; i < batchSizePerCore.size(); i++) {
      if (batchSizePerCore[i] > 0) {
        rowSizePerCore.push_back(batchSizePerCore[i] * maxSequenceLength_);
        numNonzeroCores++;
      }
    }

    // Split the batch across cores in a data-parallel fashion
    std::vector<SliceNode *> inputs(numNonzeroCores);
    std::vector<SaveNode *> S(numNonzeroCores);

    // Split the input rows across the non-empty cores
    fn->createSplit("DPsplit", input, numNonzeroCores, 0, rowSizePerCore,
                    inputs);

    // For each core (sub-batch), create a network which does one layer
    for (int core = 0; core < int(numNonzeroCores); core++) {

      // Layer Norm 1 bias and scale
      Tensor LN1_scale_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN1_scale_Tensor, mod->getPRNG());
      Constant *LN1_scale = mod->createConstant("LN1_scale", LN1_scale_Tensor);
      Tensor LN1_bias_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN1_bias_Tensor, mod->getPRNG());
      Constant *LN1_bias = mod->createConstant("LN1_bias", LN1_bias_Tensor);

      // Layer Norm 2 bias and scale
      Tensor LN2_scale_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN2_scale_Tensor, mod->getPRNG());
      Constant *LN2_scale = mod->createConstant("LN2_scale", LN2_scale_Tensor);
      Tensor LN2_bias_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN2_bias_Tensor, mod->getPRNG());
      Constant *LN2_bias = mod->createConstant("LN2_bias", LN2_bias_Tensor);

      // QKV GEMM
      auto *QKV = createFC(fn, mod, strFormat("Gemm_QKV_core%d", core),
                           inputs[core], W_QKV, b_QKV);

      // Split into Q, K, V
      std::vector<SliceNode *> outputs(3);
      fn->createSplit(strFormat("split_core%d", core), QKV, 3, 1, {}, outputs);
      SliceNode *Q = outputs[0];
      SliceNode *K = outputs[1];
      SliceNode *V = outputs[2];
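      // After splitting the QKV output on dimension 1, Q, K, and V each have
      // shape (batchSizePerCore[core] * maxSequenceLength_) x hiddenSize_.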

      // Multi-headed attention split; each per-head slice has
      // hiddenSize_ / numHeads_ columns.
      std::vector<SliceNode *> Qsplits(numHeads_);
      std::vector<SliceNode *> Ksplits(numHeads_);
      std::vector<SliceNode *> Vsplits(numHeads_);
      std::vector<NodeValue> Zsplits(numHeads_);
      fn->createSplit(strFormat("splitQ_core%d", core), Q, numHeads_, 1, {},
                      Qsplits);
      fn->createSplit(strFormat("splitK_core%d", core), K, numHeads_, 1, {},
                      Ksplits);
      fn->createSplit(strFormat("splitV_core%d", core), V, numHeads_, 1, {},
                      Vsplits);

      for (int i = 0; i < int(numHeads_); i++) {
        // Split the subbatch into individual sentences for the
        // batch matmul
        std::vector<SliceNode *> QBatchSplits(batchSizePerCore[core]);
        std::vector<SliceNode *> KBatchSplits(batchSizePerCore[core]);
        std::vector<SliceNode *> VBatchSplits(batchSizePerCore[core]);
        std::vector<NodeValue> ZBatchSplits(batchSizePerCore[core]);

        fn->createSplit(strFormat("splitBatchQ_core%d", core), Qsplits[i],
                        batchSizePerCore[core], 0, {}, QBatchSplits);
        fn->createSplit(strFormat("splitBatchK_core%d", core), Ksplits[i],
                        batchSizePerCore[core], 0, {}, KBatchSplits);
        fn->createSplit(strFormat("splitBatchV_core%d", core), Vsplits[i],
                        batchSizePerCore[core], 0, {}, VBatchSplits);

        // Batched matmul over the sentences in this sub-batch: for each
        // sentence compute Z = softmax(Q * K^T / sqrt(dk)) * V.
        for (int b = 0; b < int(batchSizePerCore[core]); b++) {

          auto *Kt =
              fn->createTranspose(strFormat("transpose_core%d_%d", core, i),
                                  KBatchSplits[b], {1, 0});
          // Tmp = Q * K^T
          auto *tmp =
              fn->createMatMul(strFormat("matmul_Q_KT_core%d_%d", core, i),
                               QBatchSplits[b], Kt->getResult());

          // Softmax_output = softmax(Tmp / sqrt(dk))
          auto *sqrt_dk_splat =
              fn->createSplat(strFormat("sqrt_dk_core%d_%d", core, i),
                              tmp->getResult().getType(), sqrt_dk_flt);
          auto *tmp_div = fn->createMul(strFormat("div_core%d_%d", core, i),
                                        tmp, sqrt_dk_splat);
          auto *softmax_output = fn->createSoftMax(
              strFormat("softmax_core%d_%d", core, i), tmp_div, expected);

          ZBatchSplits[b] =
              fn->createMatMul(strFormat("matmul_tmp_v_core%d_%d", core, i),
                               softmax_output, VBatchSplits[b]);
        }

        // Concatenate all the Z matrices for the whole subbatch
        Zsplits[i] =
            fn->createConcat(strFormat("concat_core%d", core), ZBatchSplits, 0);
      }

      // Concatenate all the Z matrices that we previously split on the hidden
      // dimension
      auto *Z = fn->createConcat(strFormat("concat_core%d", core), Zsplits, 1);
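      // Z now has shape (batchSizePerCore[core] * maxSequenceLength_) x
      // hiddenSize_, matching the input slice for this core.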

      // Z x W_o
      auto *ZWO = createFC(fn, mod, strFormat("Gemm_ZWO_core%d", core), Z,
                           W_ZWO, b_ZWO);

      // Layer norm
      auto *ZWO_norm = fn->createLayerNormalization(
          strFormat("LayerNorm1_core%d", core), ZWO->getNthResult(0).getType(),
          ZWO, LN1_scale, LN1_bias, 1e-5);

      // FC1
      auto *FC1 = createFC(fn, mod, strFormat("Gemm_FC1_core%d", core),
                           ZWO_norm, W_FC1, b_FC1);

      // Create gelu
      auto *FC1_gelu = fn->createGELU(strFormat("GELU_FC1_core%d", core), FC1);

      // FC2
      auto *FC2 = createFC(fn, mod, strFormat("Gemm_FC2_core%d", core),
                           FC1_gelu, W_FC2, b_FC2);

      // Layer norm
      auto *FC2_norm = fn->createLayerNormalization(
          strFormat("LayerNorm2_core%d", core), FC2->getNthResult(0).getType(),
          FC2, LN2_scale, LN2_bias, 1e-5);

      // Save result
      S[core] = fn->createSave(strFormat("save_core%d", core), FC2_norm);
      for (int i = 0; i < int(asyncLaunchSize_); i++) {
        contexts_[i]->getPlaceholderBindings()->allocate(
            S[core]->getPlaceholder());
      }
    } // For each core

    // Special case for batchSize == 1: there is only one data-parallel chunk,
    // so split the FC weights vertically across cores (model parallelism)
    // instead.
    if ((batchSize_ == 1) && (numCores_ > 1)) {
      executeVerticalFCWeightsSplit(fn, numCores_, hiddenSize_);
    }

    CompilationContext ctx;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
    fn->dumpDAG(std::string("BERT.dot"));
  }

  void run() override {
    std::vector<std::unique_ptr<ExecutionContext>> localContexts(
        asyncLaunchSize_);
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch asyncLaunchSize_ requests in parallel; each completion callback
    // hands its context back and signals the matching promise.
    int i = 0;
    for (auto &promise : promises) {
      futures.push_back(promise.get_future());
      hostManager_->runNetwork(
          "singleNode", std::move(contexts_[i]),
          [&localContexts, &promise,
           i](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
            EXIT_ON_ERR(std::move(err));
            localContexts[i] = std::move(contextPtr);
            promise.set_value();
          });
      i++;
    }
    for (auto &fut : futures) {
      fut.wait();
    }
    for (dim_t j = 0; j < asyncLaunchSize_; j++) {
      contexts_[j] = std::move(localContexts[j]);
    }
  }

  void teardown() override {}

  // FLOP count for the GEMMs only (2*m*k*n per m x k x n GEMM); num_flops is
  // accumulated per sentence and scaled by batchSize_ at the end. Element-wise
  // ops, softmax, and the layer norms are not counted.
  double gflops() const {
    double num_flops = 0.0;

    // QKV
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 3 * hiddenSize_;
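    // Illustrative example: with maxSequenceLength_ = 128 and
    // hiddenSize_ = 768, the QKV GEMM alone is
    // 2 * 128 * 768 * 2304 ~= 0.45 GFLOP per sentence.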

    // BMM: Q * K^T and softmax(...) * V, summed over all heads
    num_flops += 2.0 * hiddenSize_ * maxSequenceLength_ * maxSequenceLength_;
    num_flops += 2.0 * hiddenSize_ * maxSequenceLength_ * maxSequenceLength_;

    // ZWO
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * hiddenSize_;

    // FC1
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 4 * hiddenSize_;

    // FC2
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 4 * hiddenSize_;

    return batchSize_ * num_flops / 1e9;
  }
};

int main(int argc, char *argv[]) {
  printf(
      "Usage: BERTLayerBench maxSequenceLength batchSize hiddenSize numHeads "
      "numCores "
      "numReps numAsyncLaunches backendStr dtypeStr useInt8FCs\n");
  printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
         "environment variable\n");
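  // Example invocation (illustrative values; the actual binary name depends on
  // the build setup):
  //   ./BERTLayerBench 128 16 1024 16 1 10 1 Interpreter Float16 False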
  benchParseGlowOpts(argc, argv);
  assert(argc == 11);
  size_t maxSequenceLength = atoi(argv[1]);
  size_t batchSize = atoi(argv[2]);
  size_t hiddenSize = atoi(argv[3]);
  size_t numHeads = atoi(argv[4]);
  size_t numCores = atoi(argv[5]);
  size_t numReps = atoi(argv[6]);
  size_t numAsyncLaunches = atoi(argv[7]);
  const char *backendStr = argv[8];
  const char *dtypeStr = argv[9];
  const char *useInt8FCs = argv[10];
  assert(numReps > 0);

  BERTProxyLayerBench b(maxSequenceLength, batchSize, hiddenSize, numHeads,
                        numCores, numAsyncLaunches, backendStr, dtypeStr,
                        useInt8FCs);

  auto times = bench(&b, numReps);
  printf("_,benchName,maxSequenceLength,batchSize,hiddenSize,numHeads,numCores,"
         "numReps,"
         "numAsyncLaunches,"
         "backendStr,dtypeStr,useInt8FCs,averageTime,averageGFLOP\n");
  for (auto t : times) {
    printf("BenchResult,BERTProxyLayerBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%"
           "s,%s,%f,%f\n",
           maxSequenceLength, batchSize, hiddenSize, numHeads, numCores,
           numReps, numAsyncLaunches, backendStr, dtypeStr, useInt8FCs,
           t / numAsyncLaunches, b.gflops() * numAsyncLaunches / t);
  }
  double min = *(std::min_element(times.begin(), times.end()));
  size_t midElt = times.size() / 2;
  std::nth_element(times.begin(), times.begin() + midElt, times.end());
  double median = times[midElt];
  double median_runtime = median / ((double)numAsyncLaunches);
  double min_runtime = min / ((double)numAsyncLaunches);
  printf("_,benchName,maxSequenceLength,batchSize,hiddenSize,numHeads,numCores,"
         "numReps,"
         "numAsyncLaunches,"
         "backendStr,dtypeStr,useInt8FCs,medianRuntime,minRuntime,medianGFLOPS,"
         "minGFLOPS\n");
  printf("Total gflop: %f\n", b.gflops());
  printf("BenchSummary,BERTProxyLayerBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%"
         "s,%s,"
         "%f,%f,%f,%"
         "f\n",
         maxSequenceLength, batchSize, hiddenSize, numHeads, numCores, numReps,
         numAsyncLaunches, backendStr, dtypeStr, useInt8FCs, median_runtime,
         min_runtime, b.gflops() / median_runtime, b.gflops() / min_runtime);
}