/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

/*
 * This class implements a performance proxy for a single layer of
 * the BERT network.
 */
class BERTProxyLayerBench : public Benchmark {
  dim_t maxSequenceLength_;
  dim_t batchSize_;
  dim_t hiddenSize_;
  dim_t numHeads_;
  dim_t numCores_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  std::vector<std::unique_ptr<ExecutionContext>> contexts_;
  dim_t asyncLaunchSize_;
  const char *backendStr_;
  ElemKind dtype_;
  ElemKind FCWeightType_;
  ElemKind FCBiasType_;
  float FCWeightScale_;
  int32_t FCWeightOffset_;
  bool quantize;

public:
  BERTProxyLayerBench(dim_t maxSequenceLength_, dim_t batchSize_,
                      dim_t hiddenSize_, dim_t numHeads_, dim_t numCores_,
                      dim_t asyncLaunchSize_, const char *backendStr_,
                      const char *dtypeStr_, const char *useInt8FCs)
      : maxSequenceLength_(maxSequenceLength_), batchSize_(batchSize_),
        hiddenSize_(hiddenSize_), numHeads_(numHeads_), numCores_(numCores_),
        asyncLaunchSize_(asyncLaunchSize_), backendStr_(backendStr_) {

    // Default to Float16 everywhere. This also covers an unrecognized
    // dtypeStr_, so the FC weight/bias types are never left uninitialized.
    dtype_ = ElemKind::Float16Ty;
    FCWeightType_ = ElemKind::Float16Ty;
    FCBiasType_ = ElemKind::Float16Ty;
    FCWeightScale_ = 1.0f;
    FCWeightOffset_ = 0;
    quantize = false;
    if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      FCWeightType_ = ElemKind::FloatTy;
      FCBiasType_ = ElemKind::FloatTy;
    }
    // If quantization is requested then use Int8 weights and Int32 biases.
    if (std::string(useInt8FCs) == "True") {
      FCWeightType_ = ElemKind::Int8QTy;
      FCBiasType_ = ElemKind::Int32QTy;
      quantize = true;
      FCWeightScale_ = 1.0f;
      FCWeightOffset_ = 128;
    }
  }

  // Handle different tensor types. The RNG is taken by reference so its
  // state advances across calls and each tensor gets distinct values.
  void randomizeTensor(Tensor *tn, PseudoRNG &rng) {
    if (tn->getElementType() == ElemKind::FloatTy) {
      tn->getHandle<float_t>().randomize(0.0f, 1.0f, rng);
    } else if (tn->getElementType() == ElemKind::Float16Ty) {
      tn->getHandle<float16_t>().randomize(0.0f, 1.0f, rng);
    } else if (tn->getElementType() == ElemKind::Int8QTy) {
      tn->getHandle<int8_t>().randomize(-127, 127, rng);
    } else if (tn->getElementType() == ElemKind::Int32QTy) {
      tn->getHandle<int32_t>().randomize(-128, 128, rng);
    }
  }

  // Handle different tensor types
  void setTensor(Tensor *tn, float val) {
    if (tn->getElementType() == ElemKind::FloatTy) {
      tn->getHandle<float_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Float16Ty) {
      tn->getHandle<float16_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Int8QTy) {
      tn->getHandle<int8_t>().clear(val);
    } else if (tn->getElementType() == ElemKind::Int32QTy) {
      tn->getHandle<int32_t>().clear(val);
    }
  }

  Node *createFC(Function *fn, std::unique_ptr<Module> &mod,
                 const std::string &name, Node *In, Constant *W, Constant *b) {
    // Optionally add nodes for quantization of FCs
    if (quantize) {
      TypeRef InQTy = mod->uniqueType(FCWeightType_, In->dims(0), 2.0f, -128);
      auto *InQ = fn->createQuantize(name, In, InQTy);
      auto *FCQ = fn->createFullyConnected(name, InQ, W, b);
      TypeRef FCQTy = mod->uniqueType(dtype_, FCQ->dims(0));
      Node *FCO = fn->createDequantize(name, FCQ, FCQTy);
      return FCO;
    } else {
      return fn->createFullyConnected(name, In, W, b);
    }
  }
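
  // Sketch of what createFC builds in quantized mode: the FC is wrapped as
  //   In (dtype_) -> Quantize (Int8) -> FullyConnected (Int8 weights,
  //   Int32 bias) -> Dequantize (dtype_)
  // so the surrounding graph always sees floating-point tensors; only the
  // GEMM itself runs in int8.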

  void setup() override {

    // Create execution contexts here
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      std::unique_ptr<ExecutionContext> context(new ExecutionContext);
      contexts_.push_back(std::move(context));
    }

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

    // Input Placeholder ((maxSequenceLength*batchSize) x hiddenSize) (split)
    Placeholder *input = mod->createPlaceholder(
        dtype_, {maxSequenceLength_ * batchSize_, hiddenSize_}, "input", false);

    // For each context, add input bindings.
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      randomizeTensor(contexts_[i]->getPlaceholderBindings()->allocate(input),
                      mod->getPRNG());
    }

    // Weights/bias constants for QKV GEMM
    Tensor W_QKV_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, 3 * hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, 3 * hiddenSize_});
    randomizeTensor(&W_QKV_Tensor, mod->getPRNG());
    Constant *W_QKV = mod->createConstant("W_QKV", W_QKV_Tensor);
    Tensor b_QKV_Tensor = (quantize) ? Tensor(FCBiasType_, {3 * hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {3 * hiddenSize_});
    setTensor(&b_QKV_Tensor, 0.0f);
    Constant *b_QKV = mod->createConstant("b_QKV", b_QKV_Tensor);

    // Weights/bias constants for ZxWo FC
    Tensor W_ZWO_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, hiddenSize_});
    randomizeTensor(&W_ZWO_Tensor, mod->getPRNG());
    Constant *W_ZWO = mod->createConstant("W_ZWO", W_ZWO_Tensor);
    Tensor b_ZWO_Tensor = (quantize) ? Tensor(FCBiasType_, {hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {hiddenSize_});
    randomizeTensor(&b_ZWO_Tensor, mod->getPRNG());
    Constant *b_ZWO = mod->createConstant("b_ZWO", b_ZWO_Tensor);

    // Constant scaling factor
    float sqrt_dk_flt =
        (float)(1.0 / std::sqrt(((double)hiddenSize_) / ((double)numHeads_)));
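
    // For example, with a hypothetical hiddenSize_ = 1024 and numHeads_ = 16,
    // each head has d_k = 64, so sqrt_dk_flt = 1 / sqrt(64) = 0.125.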

    // Softmax expected output; not needed for inference.
    Tensor expected_Tensor(ElemKind::Int64ITy,
                           {maxSequenceLength_ * batchSize_, 1});
    Constant *expected = mod->createConstant("expected", expected_Tensor);

    // Weights/bias constants for FC1
    Tensor W_FC1_Tensor =
        (quantize) ? Tensor(FCWeightType_, {hiddenSize_, 4 * hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {hiddenSize_, 4 * hiddenSize_});
    randomizeTensor(&W_FC1_Tensor, mod->getPRNG());
    Constant *W_FC1 = mod->createConstant("W_FC1", W_FC1_Tensor);
    Tensor b_FC1_Tensor = (quantize) ? Tensor(FCBiasType_, {4 * hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {4 * hiddenSize_});
    randomizeTensor(&b_FC1_Tensor, mod->getPRNG());
    Constant *b_FC1 = mod->createConstant("b_FC1", b_FC1_Tensor);

    // Weights/bias constants for FC2
    Tensor W_FC2_Tensor =
        (quantize) ? Tensor(FCWeightType_, {4 * hiddenSize_, hiddenSize_},
                            FCWeightScale_, FCWeightOffset_)
                   : Tensor(FCWeightType_, {4 * hiddenSize_, hiddenSize_});
    randomizeTensor(&W_FC2_Tensor, mod->getPRNG());
    Constant *W_FC2 = mod->createConstant("W_FC2", W_FC2_Tensor);
    Tensor b_FC2_Tensor = (quantize) ? Tensor(FCBiasType_, {hiddenSize_},
                                              FCWeightScale_, FCWeightOffset_)
                                     : Tensor(FCBiasType_, {hiddenSize_});
    randomizeTensor(&b_FC2_Tensor, mod->getPRNG());
    Constant *b_FC2 = mod->createConstant("b_FC2", b_FC2_Tensor);

    // batchSizePerCore is the number of sentences assigned to each
    // core (each data-parallel chunk)
    auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);

    // rowSizePerCore is the number of tokens assigned to each
    // core (each data-parallel chunk)
    dim_t numNonzeroCores = 0;
    std::vector<dim_t> rowSizePerCore;
    for (dim_t i = 0; i < batchSizePerCore.size(); i++) {
      if (batchSizePerCore[i] > 0) {
        rowSizePerCore.push_back(batchSizePerCore[i] * maxSequenceLength_);
        numNonzeroCores++;
      }
    }
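
    // Example (assuming getBatchSizePerCore spreads the remainder over the
    // first cores): batchSize_ = 4, numCores_ = 3 gives batchSizePerCore =
    // {2, 1, 1}; with maxSequenceLength_ = 128 that yields rowSizePerCore =
    // {256, 128, 128} and numNonzeroCores = 3.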

    // Split the batch across cores in a data-parallel fashion
    std::vector<SliceNode *> inputs(numNonzeroCores);
    std::vector<SaveNode *> S(numNonzeroCores);

    // Split the input into one data-parallel chunk per active core.
    fn->createSplit("DPsplit", input, numNonzeroCores, 0, rowSizePerCore,
                    inputs);

    // For each core (sub-batch), create a network which does one layer
    for (int core = 0; core < int(numNonzeroCores); core++) {

      // Layer Norm 1 bias and scale
      Tensor LN1_scale_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN1_scale_Tensor, mod->getPRNG());
      Constant *LN1_scale = mod->createConstant("LN1_scale", LN1_scale_Tensor);
      Tensor LN1_bias_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN1_bias_Tensor, mod->getPRNG());
      Constant *LN1_bias = mod->createConstant("LN1_bias", LN1_bias_Tensor);

      // Layer Norm 2 bias and scale
      Tensor LN2_scale_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN2_scale_Tensor, mod->getPRNG());
      Constant *LN2_scale = mod->createConstant("LN2_scale", LN2_scale_Tensor);
      Tensor LN2_bias_Tensor(dtype_, {hiddenSize_});
      randomizeTensor(&LN2_bias_Tensor, mod->getPRNG());
      Constant *LN2_bias = mod->createConstant("LN2_bias", LN2_bias_Tensor);

      // QKV GEMM
      auto *QKV = createFC(fn, mod, strFormat("Gemm_QKV_core%d", core),
                           inputs[core], W_QKV, b_QKV);

      // Split into Q, K, V
      std::vector<SliceNode *> outputs(3);
      fn->createSplit(strFormat("split_core%d", core), QKV, 3, 1, {}, outputs);
      SliceNode *Q = outputs[0];
      SliceNode *K = outputs[1];
      SliceNode *V = outputs[2];

      // Multi-headed attention split
      std::vector<SliceNode *> Qsplits(numHeads_); // maxSequenceLength x 64
      std::vector<SliceNode *> Ksplits(numHeads_); // maxSequenceLength x 64
      std::vector<SliceNode *> Vsplits(numHeads_); // maxSequenceLength x 64
      std::vector<NodeValue> Zsplits(numHeads_);   // maxSequenceLength x 64
      fn->createSplit(strFormat("splitQ_core%d", core), Q, numHeads_, 1, {},
                      Qsplits);
      fn->createSplit(strFormat("splitK_core%d", core), K, numHeads_, 1, {},
                      Ksplits);
      fn->createSplit(strFormat("splitV_core%d", core), V, numHeads_, 1, {},
                      Vsplits);
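
      // Per head i and sentence b, the loop below computes standard scaled
      // dot-product attention:
      //   Z[i][b] = softmax((Q[i][b] * K[i][b]^T) * sqrt_dk_flt) * V[i][b]
      // The per-sentence results are concatenated back along the batch
      // dimension, then across heads along the hidden dimension.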

      for (int i = 0; i < int(numHeads_); i++) {
        // Split the subbatch into individual sentences for the
        // batch matmul
        std::vector<SliceNode *> QBatchSplits(batchSizePerCore[core]);
        std::vector<SliceNode *> KBatchSplits(batchSizePerCore[core]);
        std::vector<SliceNode *> VBatchSplits(batchSizePerCore[core]);
        std::vector<NodeValue> ZBatchSplits(batchSizePerCore[core]);

        fn->createSplit(strFormat("splitBatchQ_core%d", core), Qsplits[i],
                        batchSizePerCore[core], 0, {}, QBatchSplits);
        fn->createSplit(strFormat("splitBatchK_core%d", core), Ksplits[i],
                        batchSizePerCore[core], 0, {}, KBatchSplits);
        fn->createSplit(strFormat("splitBatchV_core%d", core), Vsplits[i],
                        batchSizePerCore[core], 0, {}, VBatchSplits);

        // BatchMatMul
        for (int b = 0; b < int(batchSizePerCore[core]); b++) {

          auto *Kt =
              fn->createTranspose(strFormat("transpose_core%d_%d", core, i),
                                  KBatchSplits[b], {1, 0});
          // Tmp = Q * K^T
          auto *tmp =
              fn->createMatMul(strFormat("matmul_Q_KT_core%d_%d", core, i),
                               QBatchSplits[b], Kt->getResult());

          // Softmax_output = softmax(Tmp / sqrt(dk))
          auto *sqrt_dk_splat =
              fn->createSplat(strFormat("sqrt_dk_core%d_%d", core, i),
                              tmp->getResult().getType(), sqrt_dk_flt);
          auto *tmp_div = fn->createMul(strFormat("div_core%d_%d", core, i),
                                        tmp, sqrt_dk_splat);
          auto *softmax_output = fn->createSoftMax(
              strFormat("softmax_core%d_%d", core, i), tmp_div, expected);

          ZBatchSplits[b] =
              fn->createMatMul(strFormat("matmul_tmp_v_core%d_%d", core, i),
                               softmax_output, VBatchSplits[b]);
        }

        // Concatenate all the Z matrices for the whole subbatch
        Zsplits[i] =
            fn->createConcat(strFormat("concat_core%d", core), ZBatchSplits, 0);
      }

      // Concatenate all the Z matrices that we previously split on the hidden
      // dimension
      auto *Z = fn->createConcat(strFormat("concat_core%d", core), Zsplits, 1);

      // Z x W_o
      auto *ZWO = createFC(fn, mod, strFormat("Gemm_ZWO_core%d", core), Z,
                           W_ZWO, b_ZWO);

      // Layer norm
      auto *ZWO_norm = fn->createLayerNormalization(
          strFormat("LayerNorm1_core%d", core), ZWO->getNthResult(0).getType(),
          ZWO, LN1_scale, LN1_bias, 1e-5);

      // FC1
      auto *FC1 = createFC(fn, mod, strFormat("Gemm_FC1_core%d", core),
                           ZWO_norm, W_FC1, b_FC1);

      // Create gelu
      auto *FC1_gelu = fn->createGELU(strFormat("GELU_FC1_core%d", core), FC1);

      // FC2
      auto *FC2 = createFC(fn, mod, strFormat("Gemm_FC2_core%d", core),
                           FC1_gelu, W_FC2, b_FC2);

      // Layer norm
      auto *FC2_norm = fn->createLayerNormalization(
          strFormat("LayerNorm2_core%d", core), FC2->getNthResult(0).getType(),
          FC2, LN2_scale, LN2_bias, 1e-5);

      // Save result
      S[core] = fn->createSave(strFormat("save_core%d", core), FC2_norm);
      for (int i = 0; i < int(asyncLaunchSize_); i++) {
        contexts_[i]->getPlaceholderBindings()->allocate(
            S[core]->getPlaceholder());
      }
    } // For each core

    // Special case for batch-1: use model parallelism for the FCs.
    if ((batchSize_ == 1) && (numCores_ > 1)) {
      executeVerticalFCWeightsSplit(fn, numCores_, hiddenSize_);
    }
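
    // Note: executeVerticalFCWeightsSplit slices each FC's weight matrix
    // column-wise into numCores_ chunks so a single sentence can still
    // occupy every core (model parallelism rather than data parallelism);
    // the third argument appears to set a minimum size threshold below
    // which an FC is left unsplit.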

    CompilationContext ctx;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
    fn->dumpDAG(std::string("BERT.dot"));
  }

  void run() override {
    std::vector<std::unique_ptr<ExecutionContext>> localContexts(
        asyncLaunchSize_);
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch a number of parallel requests
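    // Each of the asyncLaunchSize_ contexts is dispatched concurrently, so
    // bench() times the whole group; main() divides the measured time by
    // numAsyncLaunches to report per-request numbers.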
    int i = 0;
    for (auto &promise : promises) {
      futures.push_back(promise.get_future());
      hostManager_->runNetwork(
          "singleNode", std::move(contexts_[i]),
          [&localContexts, &promise,
           i](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
            EXIT_ON_ERR(std::move(err));
            localContexts[i] = std::move(contextPtr);
            promise.set_value();
          });
      i++;
    }
    for (auto &fut : futures) {
      fut.wait();
    }
    for (dim_t j = 0; j < asyncLaunchSize_; j++) {
      contexts_[j] = std::move(localContexts[j]);
    }
  }

  void teardown() override {}

  // Only counting GEMMs
  double gflops() const {
    double num_flops = 0.0;

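    // Worked example with hypothetical values maxSequenceLength_ = 128,
    // hiddenSize_ = 1024, numHeads_ = 16, batchSize_ = 1: QKV ~0.81 +
    // BMM ~0.07 + ZWO ~0.27 + FC1 ~1.07 + FC2 ~1.07, roughly 3.3 GFLOPs.
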
    // QKV
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 3 * hiddenSize_;

    // BMM
    num_flops += 2.0 * hiddenSize_ * maxSequenceLength_ * maxSequenceLength_;
    num_flops += 2.0 * hiddenSize_ * maxSequenceLength_ * maxSequenceLength_;

    // ZWO
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * hiddenSize_;

    // FC1
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 4 * hiddenSize_;

    // FC2
    num_flops += 2.0 * maxSequenceLength_ * hiddenSize_ * 4 * hiddenSize_;

    return batchSize_ * num_flops / 1e9;
  }
};

int main(int argc, char *argv[]) {
  printf(
      "Usage: BERTLayerBench maxSequenceLength batchSize hiddenSize numHeads "
      "numCores "
      "numReps numAsyncLaunches backendStr dtypeStr useInt8FCs\n");
  printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
         "environment variable\n");
  benchParseGlowOpts(argc, argv);
  assert(argc == 11);
  size_t maxSequenceLength = atoi(argv[1]);
  size_t batchSize = atoi(argv[2]);
  size_t hiddenSize = atoi(argv[3]);
  size_t numHeads = atoi(argv[4]);
  size_t numCores = atoi(argv[5]);
  size_t numReps = atoi(argv[6]);
  size_t numAsyncLaunches = atoi(argv[7]);
  const char *backendStr = argv[8];
  const char *dtypeStr = argv[9];
  const char *useInt8FCs = argv[10];
  assert(numReps > 0);

  BERTProxyLayerBench b(maxSequenceLength, batchSize, hiddenSize, numHeads,
                        numCores, numAsyncLaunches, backendStr, dtypeStr,
                        useInt8FCs);

  auto times = bench(&b, numReps);
  printf("_,benchName,maxSequenceLength,batchSize,hiddenSize,numHeads,numCores,"
         "numReps,numAsyncLaunches,backendStr,dtypeStr,useInt8FCs,averageTime,"
         "averageGFLOP\n");
  for (auto t : times) {
    printf("BenchResult,BERTProxyLayerBench,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,"
           "%s,%f,%f\n",
           maxSequenceLength, batchSize, hiddenSize, numHeads, numCores,
           numReps, numAsyncLaunches, backendStr, dtypeStr, useInt8FCs,
           t / numAsyncLaunches, b.gflops() * numAsyncLaunches / t);
  }
  double min = *(std::min_element(times.begin(), times.end()));
  size_t midElt = times.size() / 2;
  std::nth_element(times.begin(), times.begin() + midElt, times.end());
  double median = times[midElt];
  double median_runtime = median / ((double)numAsyncLaunches);
  double min_runtime = min / ((double)numAsyncLaunches);
  printf("Total gflop: %f\n", b.gflops());
  printf("_,benchName,maxSequenceLength,batchSize,hiddenSize,numHeads,numCores,"
         "numReps,numAsyncLaunches,backendStr,dtypeStr,useInt8FCs,"
         "medianRuntime,minRuntime,medianGFLOPS,minGFLOPS\n");
  printf("BenchSummary,BERTProxyLayerBench,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,"
         "%s,%f,%f,%f,%f\n",
         maxSequenceLength, batchSize, hiddenSize, numHeads, numCores, numReps,
         numAsyncLaunches, backendStr, dtypeStr, useInt8FCs, median_runtime,
         min_runtime, b.gflops() / median_runtime, b.gflops() / min_runtime);
}