/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

namespace {
llvm::cl::OptionCategory category("ResNetBench Options");

llvm::cl::opt<std::string> backend("backend", llvm::cl::desc("Backend to use"),
                                   llvm::cl::Optional,
                                   llvm::cl::init("Interpreter"),
                                   llvm::cl::cat(category));
llvm::cl::opt<unsigned>
    numReps("numReps", llvm::cl::desc("Number of benchmark repetitions"),
            llvm::cl::init(1), llvm::cl::value_desc("N"),
            llvm::cl::cat(category));

llvm::cl::opt<unsigned> batchSize("batchSize",
                                  llvm::cl::desc("Image batch size"),
                                  llvm::cl::init(1), llvm::cl::value_desc("N"),
                                  llvm::cl::cat(category));

llvm::cl::opt<unsigned> baseSize("baseSize",
                                 llvm::cl::desc("Image H and W initial size"),
                                 llvm::cl::init(224),
                                 llvm::cl::value_desc("N"),
                                 llvm::cl::cat(category));

llvm::cl::opt<unsigned>
    numBins("numBins", llvm::cl::desc("Number of image sizes to create"),
            llvm::cl::init(1), llvm::cl::value_desc("N"),
            llvm::cl::cat(category));

llvm::cl::opt<unsigned> stepSize(
    "stepSize", llvm::cl::desc("Difference between each dimension in bins"),
    llvm::cl::init(10), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<unsigned> replicationCount(
    "replicationCount", llvm::cl::desc("Network replication count"),
    llvm::cl::init(1), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<bool>
    saturateHost("saturateHost",
                 llvm::cl::desc("Saturate the host's devices with each network"),
                 llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<bool> convertToFP16("convertToFP16",
                                  llvm::cl::desc("Convert the model to FP16"),
                                  llvm::cl::init(true),
                                  llvm::cl::cat(category));

llvm::cl::opt<bool>
    fpEverywhere("fpEverywhere",
                 llvm::cl::desc("Run the model in fp instead of quantized"),
                 llvm::cl::init(false), llvm::cl::cat(category));

llvm::cl::opt<bool> dumpDAG("dumpDAG",
                            llvm::cl::desc("Dump the final glow graph"),
                            llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<unsigned> numDevices("numDevices",
                                   llvm::cl::desc("Number of backend devices"),
                                   llvm::cl::init(1),
                                   llvm::cl::value_desc("N"),
                                   llvm::cl::cat(category));

llvm::cl::opt<unsigned> maxActiveRequests(
    "maxActiveRequests", llvm::cl::desc("Maximum active Glow requests"),
    llvm::cl::init(250), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<unsigned> numBatches("numBatches",
                                   llvm::cl::desc("Number of batches to run"),
                                   llvm::cl::init(10),
                                   llvm::cl::value_desc("N"),
                                   llvm::cl::cat(category));

llvm::cl::opt<unsigned>
    numRequesters("numRequesters", llvm::cl::desc("Number of request threads"),
                  llvm::cl::init(1), llvm::cl::value_desc("N"),
                  llvm::cl::cat(category));

llvm::cl::opt<int>
    logEvery("logEvery",
             llvm::cl::desc("Log every N requests on the first thread"),
             llvm::cl::init(1000), llvm::cl::value_desc("N"),
             llvm::cl::cat(category));

llvm::cl::opt<unsigned> numCompileThreads(
    "numCompileThreads",
    llvm::cl::desc("Number of threads to use for compilation"),
    llvm::cl::init(1), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<bool>
    avgPool("avgPool",
            llvm::cl::desc("Add quantized AdaptiveAvgPool node to the graph. "
                           "If fpEverywhere then the node will also be fp."),
            llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<bool>
    avgPoolFP("avgPoolFP",
              llvm::cl::desc("Add fp AdaptiveAvgPool node to the graph."),
              llvm::cl::init(false), llvm::cl::cat(category));

enum class Block {
  Bottleneck,
  BasicBlock,
};

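// Output-channel expansion factor of a residual block (4x for Bottleneck,
// 1x for BasicBlock), matching the torchvision ResNet convention.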
unsigned_t getExpansion(Block block) {
  switch (block) {
  case Block::Bottleneck:
    return 4;
  case Block::BasicBlock:
    return 1;
  }
  LOG(FATAL) << "Unsupported block";
}

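// Builds a ResNet/ResNeXt-style Glow graph: a conv stem, four residual
// stages, and optional adaptive average pooling. Convolution weights are
// synthetic constants so that every built function has identical weights.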
class ResNetBuilder {
private:
  Function *F_ = nullptr;
  // Hack to distinguish the weights of each convolution to prevent constant
  // weight sharing within a graph but enable weight sharing across graphs.
  unsigned_t nextFilterValue_ = 1;

  const Block block_;
  const unsigned_t groups_;
  const unsigned_t widthPerGroup_;
  const unsigned_t dilation_ = 1;
  const unsigned_t inPlanes_ = 64;
  const std::vector<unsigned_t> layers_;

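  // Creates a convolution with freshly created constant filter/bias. Emits a
  // float conv when `fp` is set (or --fpEverywhere), otherwise a
  // channelwise-quantized int8 conv with a fixed dummy output scale.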
  NodeValue createConv(NodeValue input, unsigned_t outChannels,
                       unsigned_t kernel, unsigned_t stride = 1,
                       unsigned_t pad = 0, unsigned_t dilation = 1,
                       unsigned_t groups = 1, bool fp = false) {

    if (fpEverywhere) {
      fp = true;
    }

    ShapeNHWC inputShape(input.dims());

    assert(inputShape.c % groups == 0);
    assert(outChannels % groups == 0);

    auto *filter = F_->getParent()->createConstant(
        ElemKind::FloatTy, {outChannels, kernel, kernel, inputShape.c / groups},
        "filter");

    size_t fanIn = kernel * kernel * inputShape.c;
    filter->getPayloadMutable().init(Tensor::InitKind::Xavier, fanIn,
                                     F_->getParent()->getPRNG());

    // Need to be constant so that all networks have the same weights
    filter->getPayloadMutable().init(Tensor::InitKind::Broadcast,
                                     float(20 + nextFilterValue_++),
                                     F_->getParent()->getPRNG());

    auto bias = F_->getParent()->createConstant(ElemKind::FloatTy,
                                                {outChannels}, "bias");
    bias->getPayloadMutable().init(Tensor::InitKind::Broadcast,
                                   float(20 + nextFilterValue_++),
                                   F_->getParent()->getPRNG());

    std::vector<unsigned_t> kernels = {kernel, kernel};
    std::vector<unsigned_t> strides = {stride, stride};
    std::vector<unsigned_t> pads = {pad, pad, pad, pad};
    std::vector<unsigned_t> dilations = {dilation, dilation};

    auto outSz = calculateConvPoolOutputDims(inputShape.h, inputShape.w,
                                             kernels, strides, pads, dilations);
    std::array<dim_t, 4> outDims = {
        {inputShape.n, outSz.first, outSz.second, outChannels}};

    if (fp) {
      auto *outTy = F_->getParent()->uniqueType(ElemKind::FloatTy, outDims);
      return F_
          ->createConv("conv", input, filter, bias, outTy, kernels, strides,
                       pads, groups, dilations)
          ->getResult();
    } else {
      auto *outTy =
          F_->getParent()->uniqueType(ElemKind::Int8QTy, outDims, 1.0, 0);

      return F_
          ->createChannelwiseQuantizedConv(
              "conv", input, filter, bias, /*filterScales*/ nullptr,
              /*filterOffsets*/ nullptr, /*biasScales*/ nullptr,
              /*biasOffsets*/ nullptr, outTy, kernels, strides, pads, groups,
              dilations,
              /*quantizeFilter*/ true, /*quantizeBias*/ false,
              /*schema*/ quantization::Schema::Symmetric)
          ->getResult();
    }
  }

  NodeValue conv3x3(NodeValue input, unsigned_t outPlanes,
                    unsigned_t stride = 1, unsigned_t groups = 1,
                    unsigned_t dilation = 1) {
    return createConv(input, outPlanes, /*kernel*/ 3, stride, /*pad*/ dilation,
                      dilation, groups);
  }

  NodeValue conv1x1(NodeValue input, unsigned_t outPlanes,
                    unsigned_t stride = 1) {
    return createConv(input, outPlanes, /*kernel*/ 1, stride);
  }

  NodeValue createRelu(NodeValue input) {
    return F_->createRELU("relu", input)->getResult();
  }

  NodeValue createAdd(NodeValue lhs, NodeValue rhs) {
    if (isQuantizedElemKind(lhs.getElementType())) {
      return F_->createAdd("qadd", lhs.getType(), lhs, rhs);
    } else {
      return F_->createAdd("add", lhs, rhs);
    }
  }

  NodeValue createBN(NodeValue input) {
    // Emulate fused Conv + BN
    auto inputKind = input.getNode()->getKind();
    if (inputKind == Kinded::Kind::ConvolutionNodeKind ||
        inputKind == Kinded::Kind::ChannelwiseQuantizedConvolutionNodeKind) {
      return input;
    }
    LOG(FATAL)
        << "Fake batchnorm op has to be after a convolution to emulate fusion";
  }

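  // Adaptive average pool down to a 1x1 spatial output, keeping the input's
  // element type (quantized or float).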
  NodeValue makeAvgPool(NodeValue input) {
    auto inputDims = input.dims();
    auto *outTy = F_->getParent()->uniqueTypeWithNewShape(
        input.getType(), {inputDims[0], 1, 1, inputDims[3]});
    return F_->createAdaptiveAvgPool("adaptive_avg_pool", input, outTy)
        ->getResult();
  }

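  // Emits one residual block: Bottleneck (1x1 -> 3x3 -> 1x1) or BasicBlock
  // (3x3 -> 3x3), followed by the residual add and a final ReLU.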
  NodeValue makeBlock(NodeValue input, NodeValue residual, unsigned_t planes,
                      unsigned_t stride = 1, unsigned_t groups = 1,
                      unsigned_t baseWidth = 64, unsigned_t dilation = 1) {
    unsigned_t expansion = getExpansion(block_);
    NodeValue next = input;
    if (block_ == Block::Bottleneck) {
      auto width = unsigned_t(planes * (baseWidth / 64.0)) * groups;
      next = conv1x1(next, width);
      next = createBN(next);
      next = createRelu(next);

      next = conv3x3(next, width, stride, groups, dilation);
      next = createBN(next);
      next = createRelu(next);

      next = conv1x1(next, planes * expansion);
      next = createBN(next);
      next = createAdd(next, residual);
      next = createRelu(next);

    } else if (block_ == Block::BasicBlock) {
      next = conv3x3(next, planes, stride);
      next = createBN(next);
      next = createRelu(next);

      next = conv3x3(next, planes);
      next = createBN(next);
      next = createAdd(next, residual);
      next = createRelu(next);
    } else {
      LOG(FATAL) << "Unknown block";
    }
    return next;
  }

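  // Emits one ResNet stage of `blocks` residual blocks. The first block uses
  // the given stride and projects the residual through a strided 1x1 conv
  // when the stride or channel count changes; the remaining blocks use
  // stride 1.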
  NodeValue makeLayer(NodeValue input, unsigned_t planes, unsigned_t blocks,
                      unsigned_t stride) {
    NodeValue next = input;
    NodeValue residual = next;
    auto blockExpansion = getExpansion(block_);
    if (stride != 1 || inPlanes_ != planes * blockExpansion) {
      residual = conv1x1(next, planes * blockExpansion, stride);
    }
    next = makeBlock(next, residual, planes, stride, groups_, widthPerGroup_,
                     dilation_);

    for (unsigned_t i = 1; i < blocks; ++i) {
      residual = next;
      next = makeBlock(next, residual, planes, /*stride*/ 1, groups_,
                       widthPerGroup_, dilation_);
    }

    return next;
  }

public:
  ResNetBuilder(Block block, llvm::ArrayRef<unsigned_t> layers,
                unsigned_t groups = 1, unsigned_t widthPerGroup = 64)
      : block_(block), groups_(groups), widthPerGroup_(widthPerGroup),
        layers_(layers.vec()) {}

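  // Builds the full network into F: NCHW->NHWC transpose, fp 7x7 stem conv,
  // max pool, the four residual stages (quantized unless --fpEverywhere, with
  // quantize/dequantize around them), optional average pooling, and a
  // transpose back to NCHW. Returns the placeholder of the final Save node.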
  Placeholder *build(Placeholder *input, Function *F) {
    F_ = F;
    nextFilterValue_ = 1;
    NodeValue next = input->getOutput();
    next = F_->createTranspose("NCHW2NHWC", next, NCHW2NHWC);
    next =
        createConv(next, /*outChannels*/ inPlanes_, /*kernel*/ 7, /*stride*/ 2,
                   /*pad*/ 3, /*dilation*/ 1, /*groups*/ 1, /*fp*/ true);
    next = createBN(next);
    next = createRelu(next);
    next = F_->createMaxPool("maxpool", next, /*kernel*/ 3, /*stride*/ 2,
                             /*pad*/ 1)
               ->getResult();
    if (!fpEverywhere) {
      next = F_->createQuantize("quant", next, ElemKind::Int8QTy, 1.0, 0);
    }
    next = makeLayer(next, /*planes*/ 64, /*blocks*/ layers_[0],
                     /*stride*/ 1);

    next = makeLayer(next, /*planes*/ 128, /*blocks*/ layers_[1],
                     /*stride*/ 2);

    next = makeLayer(next, /*planes*/ 256, /*blocks*/ layers_[2],
                     /*stride*/ 2);

    next = makeLayer(next, /*planes*/ 512, /*blocks*/ layers_[3],
                     /*stride*/ 2);
    if (avgPool) {
      next = makeAvgPool(next);
    }
    if (!fpEverywhere) {
      next =
          F_->createDequantize("dequant", next, ElemKind::FloatTy)->getResult();
    }
    if (avgPoolFP) {
      next = makeAvgPool(next);
    }
    next = F_->createTranspose("NHWC2NCHW", next, NHWC2NCHW);
    Placeholder *output = F_->createSave("save", next)->getPlaceholder();
    F_ = nullptr;
    return output;
  }
};

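// Name and I/O placeholders of one compiled network.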
struct FunctionBundle {
  std::string name;
  Placeholder *input;
  Placeholder *output;
};

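// ResNeXt-101 32x4d configuration: Bottleneck blocks with layer counts
// {3, 4, 23, 3}, 32 groups, and a width of 4 per group.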
ResNetBuilder resnext101_32x4d() {
  return ResNetBuilder(Block::Bottleneck, {3, 4, 23, 3},
                       /*groups*/ 32,
                       /*widthPerGroup*/ 4);
}

// ResNetBuilder resnet50() {
//   return ResNetBuilder(Block::Bottleneck, {3, 4, 6, 3});
// }

/*
 * This class implements a performance proxy for ResNet-like models
 */
class ResNetBench : public Benchmark {
private:
  ResNetBuilder builder_;
  std::vector<ShapeNCHW> shapes_;
  std::string backendName_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  std::vector<FunctionBundle> bundles_;
  int64_t compilationTime_;

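  // Builds one Function per requested input shape into `mod` and records its
  // name and I/O placeholders.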
  std::vector<FunctionBundle> makeNetworks(ResNetBuilder builder, Module &mod,
                                           llvm::ArrayRef<ShapeNCHW> shapes) {
    std::vector<FunctionBundle> res;
    for (const auto &shape : shapes) {
      std::string shapeStr =
          strFormat("%dx%dx%dx%d", int(shape.n), int(shape.c), int(shape.h),
                    int(shape.w));
      auto *F = mod.createFunction(strFormat("F_%s", shapeStr.c_str()));
      Placeholder *input = mod.createPlaceholder(
          ElemKind::FloatTy, {shape.n, shape.c, shape.h, shape.w}, "input",
          false);
      Placeholder *output = builder.build(input, F);
      FunctionBundle bundle;
      bundle.name = F->getName().str();
      bundle.input = input;
      bundle.output = output;
      res.push_back(std::move(bundle));
    }
    return res;
  }

public:
  ResNetBench(ResNetBuilder builder, llvm::ArrayRef<ShapeNCHW> shapes,
              std::string backendName)
      : builder_(builder), shapes_(shapes.vec()), backendName_(backendName) {}

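  // Creates a HostManager over numDevices devices, partitions the shapes
  // across numCompileThreads modules, builds and compiles the networks in
  // parallel, and finally runs a couple of warmup passes over all bundles.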
  void setup() override {
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    for (unsigned_t i = 0; i < numDevices; ++i) {
      auto config = std::make_unique<runtime::DeviceConfig>(backendName_);
      config->deviceID = i;
      configs.push_back(std::move(config));
    }

    glow::runtime::HostConfig hostConfig;
    hostConfig.maxActiveRequests = maxActiveRequests;

    hostManager_ =
        std::make_unique<runtime::HostManager>(std::move(configs), hostConfig);

    const auto numCompileThreadsToUse =
        std::min(size_t(numCompileThreads), shapes_.size());

    // Divide Functions up for compilation threads
    LOG(INFO) << "Building networks";
    std::vector<std::unique_ptr<Module>> modules;
    for (size_t i = 0; i < numCompileThreadsToUse; ++i) {
      auto mod = std::make_unique<Module>();
      const auto beginIt =
          shapes_.begin() + ((shapes_.size() / numCompileThreadsToUse) * i);
      const auto endIt =
          i == numCompileThreadsToUse - 1
              ? shapes_.end()
              : shapes_.begin() +
                    ((shapes_.size() / numCompileThreadsToUse) * (i + 1));
      std::vector<ShapeNCHW> threadShapes{beginIt, endIt};
      auto bundles = makeNetworks(builder_, *mod, threadShapes);
      for (auto &bundle : bundles) {
        bundles_.push_back(std::move(bundle));
      }
      modules.push_back(std::move(mod));
    }

    auto compileFn = [this](std::unique_ptr<Module> mod) {
      glow::CompilationContext cctx;
      cctx.replicationCount = replicationCount;
      cctx.saturateHost = saturateHost;
      cctx.precisionConfig.convertToFP16 = convertToFP16;
      cctx.dumpFinalGraph = dumpDAG;
      hostManager_->addNetwork(std::move(mod), cctx);
    };

    // Compile modules in parallel
    LOG(INFO) << "Compiling networks";
    int64_t compilationStartTime = TraceEvent::now();
    std::vector<std::thread> threads;
    for (size_t i = 1; i < numCompileThreadsToUse; ++i) {
      auto mod = std::move(modules[i]);
      std::thread t(compileFn, std::move(mod));
      threads.push_back(std::move(t));
    }

    compileFn(std::move(modules[0]));

    for (auto &t : threads) {
      t.join();
    }

    int64_t compilationEndTime = TraceEvent::now();
    compilationTime_ = compilationEndTime - compilationStartTime;

    // Run a few warmups
    LOG(INFO) << "Running warmups";
    runImpl(2 * bundles_.size());
  }

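  // Issues numRuns blocking requests, cycling through the compiled bundles.
  // threadNum offsets the starting bundle so that requester threads are
  // spread across the different networks.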
  void runImpl(unsigned_t numRuns, int32_t threadNum = -1) {
    std::unique_ptr<ExecutionContext> ctx =
        glow::make_unique<ExecutionContext>();

    auto *bindings = ctx->getPlaceholderBindings();

    for (unsigned_t i = 0; i < numRuns; i++) {
      if (logEvery > 0 && i > 0 && threadNum == 0 && i % logEvery == 0) {
        LOG(INFO) << "Thread 0 reached request " << i;
      }
      // Add threadNum to offset the threads
      auto nextBundleNum = (std::max(threadNum, 0) + i) % bundles_.size();
      const auto &bundle = bundles_[nextBundleNum];
      bindings->allocate(bundle.input);
      bindings->allocate(bundle.output);
      auto err = hostManager_->runNetworkBlocking(bundle.name, ctx);
    }
  }

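  // Spawns numRequesters threads, each issuing numBatches / numRequesters
  // requests, then reports throughput and the benchmark configuration.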
  void run() override {
    std::vector<std::thread> threads;
    unsigned_t reqsPerThread = numBatches / numRequesters;
    unsigned_t numReqs = numRequesters * reqsPerThread;

    LOG(INFO) << "Running";
    int64_t startTime = TraceEvent::now();
    for (unsigned_t i = 0; i < numRequesters; ++i) {
      threads.push_back(std::thread(
          [this, reqsPerThread, i]() { runImpl(reqsPerThread, i); }));
    }

    for (auto &thread : threads) {
      thread.join();
    }
    int64_t endTime = TraceEvent::now();
    int64_t totalTimeMs = (endTime - startTime) / 1000;

    std::cout << "Total runtime: " << totalTimeMs << "ms" << std::endl;
    if (totalTimeMs > 0) {
      std::cout << "Avg requests/second: "
                << numReqs / (double(totalTimeMs) / 1000) << std::endl;
      std::cout << "Avg images/second: "
                << (batchSize * numReqs) / (double(totalTimeMs) / 1000)
                << std::endl;
      std::cout << "Avg runtime per request: " << double(totalTimeMs) / numReqs
                << "ms" << std::endl;
    }
    std::cout << "numBins: " << numBins << std::endl;
    std::cout << "baseSize: " << baseSize << "x" << baseSize << std::endl;
    std::cout << "batchSize: " << batchSize << std::endl;
    std::cout << "stepSize: " << stepSize << std::endl;
    std::cout << "replicationCount: " << replicationCount << std::endl;
    std::cout << "numDevices: " << numDevices << std::endl;
    std::cout << "numRequesters: " << numRequesters << std::endl;
    std::cout << "compilation time: " << compilationTime_ / 1000 << "ms"
              << std::endl;
  }

  void teardown() override { LOG(INFO) << "Teardown"; }
};

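// Generates numBins NCHW shapes starting from baseSize x baseSize and
// alternately growing H and W by stepSize.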
std::vector<ShapeNCHW> generateShapes(dim_t batchSize, dim_t baseSize,
                                      dim_t numBins, dim_t stepSize) {
  assert(numBins > 0);
  std::vector<ShapeNCHW> shapes;
  shapes.emplace_back(batchSize, 3, baseSize, baseSize);

  ShapeNCHW hStepped(batchSize, 3, baseSize, baseSize);
  ShapeNCHW wStepped(batchSize, 3, baseSize, baseSize);
  for (dim_t i = 1; i < numBins; ++i) {
    if (i % 2 == 0) {
      hStepped.h += stepSize;
      shapes.push_back(hStepped);
    } else {
      wStepped.w += stepSize;
      shapes.push_back(wStepped);
    }
  }
  return shapes;
}
} // namespace

int main(int argc, char *argv[]) {
  llvm::cl::ParseCommandLineOptions(argc, argv, "ResNet benchmark");

  CHECK(!avgPool || !avgPoolFP) << "avgPool and avgPoolFP can't both be true "
                                   "or pooling will occur twice";

  std::vector<ShapeNCHW> shapes =
      generateShapes(batchSize, baseSize, numBins, stepSize);
  auto builder = resnext101_32x4d();
  ResNetBench b(builder, shapes, backend);

  bench(&b, numReps);
}