/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

namespace {
llvm::cl::OptionCategory category("ResNetBench Options");

llvm::cl::opt<std::string> backend("backend", llvm::cl::desc("Backend to use"),
                                   llvm::cl::Optional,
                                   llvm::cl::init("Interpreter"),
                                   llvm::cl::cat(category));
llvm::cl::opt<unsigned>
    numReps("numReps", llvm::cl::desc("Number of benchmark repetitions"),
            llvm::cl::init(1), llvm::cl::value_desc("N"),
            llvm::cl::cat(category));

llvm::cl::opt<unsigned> batchSize("batchSize",
                                  llvm::cl::desc("Image batch size"),
                                  llvm::cl::init(1), llvm::cl::value_desc("N"),
                                  llvm::cl::cat(category));

llvm::cl::opt<unsigned> baseSize("baseSize",
                                 llvm::cl::desc("Image H and W initial size"),
                                 llvm::cl::init(224),
                                 llvm::cl::value_desc("N"),
                                 llvm::cl::cat(category));

llvm::cl::opt<unsigned>
    numBins("numBins", llvm::cl::desc("Number of image sizes to create"),
            llvm::cl::init(1), llvm::cl::value_desc("N"),
            llvm::cl::cat(category));

llvm::cl::opt<unsigned> stepSize(
    "stepSize", llvm::cl::desc("Difference between each dimension in bins"),
    llvm::cl::init(10), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<unsigned> replicationCount(
    "replicationCount", llvm::cl::desc("Network replication count"),
    llvm::cl::init(1), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<bool>
    saturateHost("saturateHost",
                 llvm::cl::desc("Saturate the host's devices with each network"),
                 llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<bool> convertToFP16("convertToFP16",
                                  llvm::cl::desc("Convert the model to FP16"),
                                  llvm::cl::init(true),
                                  llvm::cl::cat(category));

llvm::cl::opt<bool>
    fpEverywhere("fpEverywhere",
                 llvm::cl::desc("Run the model in fp instead of quantized"),
                 llvm::cl::init(false), llvm::cl::cat(category));

llvm::cl::opt<bool> dumpDAG("dumpDAG",
                            llvm::cl::desc("Dump the final glow graph"),
                            llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<unsigned> numDevices("numDevices",
                                   llvm::cl::desc("Number of backend devices"),
                                   llvm::cl::init(1),
                                   llvm::cl::value_desc("N"),
                                   llvm::cl::cat(category));

llvm::cl::opt<unsigned> maxActiveRequests(
    "maxActiveRequests", llvm::cl::desc("Maximum active Glow requests"),
    llvm::cl::init(250), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<unsigned> numBatches("numBatches",
                                   llvm::cl::desc("Number of batches to run"),
                                   llvm::cl::init(10),
                                   llvm::cl::value_desc("N"),
                                   llvm::cl::cat(category));

llvm::cl::opt<unsigned>
    numRequesters("numRequesters", llvm::cl::desc("Number of request threads"),
                  llvm::cl::init(1), llvm::cl::value_desc("N"),
                  llvm::cl::cat(category));

llvm::cl::opt<int>
    logEvery("logEvery",
             llvm::cl::desc("Log every N requests on the first thread"),
             llvm::cl::init(1000), llvm::cl::value_desc("N"),
             llvm::cl::cat(category));

llvm::cl::opt<unsigned> numCompileThreads(
    "numCompileThreads",
    llvm::cl::desc("Number of threads to use for compilation"),
    llvm::cl::init(1), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<bool>
    avgPool("avgPool",
            llvm::cl::desc("Add quantized AdaptiveAvgPool node to the graph. "
                           "If fpEverywhere then the node will also be fp."),
            llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<bool>
    avgPoolFP("avgPoolFP",
              llvm::cl::desc("Add fp AdaptiveAvgPool node to the graph."),
              llvm::cl::init(false), llvm::cl::cat(category));

enum class Block {
  Bottleneck,
  BasicBlock,
};

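// Output-channel expansion factor of a residual block (4x for Bottleneck,
// 1x for BasicBlock), matching the torchvision ResNet convention.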
unsigned_t getExpansion(Block block) {
  switch (block) {
  case Block::Bottleneck:
    return 4;
  case Block::BasicBlock:
    return 1;
  }
  LOG(FATAL) << "Unsupported block";
}

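// Builds a ResNet/ResNeXt-style Glow graph: a conv stem, four residual
// stages, and optional adaptive average pooling. Convolution weights are
// synthetic constants so that every built function has identical weights.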
class ResNetBuilder {
private:
  Function *F_ = nullptr;
  // Hack to distinguish the weights of each convolution to prevent constant
  // weight sharing within a graph but enable weight sharing across graphs.
  unsigned_t nextFilterValue_ = 1;

  const Block block_;
  const unsigned_t groups_;
  const unsigned_t widthPerGroup_;
  const unsigned_t dilation_ = 1;
  const unsigned_t inPlanes_ = 64;
  const std::vector<unsigned_t> layers_;

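  // Creates a convolution with freshly created constant filter/bias. Emits a
  // float conv when `fp` is set (or --fpEverywhere), otherwise a
  // channelwise-quantized int8 conv with a fixed dummy output scale.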
  NodeValue createConv(NodeValue input, unsigned_t outChannels,
                       unsigned_t kernel, unsigned_t stride = 1,
                       unsigned_t pad = 0, unsigned_t dilation = 1,
                       unsigned_t groups = 1, bool fp = false) {

    if (fpEverywhere) {
      fp = true;
    }

    ShapeNHWC inputShape(input.dims());

    assert(inputShape.c % groups == 0);
    assert(outChannels % groups == 0);

    auto *filter = F_->getParent()->createConstant(
        ElemKind::FloatTy, {outChannels, kernel, kernel, inputShape.c / groups},
        "filter");

    size_t fanIn = kernel * kernel * inputShape.c;
    filter->getPayloadMutable().init(Tensor::InitKind::Xavier, fanIn,
                                     F_->getParent()->getPRNG());

    // Need to be constant so that all networks have the same weights
    filter->getPayloadMutable().init(Tensor::InitKind::Broadcast,
                                     float(20 + nextFilterValue_++),
                                     F_->getParent()->getPRNG());

    auto bias = F_->getParent()->createConstant(ElemKind::FloatTy,
                                                {outChannels}, "bias");
    bias->getPayloadMutable().init(Tensor::InitKind::Broadcast,
                                   float(20 + nextFilterValue_++),
                                   F_->getParent()->getPRNG());

    std::vector<unsigned_t> kernels = {kernel, kernel};
    std::vector<unsigned_t> strides = {stride, stride};
    std::vector<unsigned_t> pads = {pad, pad, pad, pad};
    std::vector<unsigned_t> dilations = {dilation, dilation};

    auto outSz = calculateConvPoolOutputDims(inputShape.h, inputShape.w,
                                             kernels, strides, pads, dilations);
    std::array<dim_t, 4> outDims = {
        {inputShape.n, outSz.first, outSz.second, outChannels}};

    if (fp) {
      auto *outTy = F_->getParent()->uniqueType(ElemKind::FloatTy, outDims);
      return F_
          ->createConv("conv", input, filter, bias, outTy, kernels, strides,
                       pads, groups, dilations)
          ->getResult();
    } else {
      auto *outTy =
          F_->getParent()->uniqueType(ElemKind::Int8QTy, outDims, 1.0, 0);

      return F_
          ->createChannelwiseQuantizedConv(
              "conv", input, filter, bias, /*filterScales*/ nullptr,
              /*filterOffsets*/ nullptr, /*biasScales*/ nullptr,
              /*biasOffsets*/ nullptr, outTy, kernels, strides, pads, groups,
              dilations,
              /*quantizeFilter*/ true, /*quantizeBias*/ false,
              /*schema*/ quantization::Schema::Symmetric)
          ->getResult();
    }
  }

  NodeValue conv3x3(NodeValue input, unsigned_t outPlanes,
                    unsigned_t stride = 1, unsigned_t groups = 1,
                    unsigned_t dilation = 1) {
    return createConv(input, outPlanes, /*kernel*/ 3, stride, /*pad*/ dilation,
                      dilation, groups);
  }

  NodeValue conv1x1(NodeValue input, unsigned_t outPlanes,
                    unsigned_t stride = 1) {
    return createConv(input, outPlanes, /*kernel*/ 1, stride);
  }

  NodeValue createRelu(NodeValue input) {
    return F_->createRELU("relu", input)->getResult();
  }

  NodeValue createAdd(NodeValue lhs, NodeValue rhs) {
    if (isQuantizedElemKind(lhs.getElementType())) {
      return F_->createAdd("qadd", lhs.getType(), lhs, rhs);
    } else {
      return F_->createAdd("add", lhs, rhs);
    }
  }

  NodeValue createBN(NodeValue input) {
    // Emulate fused Conv + BN
    auto inputKind = input.getNode()->getKind();
    if (inputKind == Kinded::Kind::ConvolutionNodeKind ||
        inputKind == Kinded::Kind::ChannelwiseQuantizedConvolutionNodeKind) {
      return input;
    }
    LOG(FATAL)
        << "Fake batchnorm op has to be after a convolution to emulate fusion";
  }

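  // Adaptive average pool down to a 1x1 spatial output, keeping the input's
  // element type (quantized or float).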
  NodeValue makeAvgPool(NodeValue input) {
    auto inputDims = input.dims();
    auto *outTy = F_->getParent()->uniqueTypeWithNewShape(
        input.getType(), {inputDims[0], 1, 1, inputDims[3]});
    return F_->createAdaptiveAvgPool("adaptive_avg_pool", input, outTy)
        ->getResult();
  }

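  // Emits one residual block: Bottleneck (1x1 -> 3x3 -> 1x1) or BasicBlock
  // (3x3 -> 3x3), followed by the residual add and a final ReLU.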
  NodeValue makeBlock(NodeValue input, NodeValue residual, unsigned_t planes,
                      unsigned_t stride = 1, unsigned_t groups = 1,
                      unsigned_t baseWidth = 64, unsigned_t dilation = 1) {
    unsigned_t expansion = getExpansion(block_);
    NodeValue next = input;
    if (block_ == Block::Bottleneck) {
      auto width = unsigned_t(planes * (baseWidth / 64.0)) * groups;
      next = conv1x1(next, width);
      next = createBN(next);
      next = createRelu(next);

      next = conv3x3(next, width, stride, groups, dilation);
      next = createBN(next);
      next = createRelu(next);

      next = conv1x1(next, planes * expansion);
      next = createBN(next);
      next = createAdd(next, residual);
      next = createRelu(next);

    } else if (block_ == Block::BasicBlock) {
      next = conv3x3(next, planes, stride);
      next = createBN(next);
      next = createRelu(next);

      next = conv3x3(next, planes);
      next = createBN(next);
      next = createAdd(next, residual);
      next = createRelu(next);
    } else {
      LOG(FATAL) << "Unknown block";
    }
    return next;
  }

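  // Emits one ResNet stage of `blocks` residual blocks. The first block uses
  // the given stride and projects the residual through a strided 1x1 conv
  // when the stride or channel count changes; the remaining blocks use
  // stride 1.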
  NodeValue makeLayer(NodeValue input, unsigned_t planes, unsigned_t blocks,
                      unsigned_t stride) {
    NodeValue next = input;
    NodeValue residual = next;
    auto blockExpansion = getExpansion(block_);
    if (stride != 1 || inPlanes_ != planes * blockExpansion) {
      residual = conv1x1(next, planes * blockExpansion, stride);
    }
    next = makeBlock(next, residual, planes, stride, groups_, widthPerGroup_,
                     dilation_);

    for (unsigned_t i = 1; i < blocks; ++i) {
      residual = next;
      next = makeBlock(next, residual, planes, /*stride*/ 1, groups_,
                       widthPerGroup_, dilation_);
    }

    return next;
  }

public:
  ResNetBuilder(Block block, llvm::ArrayRef<unsigned_t> layers,
                unsigned_t groups = 1, unsigned_t widthPerGroup = 64)
      : block_(block), groups_(groups), widthPerGroup_(widthPerGroup),
        layers_(layers.vec()) {}

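  // Builds the full network into F: NCHW->NHWC transpose, fp 7x7 stem conv,
  // max pool, the four residual stages (quantized unless --fpEverywhere, with
  // quantize/dequantize around them), optional average pooling, and a
  // transpose back to NCHW. Returns the placeholder of the final Save node.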
  Placeholder *build(Placeholder *input, Function *F) {
    F_ = F;
    nextFilterValue_ = 1;
    NodeValue next = input->getOutput();
    next = F_->createTranspose("NCHW2NHWC", next, NCHW2NHWC);
    next =
        createConv(next, /*outChannels*/ inPlanes_, /*kernel*/ 7, /*stride*/ 2,
                   /*pad*/ 3, /*dilation*/ 1, /*groups*/ 1, /*fp*/ true);
    next = createBN(next);
    next = createRelu(next);
    next = F_->createMaxPool("maxpool", next, /*kernel*/ 3, /*stride*/ 2,
                             /*pad*/ 1)
               ->getResult();
    if (!fpEverywhere) {
      next = F_->createQuantize("quant", next, ElemKind::Int8QTy, 1.0, 0);
    }
    next = makeLayer(next, /*planes*/ 64, /*blocks*/ layers_[0],
                     /*stride*/ 1);

    next = makeLayer(next, /*planes*/ 128, /*blocks*/ layers_[1],
                     /*stride*/ 2);

    next = makeLayer(next, /*planes*/ 256, /*blocks*/ layers_[2],
                     /*stride*/ 2);

    next = makeLayer(next, /*planes*/ 512, /*blocks*/ layers_[3],
                     /*stride*/ 2);
    if (avgPool) {
      next = makeAvgPool(next);
    }
    if (!fpEverywhere) {
      next =
          F_->createDequantize("dequant", next, ElemKind::FloatTy)->getResult();
    }
    if (avgPoolFP) {
      next = makeAvgPool(next);
    }
    next = F_->createTranspose("NHWC2NCHW", next, NHWC2NCHW);
    Placeholder *output = F_->createSave("save", next)->getPlaceholder();
    F_ = nullptr;
    return output;
  }
};

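// Name and I/O placeholders of one compiled network.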
struct FunctionBundle {
  std::string name;
  Placeholder *input;
  Placeholder *output;
};

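// ResNeXt-101 32x4d configuration: Bottleneck blocks with layer counts
// {3, 4, 23, 3}, 32 groups, and a width of 4 per group.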
ResNetBuilder resnext101_32x4d() {
  return ResNetBuilder(Block::Bottleneck, {3, 4, 23, 3},
                       /*groups*/ 32,
                       /*widthPerGroup*/ 4);
}

// ResNetBuilder resnet50() {
//   return ResNetBuilder(Block::Bottleneck, {3, 4, 6, 3});
// }

/*
 * This class implements a performance proxy for ResNet-like models
 */
class ResNetBench : public Benchmark {
private:
  ResNetBuilder builder_;
  std::vector<ShapeNCHW> shapes_;
  std::string backendName_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  std::vector<FunctionBundle> bundles_;
  int64_t compilationTime_;

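  // Builds one Function per requested input shape into `mod` and records its
  // name and I/O placeholders.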
  std::vector<FunctionBundle> makeNetworks(ResNetBuilder builder, Module &mod,
                                           llvm::ArrayRef<ShapeNCHW> shapes) {
    std::vector<FunctionBundle> res;
    for (const auto &shape : shapes) {
      std::string shapeStr =
          strFormat("%dx%dx%dx%d", int(shape.n), int(shape.c), int(shape.h),
                    int(shape.w));
      auto *F = mod.createFunction(strFormat("F_%s", shapeStr.c_str()));
      Placeholder *input = mod.createPlaceholder(
          ElemKind::FloatTy, {shape.n, shape.c, shape.h, shape.w}, "input",
          false);
      Placeholder *output = builder.build(input, F);
      FunctionBundle bundle;
      bundle.name = F->getName().str();
      bundle.input = input;
      bundle.output = output;
      res.push_back(std::move(bundle));
    }
    return res;
  }

public:
  ResNetBench(ResNetBuilder builder, llvm::ArrayRef<ShapeNCHW> shapes,
              std::string backendName)
      : builder_(builder), shapes_(shapes.vec()), backendName_(backendName) {}

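  // Creates a HostManager over numDevices devices, partitions the shapes
  // across numCompileThreads modules, builds and compiles the networks in
  // parallel, and finally runs a couple of warmup passes over all bundles.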
  void setup() override {
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    for (unsigned_t i = 0; i < numDevices; ++i) {
      auto config = std::make_unique<runtime::DeviceConfig>(backendName_);
      config->deviceID = i;
      configs.push_back(std::move(config));
    }

    glow::runtime::HostConfig hostConfig;
    hostConfig.maxActiveRequests = maxActiveRequests;

    hostManager_ =
        std::make_unique<runtime::HostManager>(std::move(configs), hostConfig);

    const auto numCompileThreadsToUse =
        std::min(size_t(numCompileThreads), shapes_.size());

    // Divide Functions up for compilation threads
    LOG(INFO) << "Building networks";
    std::vector<std::unique_ptr<Module>> modules;
    for (size_t i = 0; i < numCompileThreadsToUse; ++i) {
      auto mod = std::make_unique<Module>();
      const auto beginIt =
          shapes_.begin() + ((shapes_.size() / numCompileThreadsToUse) * i);
      const auto endIt =
          i == numCompileThreadsToUse - 1
              ? shapes_.end()
              : shapes_.begin() +
                    ((shapes_.size() / numCompileThreadsToUse) * (i + 1));
      std::vector<ShapeNCHW> threadShapes{beginIt, endIt};
      auto bundles = makeNetworks(builder_, *mod, threadShapes);
      for (auto &bundle : bundles) {
        bundles_.push_back(std::move(bundle));
      }
      modules.push_back(std::move(mod));
    }

    auto compileFn = [this](std::unique_ptr<Module> mod) {
      glow::CompilationContext cctx;
      cctx.replicationCount = replicationCount;
      cctx.saturateHost = saturateHost;
      cctx.precisionConfig.convertToFP16 = convertToFP16;
      cctx.dumpFinalGraph = dumpDAG;
      hostManager_->addNetwork(std::move(mod), cctx);
    };

    // Compile modules in parallel
    LOG(INFO) << "Compiling networks";
    int64_t compilationStartTime = TraceEvent::now();
    std::vector<std::thread> threads;
    for (size_t i = 1; i < numCompileThreadsToUse; ++i) {
      auto mod = std::move(modules[i]);
      std::thread t(compileFn, std::move(mod));
      threads.push_back(std::move(t));
    }

    compileFn(std::move(modules[0]));

    for (auto &t : threads) {
      t.join();
    }

    int64_t compilationEndTime = TraceEvent::now();
    compilationTime_ = compilationEndTime - compilationStartTime;

    // Run a few warmups
    LOG(INFO) << "Running warmups";
    runImpl(2 * bundles_.size());
  }

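  // Issues numRuns blocking requests, cycling through the compiled bundles.
  // threadNum offsets the starting bundle so that requester threads are
  // spread across the different networks.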
  void runImpl(unsigned_t numRuns, int32_t threadNum = -1) {
    std::unique_ptr<ExecutionContext> ctx =
        glow::make_unique<ExecutionContext>();

    auto *bindings = ctx->getPlaceholderBindings();

    for (unsigned_t i = 0; i < numRuns; i++) {
      if (logEvery > 0 && i > 0 && threadNum == 0 && i % logEvery == 0) {
        LOG(INFO) << "Thread 0 reached request " << i;
      }
      // Add threadNum to offset the threads
      auto nextBundleNum = (std::max(threadNum, 0) + i) % bundles_.size();
      const auto &bundle = bundles_[nextBundleNum];
      bindings->allocate(bundle.input);
      bindings->allocate(bundle.output);
      auto err = hostManager_->runNetworkBlocking(bundle.name, ctx);
    }
  }

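  // Spawns numRequesters threads, each issuing numBatches / numRequesters
  // requests, then reports throughput and the benchmark configuration.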
  void run() override {
    std::vector<std::thread> threads;
    unsigned_t reqsPerThread = numBatches / numRequesters;
    unsigned_t numReqs = numRequesters * reqsPerThread;

    LOG(INFO) << "Running";
    int64_t startTime = TraceEvent::now();
    for (unsigned_t i = 0; i < numRequesters; ++i) {
      threads.push_back(std::thread(
          [this, reqsPerThread, i]() { runImpl(reqsPerThread, i); }));
    }

    for (auto &thread : threads) {
      thread.join();
    }
    int64_t endTime = TraceEvent::now();
    int64_t totalTimeMs = (endTime - startTime) / 1000;

    std::cout << "Total runtime: " << totalTimeMs << "ms" << std::endl;
    if (totalTimeMs > 0) {
      std::cout << "Avg requests/second: "
                << numReqs / (double(totalTimeMs) / 1000) << std::endl;
      std::cout << "Avg images/second: "
                << (batchSize * numReqs) / (double(totalTimeMs) / 1000)
                << std::endl;
      std::cout << "Avg runtime per request: " << double(totalTimeMs) / numReqs
                << "ms" << std::endl;
    }
    std::cout << "numBins: " << numBins << std::endl;
    std::cout << "baseSize: " << baseSize << "x" << baseSize << std::endl;
    std::cout << "batchSize: " << batchSize << std::endl;
    std::cout << "stepSize: " << stepSize << std::endl;
    std::cout << "replicationCount: " << replicationCount << std::endl;
    std::cout << "numDevices: " << numDevices << std::endl;
    std::cout << "numRequesters: " << numRequesters << std::endl;
    std::cout << "compilation time: " << compilationTime_ / 1000 << "ms"
              << std::endl;
  }

  void teardown() override { LOG(INFO) << "Teardown"; }
};

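// Generates numBins NCHW shapes starting from baseSize x baseSize and
// alternately growing H and W by stepSize.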
std::vector<ShapeNCHW> generateShapes(dim_t batchSize, dim_t baseSize,
                                      dim_t numBins, dim_t stepSize) {
  assert(numBins > 0);
  std::vector<ShapeNCHW> shapes;
  shapes.emplace_back(batchSize, 3, baseSize, baseSize);

  ShapeNCHW hStepped(batchSize, 3, baseSize, baseSize);
  ShapeNCHW wStepped(batchSize, 3, baseSize, baseSize);
  for (dim_t i = 1; i < numBins; ++i) {
    if (i % 2 == 0) {
      hStepped.h += stepSize;
      shapes.push_back(hStepped);
    } else {
      wStepped.w += stepSize;
      shapes.push_back(wStepped);
    }
  }
  return shapes;
}
} // namespace

int main(int argc, char *argv[]) {
  llvm::cl::ParseCommandLineOptions(argc, argv, "ResNet benchmark");

  CHECK(!avgPool || !avgPoolFP) << "avgPool and avgPoolFP can't both be true "
                                   "or pooling will occur twice";

  std::vector<ShapeNCHW> shapes =
      generateShapes(batchSize, baseSize, numBins, stepSize);
  auto builder = resnext101_32x4d();
  ResNetBench b(builder, shapes, backend);

  bench(&b, numReps);
}