/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
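
// ResNetBench builds ResNeXt-style networks at several input sizes, compiles
// them through the HostManager (optionally on several compile threads), and
// drives them from multiple requester threads to measure request throughput.
//
// Example invocation (a sketch; the binary name and flag values here are
// illustrative assumptions, not documented defaults):
//   ./ResNetBench -backend=Interpreter -batchSize=4 -numBins=3 \
//     -numRequesters=2 -numBatches=100
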
#include <algorithm>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

namespace {
llvm::cl::OptionCategory category("ResNetBench Options");

llvm::cl::opt<std::string> backend("backend", llvm::cl::desc("Backend to use"),
                                   llvm::cl::Optional,
                                   llvm::cl::init("Interpreter"),
                                   llvm::cl::cat(category));
llvm::cl::opt<unsigned>
    numReps("numReps", llvm::cl::desc("Number of benchmark repetitions"),
            llvm::cl::init(1), llvm::cl::value_desc("N"),
            llvm::cl::cat(category));

llvm::cl::opt<unsigned> batchSize("batchSize",
                                  llvm::cl::desc("Image batch size"),
                                  llvm::cl::init(1), llvm::cl::value_desc("N"),
                                  llvm::cl::cat(category));

llvm::cl::opt<unsigned> baseSize("baseSize",
                                 llvm::cl::desc("Image H and W initial size"),
                                 llvm::cl::init(224), llvm::cl::value_desc("N"),
                                 llvm::cl::cat(category));

llvm::cl::opt<unsigned>
    numBins("numBins", llvm::cl::desc("Number of image sizes to create"),
            llvm::cl::init(1), llvm::cl::value_desc("N"),
            llvm::cl::cat(category));

llvm::cl::opt<unsigned> stepSize(
    "stepSize", llvm::cl::desc("Difference between each dimension in bins"),
    llvm::cl::init(10), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<unsigned> replicationCount(
    "replicationCount",
    llvm::cl::desc("Number of times to replicate each network on a device"),
    llvm::cl::init(1), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<bool> saturateHost(
    "saturateHost",
    llvm::cl::desc("Replicate networks to saturate all available devices"),
    llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<bool> convertToFP16("convertToFP16",
                                  llvm::cl::desc("Convert the model to FP16"),
                                  llvm::cl::init(true),
                                  llvm::cl::cat(category));

llvm::cl::opt<bool>
    fpEverywhere("fpEverywhere",
                 llvm::cl::desc("Run the model in fp instead of quantized"),
                 llvm::cl::init(false), llvm::cl::cat(category));

llvm::cl::opt<bool> dumpDAG("dumpDAG",
                            llvm::cl::desc("Dump the final Glow graph"),
                            llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<unsigned> numDevices("numDevices",
                                   llvm::cl::desc("Number of backend devices"),
                                   llvm::cl::init(1), llvm::cl::value_desc("N"),
                                   llvm::cl::cat(category));

llvm::cl::opt<unsigned> maxActiveRequests(
    "maxActiveRequests", llvm::cl::desc("Maximum active Glow requests"),
    llvm::cl::init(250), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<unsigned> numBatches("numBatches",
                                   llvm::cl::desc("Number of batches to run"),
                                   llvm::cl::init(10),
                                   llvm::cl::value_desc("N"),
                                   llvm::cl::cat(category));

llvm::cl::opt<unsigned>
    numRequesters("numRequesters", llvm::cl::desc("Number of request threads"),
                  llvm::cl::init(1), llvm::cl::value_desc("N"),
                  llvm::cl::cat(category));

llvm::cl::opt<int>
    logEvery("logEvery",
             llvm::cl::desc("Log every N requests on first thread"),
             llvm::cl::init(1000), llvm::cl::value_desc("N"),
             llvm::cl::cat(category));

llvm::cl::opt<unsigned> numCompileThreads(
    "numCompileThreads",
    llvm::cl::desc("Number of threads to use for compilation"),
    llvm::cl::init(1), llvm::cl::value_desc("N"), llvm::cl::cat(category));

llvm::cl::opt<bool>
    avgPool("avgPool",
            llvm::cl::desc("Add quantized AdaptiveAvgPool node to the graph. "
                           "If fpEverywhere then the node will also be fp."),
            llvm::cl::init(true), llvm::cl::cat(category));

llvm::cl::opt<bool>
    avgPoolFP("avgPoolFP",
              llvm::cl::desc("Add fp AdaptiveAvgPool node to the graph."),
              llvm::cl::init(false), llvm::cl::cat(category));

enum class Block {
  Bottleneck,
  BasicBlock,
};

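// Per-block multiplier applied to the output channel count, mirroring
// torchvision's ResNet (Bottleneck: 4, BasicBlock: 1).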
unsigned_t getExpansion(Block block) {
  switch (block) {
  case Block::Bottleneck:
    return 4;
  case Block::BasicBlock:
    return 1;
  }
  LOG(FATAL) << "Unsupported block";
}

class ResNetBuilder {
private:
  Function *F_ = nullptr;
  // Counter used to give each convolution distinct constant weights. This
  // prevents constant-weight sharing within a graph while enabling weight
  // sharing across graphs built by this builder.
  unsigned_t nextFilterValue_ = 1;

  const Block block_;
  const unsigned_t groups_;
  const unsigned_t widthPerGroup_;
  const unsigned_t dilation_ = 1;
  const unsigned_t inPlanes_ = 64;
  const std::vector<unsigned_t> layers_;

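  // Creates a convolution whose filter and bias are filled with a distinct
  // constant value. Emits a channelwise-quantized int8 conv by default, or a
  // float conv when `fp` (or -fpEverywhere) is set.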
  NodeValue createConv(NodeValue input, unsigned_t outChannels,
                       unsigned_t kernel, unsigned_t stride = 1,
                       unsigned_t pad = 0, unsigned_t dilation = 1,
                       unsigned_t groups = 1, bool fp = false) {

    if (fpEverywhere) {
      fp = true;
    }

    ShapeNHWC inputShape(input.dims());

    assert(inputShape.c % groups == 0);
    assert(outChannels % groups == 0);

    auto *filter = F_->getParent()->createConstant(
        ElemKind::FloatTy, {outChannels, kernel, kernel, inputShape.c / groups},
        "filter");

    size_t fanIn = kernel * kernel * inputShape.c;
    filter->getPayloadMutable().init(Tensor::InitKind::Xavier, fanIn,
                                     F_->getParent()->getPRNG());

    // Overwrite the Xavier init with a constant so that all networks have the
    // same weights.
    filter->getPayloadMutable().init(Tensor::InitKind::Broadcast,
                                     float(20 + nextFilterValue_++),
                                     F_->getParent()->getPRNG());

    auto bias = F_->getParent()->createConstant(ElemKind::FloatTy,
                                                {outChannels}, "bias");
    bias->getPayloadMutable().init(Tensor::InitKind::Broadcast,
                                   float(20 + nextFilterValue_++),
                                   F_->getParent()->getPRNG());

    std::vector<unsigned_t> kernels = {kernel, kernel};
    std::vector<unsigned_t> strides = {stride, stride};
    std::vector<unsigned_t> pads = {pad, pad, pad, pad};
    std::vector<unsigned_t> dilations = {dilation, dilation};

    auto outSz = calculateConvPoolOutputDims(inputShape.h, inputShape.w,
                                             kernels, strides, pads, dilations);
    std::array<dim_t, 4> outDims = {
        {inputShape.n, outSz.first, outSz.second, outChannels}};

    if (fp) {
      auto *outTy = F_->getParent()->uniqueType(ElemKind::FloatTy, outDims);
      return F_
          ->createConv("conv", input, filter, bias, outTy, kernels, strides,
                       pads, groups, dilations)
          ->getResult();
    } else {
      auto *outTy =
          F_->getParent()->uniqueType(ElemKind::Int8QTy, outDims, 1.0, 0);

      return F_
          ->createChannelwiseQuantizedConv(
              "conv", input, filter, bias, /*filterScales*/ nullptr,
              /*filterOffsets*/ nullptr, /*biasScales*/ nullptr,
              /*biasOffsets*/ nullptr, outTy, kernels, strides, pads, groups,
              dilations,
              /*quantizeFilter*/ true, /*quantizeBias*/ false,
              /*schema*/ quantization::Schema::Symmetric)
          ->getResult();
    }
  }

  NodeValue conv3x3(NodeValue input, unsigned_t outPlanes,
                    unsigned_t stride = 1, unsigned_t groups = 1,
                    unsigned_t dilation = 1) {
    return createConv(input, outPlanes, /*kernel*/ 3, stride, /*pad*/ dilation,
                      dilation, groups);
  }

  NodeValue conv1x1(NodeValue input, unsigned_t outPlanes,
                    unsigned_t stride = 1) {
    return createConv(input, outPlanes, /*kernel*/ 1, stride);
  }

  NodeValue createRelu(NodeValue input) {
    return F_->createRELU("relu", input)->getResult();
  }

  NodeValue createAdd(NodeValue lhs, NodeValue rhs) {
    if (isQuantizedElemKind(lhs.getElementType())) {
      return F_->createAdd("qadd", lhs.getType(), lhs, rhs);
    } else {
      return F_->createAdd("add", lhs, rhs);
    }
  }

  NodeValue createBN(NodeValue input) {
    // Emulate fused Conv + BN
    auto inputKind = input.getNode()->getKind();
    if (inputKind == Kinded::Kind::ConvolutionNodeKind ||
        inputKind == Kinded::Kind::ChannelwiseQuantizedConvolutionNodeKind) {
      return input;
    }
    LOG(FATAL)
        << "Fake batchnorm op has to be after a convolution to emulate fusion";
  }

  NodeValue makeAvgPool(NodeValue input) {
    auto inputDims = input.dims();
    auto *outTy = F_->getParent()->uniqueTypeWithNewShape(
        input.getType(), {inputDims[0], 1, 1, inputDims[3]});
    return F_->createAdaptiveAvgPool("adaptive_avg_pool", input, outTy)
        ->getResult();
  }

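  // Builds one residual block (Bottleneck: 1x1 -> 3x3 -> 1x1; BasicBlock:
  // 3x3 -> 3x3), adding `residual` into the output before the final relu.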
  NodeValue makeBlock(NodeValue input, NodeValue residual, unsigned_t planes,
                      unsigned_t stride = 1, unsigned_t groups = 1,
                      unsigned_t baseWidth = 64, unsigned_t dilation = 1) {
    unsigned_t expansion = getExpansion(block_);
    NodeValue next = input;
    if (block_ == Block::Bottleneck) {
      auto width = unsigned_t(planes * (baseWidth / 64.0)) * groups;
      next = conv1x1(next, width);
      next = createBN(next);
      next = createRelu(next);

      next = conv3x3(next, width, stride, groups, dilation);
      next = createBN(next);
      next = createRelu(next);

      next = conv1x1(next, planes * expansion);
      next = createBN(next);
      next = createAdd(next, residual);
      next = createRelu(next);

    } else if (block_ == Block::BasicBlock) {
      next = conv3x3(next, planes, stride);
      next = createBN(next);
      next = createRelu(next);

      next = conv3x3(next, planes);
      next = createBN(next);
      next = createAdd(next, residual);
      next = createRelu(next);
    } else {
      LOG(FATAL) << "Unknown block";
    }
    return next;
  }

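  // Stacks `blocks` residual blocks. The first block downsamples the residual
  // with a strided 1x1 conv when the stride or channel count changes.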
  NodeValue makeLayer(NodeValue input, unsigned_t planes, unsigned_t blocks,
                      unsigned_t stride) {
    NodeValue next = input;
    NodeValue residual = next;
    auto blockExpansion = getExpansion(block_);
    if (stride != 1 || inPlanes_ != planes * blockExpansion) {
      residual = conv1x1(next, planes * blockExpansion, stride);
    }
    next = makeBlock(next, residual, planes, stride, groups_, widthPerGroup_,
                     dilation_);

    for (unsigned_t i = 1; i < blocks; ++i) {
      residual = next;
      next = makeBlock(next, residual, planes, /*stride*/ 1, groups_,
                       widthPerGroup_, dilation_);
    }

    return next;
  }

public:
  ResNetBuilder(Block block, llvm::ArrayRef<unsigned_t> layers,
                unsigned_t groups = 1, unsigned_t widthPerGroup = 64)
      : block_(block), groups_(groups), widthPerGroup_(widthPerGroup),
        layers_(layers.vec()) {}

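  // Assembles the full network: the NCHW input is transposed to NHWC, run
  // through the 7x7 stem conv, max pooling, four residual stages, and
  // optional average pooling, then transposed back to NCHW before the save.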
  Placeholder *build(Placeholder *input, Function *F) {
    F_ = F;
    nextFilterValue_ = 1;
    NodeValue next = input->getOutput();
    next = F_->createTranspose("NCHW2NHWC", next, NCHW2NHWC);
    next =
        createConv(next, /*outChannels*/ inPlanes_, /*kernel*/ 7, /*stride*/ 2,
                   /*pad*/ 3, /*dilation*/ 1, /*groups*/ 1, /*fp*/ true);
    next = createBN(next);
    next = createRelu(next);
    next = F_->createMaxPool("maxpool", next, /*kernel*/ 3, /*stride*/ 2,
                             /*pad*/ 1)
               ->getResult();
    if (!fpEverywhere) {
      next = F_->createQuantize("quant", next, ElemKind::Int8QTy, 1.0, 0);
    }
    next = makeLayer(next, /*planes*/ 64, /*blocks*/ layers_[0],
                     /*stride*/ 1);

    next = makeLayer(next, /*planes*/ 128, /*blocks*/ layers_[1],
                     /*stride*/ 2);

    next = makeLayer(next, /*planes*/ 256, /*blocks*/ layers_[2],
                     /*stride*/ 2);

    next = makeLayer(next, /*planes*/ 512, /*blocks*/ layers_[3],
                     /*stride*/ 2);
    if (avgPool) {
      next = makeAvgPool(next);
    }
    if (!fpEverywhere) {
      next =
          F_->createDequantize("dequant", next, ElemKind::FloatTy)->getResult();
    }
    if (avgPoolFP) {
      next = makeAvgPool(next);
    }
    next = F_->createTranspose("NHWC2NCHW", next, NHWC2NCHW);
    Placeholder *output = F_->createSave("save", next)->getPlaceholder();
    F_ = nullptr;
    return output;
  }
};

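// A compiled network plus its input and output placeholders, addressed by
// function name.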
struct FunctionBundle {
  std::string name;
  Placeholder *input;
  Placeholder *output;
};

ResNetBuilder resnext101_32x4d() {
  return ResNetBuilder(Block::Bottleneck, {3, 4, 23, 3},
                       /*groups*/ 32,
                       /*widthPerGroup*/ 4);
}

// ResNetBuilder resnet50() {
//   return ResNetBuilder(Block::Bottleneck, {3, 4, 6, 3});
// }

/*
 * This class implements a performance proxy for ResNet-like models.
 */
class ResNetBench : public Benchmark {
private:
  ResNetBuilder builder_;
  std::vector<ShapeNCHW> shapes_;
  std::string backendName_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  std::vector<FunctionBundle> bundles_;
  int64_t compilationTime_;

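  // Builds one Function per input shape inside `mod`. The builder resets its
  // weight counter on every build, so functions built from the same builder
  // get identical constant weights across shapes and modules.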
  std::vector<FunctionBundle> makeNetworks(ResNetBuilder builder, Module &mod,
                                           llvm::ArrayRef<ShapeNCHW> shapes) {
    std::vector<FunctionBundle> res;
    for (const auto &shape : shapes) {
      std::string shapeStr =
          strFormat("%dx%dx%dx%d", int(shape.n), int(shape.c), int(shape.h),
                    int(shape.w));
      auto *F = mod.createFunction(strFormat("F_%s", shapeStr.c_str()));
      Placeholder *input = mod.createPlaceholder(
          ElemKind::FloatTy, {shape.n, shape.c, shape.h, shape.w}, "input",
          false);
      Placeholder *output = builder.build(input, F);
      FunctionBundle bundle;
      bundle.name = F->getName().str();
      bundle.input = input;
      bundle.output = output;
      res.push_back(std::move(bundle));
    }
    return res;
  }


public:
  ResNetBench(ResNetBuilder builder, llvm::ArrayRef<ShapeNCHW> shapes,
              std::string backendName)
      : builder_(builder), shapes_(shapes.vec()), backendName_(backendName) {}

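  // Creates numDevices device configs, builds all networks (split across up
  // to numCompileThreads modules), compiles the modules in parallel, then
  // runs a couple of warmup passes over every bundle.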
  void setup() override {
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    for (unsigned_t i = 0; i < numDevices; ++i) {
      auto config = std::make_unique<runtime::DeviceConfig>(backendName_);
      config->deviceID = i;
      configs.push_back(std::move(config));
    }

    glow::runtime::HostConfig hostConfig;
    hostConfig.maxActiveRequests = maxActiveRequests;

    hostManager_ =
        std::make_unique<runtime::HostManager>(std::move(configs), hostConfig);

    const auto numCompileThreadsToUse =
        std::min(size_t(numCompileThreads), shapes_.size());

    // Divide Functions up for compilation threads
    LOG(INFO) << "Building networks";
    std::vector<std::unique_ptr<Module>> modules;
    for (size_t i = 0; i < numCompileThreadsToUse; ++i) {
      auto mod = std::make_unique<Module>();
      const auto beginIt =
          shapes_.begin() + ((shapes_.size() / numCompileThreadsToUse) * i);
      const auto endIt =
          i == numCompileThreadsToUse - 1
              ? shapes_.end()
              : shapes_.begin() +
                    ((shapes_.size() / numCompileThreadsToUse) * (i + 1));
      std::vector<ShapeNCHW> threadShapes{beginIt, endIt};
      auto bundles = makeNetworks(builder_, *mod, threadShapes);
      for (auto &bundle : bundles) {
        bundles_.push_back(std::move(bundle));
      }
      modules.push_back(std::move(mod));
    }

    auto compileFn = [this](std::unique_ptr<Module> mod) {
      glow::CompilationContext cctx;
      cctx.replicationCount = replicationCount;
      cctx.saturateHost = saturateHost;
      cctx.precisionConfig.convertToFP16 = convertToFP16;
      cctx.dumpFinalGraph = dumpDAG;
      // Abort the benchmark if compilation fails; the Error must be consumed.
      EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), cctx));
    };

    // Compile modules in parallel
    LOG(INFO) << "Compiling networks";
    int64_t compilationStartTime = TraceEvent::now();
    std::vector<std::thread> threads;
    for (size_t i = 1; i < numCompileThreadsToUse; ++i) {
      auto mod = std::move(modules[i]);
      std::thread t(compileFn, std::move(mod));
      threads.push_back(std::move(t));
    }

    compileFn(std::move(modules[0]));

    for (auto &t : threads) {
      t.join();
    }

    int64_t compilationEndTime = TraceEvent::now();
    compilationTime_ = compilationEndTime - compilationStartTime;

    // Run a few warmups
    LOG(INFO) << "Running warmups";
    runImpl(2 * bundles_.size());
  }

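  // Issues `numRuns` blocking requests round-robin over the compiled bundles;
  // threadNum staggers each thread's starting bundle so requester threads
  // don't all hit the same network at once.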
  void runImpl(unsigned_t numRuns, int32_t threadNum = -1) {
    std::unique_ptr<ExecutionContext> ctx =
        glow::make_unique<ExecutionContext>();

    auto *bindings = ctx->getPlaceholderBindings();

    for (unsigned_t i = 0; i < numRuns; i++) {
      if (logEvery > 0 && i > 0 && threadNum == 0 && i % logEvery == 0) {
        LOG(INFO) << "Thread 0 reached request " << i;
      }
      // Add threadNum to offset the threads
      auto nextBundleNum = (std::max(threadNum, 0) + i) % bundles_.size();
      const auto &bundle = bundles_[nextBundleNum];
      bindings->allocate(bundle.input);
      bindings->allocate(bundle.output);
      // Abort on any runtime error; the returned Error must be consumed.
      EXIT_ON_ERR(hostManager_->runNetworkBlocking(bundle.name, ctx));
    }
  }

  void run() override {
    std::vector<std::thread> threads;
    unsigned_t reqsPerThread = numBatches / numRequesters;
    unsigned_t numReqs = numRequesters * reqsPerThread;

    LOG(INFO) << "Running";
    int64_t startTime = TraceEvent::now();
    for (unsigned_t i = 0; i < numRequesters; ++i) {
      threads.push_back(std::thread(
          [this, reqsPerThread, i]() { runImpl(reqsPerThread, i); }));
    }

    for (auto &thread : threads) {
      thread.join();
    }
    int64_t endTime = TraceEvent::now();
    int64_t totalTimeMs = (endTime - startTime) / 1000;

    std::cout << "Total runtime: " << totalTimeMs << "ms" << std::endl;
    if (totalTimeMs > 0) {
      std::cout << "Avg requests/second: "
                << numReqs / (double(totalTimeMs) / 1000) << std::endl;
      std::cout << "Avg images/second: "
                << (batchSize * numReqs) / (double(totalTimeMs) / 1000)
                << std::endl;
      std::cout << "Avg runtime per request: " << double(totalTimeMs) / numReqs
                << "ms" << std::endl;
    }
    std::cout << "numBins: " << numBins << std::endl;
    std::cout << "baseSize: " << baseSize << "x" << baseSize << std::endl;
    std::cout << "batchSize: " << batchSize << std::endl;
    std::cout << "stepSize: " << stepSize << std::endl;
    std::cout << "replicationCount: " << replicationCount << std::endl;
    std::cout << "numDevices: " << numDevices << std::endl;
    std::cout << "numRequesters: " << numRequesters << std::endl;
    std::cout << "compilation time: " << compilationTime_ / 1000 << "ms"
              << std::endl;
  }

  void teardown() override { LOG(INFO) << "Teardown"; }
};

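// Produces `numBins` NCHW shapes starting from baseSize x baseSize, growing W
// on odd bins and H on even bins via two independent accumulators. For
// example, baseSize=224, numBins=4, stepSize=10 yields HxW sizes 224x224,
// 224x234, 234x224, and 224x244.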
std::vector<ShapeNCHW> generateShapes(dim_t batchSize, dim_t baseSize,
                                      dim_t numBins, dim_t stepSize) {
  assert(numBins > 0);
  std::vector<ShapeNCHW> shapes;
  shapes.emplace_back(batchSize, 3, baseSize, baseSize);

  ShapeNCHW hStepped(batchSize, 3, baseSize, baseSize);
  ShapeNCHW wStepped(batchSize, 3, baseSize, baseSize);
  for (dim_t i = 1; i < numBins; ++i) {
    if (i % 2 == 0) {
      hStepped.h += stepSize;
      shapes.push_back(hStepped);
    } else {
      wStepped.w += stepSize;
      shapes.push_back(wStepped);
    }
  }
  return shapes;
}
} // namespace

int main(int argc, char *argv[]) {
  llvm::cl::ParseCommandLineOptions(argc, argv, "ResNet benchmark");

  CHECK(!avgPool || !avgPoolFP) << "avgPool and avgPoolFP can't both be true "
                                   "or pooling will occur twice";

  std::vector<ShapeNCHW> shapes =
      generateShapes(batchSize, baseSize, numBins, stepSize);
  auto builder = resnext101_32x4d();
  ResNetBench b(builder, shapes, backend);

  bench(&b, numReps);
}