/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "ExecutorCore.h"

#include "ExecutorCoreHelperFunctions.h"
#include "Loader.h"

#include "glow/Base/Image.h"
#include "glow/Base/TensorSerialization.h"
#include "glow/Converter/TypeAToTypeBFunctionConverter.h"
#include "glow/Importer/Caffe2ModelLoader.h"
#include "glow/Importer/ONNXModelLoader.h"
#include "glow/Optimizer/IROptimizer/CommandLine.h"
#include "glow/Support/Support.h"

#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"

#include <atomic>
#include <cfloat>
#include <fstream>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <sstream>
#include <thread>

extern llvm::cl::opt<unsigned> traceLevel;

using namespace glow;

namespace {

class PostProcessExecutor : public PostProcessOutputDataExtension {
public:
  /// Iterates over registered extensions for processing and printing results
  /// and executes them.
  /// \return the accumulated error count; a value greater than 0 indicates
  /// that one or more errors have occurred.
  int processOutputs(const llvm::StringMap<Placeholder *> &PHM,
                     PlaceholderBindings &bindings,
                     VecVecRef<std::string> inputImageBatchFilenames) override;

  /// Registers Post Processing Output extensions.
  void registerPostProcessOutputExtensions(
      const std::vector<PostProcessExtFuncPtr> &extVector);

private:
  UniquePtrVec<PostProcessOutputDataExtension> extensions_;
};
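
// A minimal application-side extension might look like the following sketch
// (the class name and body are hypothetical; the signature matches the
// interface used above):
//
//   class CountOutputs : public PostProcessOutputDataExtension {
//     int processOutputs(const llvm::StringMap<Placeholder *> &PHM,
//                        PlaceholderBindings &bindings,
//                        VecVecRef<std::string> files) override {
//       llvm::outs() << "Got " << PHM.size() << " output placeholder(s).\n";
//       return 0; // No errors.
//     }
//   };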

class PreProcessInputExecutor : public PreProcessInputDataExtension {
public:
  /// Iterates over PreProcessInputDataExtension extensions and executes them
  /// one by one.
  void processInputTensor(llvm::ArrayRef<Tensor *> inputImageData,
                          size_t startId, size_t endId,
                          size_t batchSz) override;

  /// Registers Input Data Preprocessing Extensions.
  void registerInputDataPreProcessingExtension(
      const std::vector<
          std::function<std::unique_ptr<PreProcessInputDataExtension>()>>
          &extVector);

private:
  UniquePtrVec<PreProcessInputDataExtension> extensions_;
};

void PostProcessExecutor::registerPostProcessOutputExtensions(
    const std::vector<PostProcessExtFuncPtr> &extVector) {
  for (auto &f : extVector) {
    extensions_.push_back(f());
  }
}

} // namespace

/// Iterates over registered extensions for processing and printing results
/// and executes them.
int PostProcessExecutor::processOutputs(
    const llvm::StringMap<Placeholder *> &PHM, PlaceholderBindings &bindings,
    VecVecRef<std::string> inputImageBatchFilenames) {
  int numErrors = 0;
  for (auto &f : extensions_) {
    numErrors += f->processOutputs(PHM, bindings, inputImageBatchFilenames);
  }
  return numErrors;
}

/// Iterates over PreProcessInputDataExtension extensions and executes them
/// one by one.
void PreProcessInputExecutor::processInputTensor(
    llvm::ArrayRef<Tensor *> inputImageData, size_t startId, size_t endId,
    size_t batchSz) {
  for (auto &f : extensions_) {
    f->processInputTensor(inputImageData, startId, endId, batchSz);
  }
}

void PreProcessInputExecutor::registerInputDataPreProcessingExtension(
    const std::vector<
        std::function<std::unique_ptr<PreProcessInputDataExtension>()>>
        &extVector) {
  for (auto &f : extVector) {
    extensions_.push_back(f());
  }
}

Executor::Executor(std::string appName, int argc, char **argv) {
  appName_ = appName;
  // Clear all external storage for variables set by command-line args. This
  // is necessary in order to support multiple calls to parse the command
  // line; it seems that clearing the command line options is not possible,
  // thus, we clear their external storage only. With each successive call
  // to parse the arguments, arguments keep piling up in the ::cl argument
  // list; however, external storage will be set by the arguments from the
  // current call only.
  // NOTE: llvm::cl::ResetAllOptionOccurrences() or opt.reset() should do the
  // job but they don't work.
  // TODO: Loader should provide a function to register callbacks.
  initExecutorCoreCmdArgVars();
  initImageCmdArgVars();
  // Verify/initialize command line parameters, and then the loader
  // initializes the ExecutionEngine and Function.
  parseCommandLine(argc, argv);
  processImageCmdArgVars(modelInputsOpt.size());
}
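
// Typical application flow (sketch): construct an Executor, register any
// extensions, then execute. MyPostProcessor is a hypothetical
// PostProcessOutputDataExtension subclass (e.g. like the sketch near the top
// of this file); PostProcessExtFuncPtr is used here as a factory returning a
// unique_ptr, as its use above suggests:
//
//   Executor core("my-app", argc, argv);
//   core.registerPostProcessOutputExtension(
//       []() { return glow::make_unique<MyPostProcessor>(); });
//   return core.executeNetwork();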

/// Registers a Loader Extension that will be invoked after the model is
/// loaded. If multiple extensions are registered, they will be executed in
/// the order they were registered.
void Executor::registerLoaderExtension(
    std::function<std::unique_ptr<LoaderExtension>()> func) {
  loaderextensions_.push_back(func);
}

/// Registers an extension that will be invoked on the Tensor containing the
/// current batch of input data. If multiple extensions are registered, they
/// will be executed in the order they were registered.
void Executor::registerInputDataPreProcessingExtension(
    std::function<std::unique_ptr<PreProcessInputDataExtension>()> func) {
  ppInputDataExtensions_.push_back(func);
}

/// Registers an extension that will be invoked for each execution of the
/// network. If multiple extensions are registered, they will be executed in
/// the order they were registered.
void Executor::registerPostProcessOutputExtension(PostProcessExtFuncPtr func) {
  ppOutputDataExtensions_.push_back(func);
}

/// Iterates over the registered factory lambdas and registers a fresh
/// extension instance with each Loader created in the main dispatch loop.
void Executor::addLoaderExtensions(Loader &ld) {
  for (auto &f : loaderextensions_) {
    ld.registerExtension(f());
  }
}

void parseInputFiles(VecVec<std::string> &inputImageFiles) {
  if (inputImageListFileOpt.empty() && inputImageDirs.empty() &&
      inputTensorListFile.empty() && inputImageFilenamesOpt.size() == 0) {
    llvm::errs() << "Args: Either the positional image list or "
                    "-input-image-dir or "
                    "-input-image-list-file or "
                    "-input-tensor-list-file "
                    "must be used to specify input images.\n";
    return;
  }

  if (!inputImageDirs.empty() &&
      (!inputImageListFileOpt.empty() || inputImageFilenamesOpt.size() != 0)) {
    LOG(FATAL) << "Args: Specifying images using -input-image-dir cannot be "
                  "combined with "
                  "-input-image-list-file or the positional image list.\n";
  }

  if (!inputImageListFileOpt.empty() && inputImageFilenamesOpt.size() != 0) {
    LOG(FATAL) << "Args: The positional image list cannot be combined with "
                  "-input-image-list-file to specify input images.\n";
  }

  int32_t numInputNames = modelInputsOpt.size();

  // If a positional list of images is given, we support one input only;
  // assign the list to the 1st input vector.
  if (inputImageFilenamesOpt.size() != 0) {
    CHECK_EQ(numInputNames, 1) << "When using the positional image list, "
                                  "only single-input networks are supported.";
    inputImageFiles.push_back(inputImageFilenamesOpt);
    return;
  }

  if (!inputTensorListFile.empty()) {
    CHECK_EQ(inputImageFilenamesOpt.size(), 0)
        << "When using -input-tensor-list-file, all input images must be "
           "specified using the -input-tensor-list-file option.";
    CHECK_EQ(inputImageListFileOpt.size(), 0)
        << "When using -input-tensor-list-file, all input images must be "
           "specified using the -input-tensor-list-file option.";
    CHECK_EQ(numInputNames, 1) << "When using -input-tensor-list-file, only "
                                  "single-input networks are supported.";
    std::vector<std::string> imageFiles;
    parseInputList(inputTensorListFile, imageFiles);
    inputImageFiles.push_back(imageFiles);
    return;
  }

  if (!inputImageDirs.empty()) {
    CHECK_EQ(numInputNames, 1) << "When using an image dir, only single-input "
                                  "networks are supported.";
    for (const auto &inputImageDir : inputImageDirs) {
      std::vector<std::string> imageFiles;
      parseInputDir(inputImageDir, imageFiles);
      inputImageFiles.push_back(imageFiles);
    }
    return;
  }

  // Images are given as a vector of image list files, one list per input.
  CHECK_EQ(numInputNames, inputImageListFileOpt.size())
      << "Args: the number of inputs and the number of input image lists "
         "must match.";

  size_t numInputImages = 0;
  for (int i = 0; i < numInputNames; i++) {
    std::vector<std::string> imageFiles;
    parseInputList(inputImageListFileOpt[i], imageFiles);
    inputImageFiles.push_back(imageFiles);
    if (i > 0) {
      CHECK_EQ(numInputImages, inputImageFiles[i].size())
          << "Each image list file should have the same number of images.";
    } else {
      numInputImages = inputImageFiles[i].size();
    }
  }
}
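
// Illustrative flag combinations accepted by parseInputFiles():
// - Multi-input networks: one -input-image-list-file per model input, all
//   lists the same length.
// - Single-input networks only: the positional image list, -input-image-dir,
//   or -input-tensor-list-file, as enforced by the checks above.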

/// Parse the command line, then load, build, and execute the network.
int Executor::executeNetwork() {
  parseInputFiles(inputImageFilenames_);

  if (excludedFirstWarmupRuns && excludedFirstWarmupRuns >= warmup) {
    llvm::errs() << "Excluding all warmup runs does not make sense\n";
    return 1;
  }
  // Stream input mode.
  const bool streamInputFilenamesMode = inputImageFilenamesOpt.size() == 1 &&
                                        inputImageFilenamesOpt.front() == "-";

  CHECK(!(streamInputFilenamesMode && emittingBundle()))
      << "Cannot emit a bundle and also stream inputs.";

  // If tracing is enabled, create a TraceContext to merge each run's events
  // into.
  if (!tracePath.empty()) {
    traceContext = glow::make_unique<TraceContext>(TraceLevel::STANDARD);
  }

  // Mini-batch mode.
  const bool miniBatchMode = miniBatch > 0;
  CHECK(((!miniBatchMode) || (!streamInputFilenamesMode)))
      << "The minibatch option is not compatible with the stream input "
         "image mode.";
  CHECK(((!miniBatchMode) || (inputImageFilenames_[0].size() % miniBatch == 0)))
      << "The number of input images must be a multiple of the mini-batch.";

  CHECK(((!iterationsOpt) || (!miniBatchMode) ||
         (iterationsOpt % miniBatch == 0)))
      << "The benchmark count must be a multiple of the mini-batch.";
  CHECK(!preloadAllImages || miniBatchMode)
      << "preload-all-images can only be used with minibatch.";

  const bool singleBatchRepeatedMode = repeatSingleBatchCount > 0;
  CHECK(!(streamInputFilenamesMode && singleBatchRepeatedMode))
      << "singleBatchRepeatedMode is not compatible with "
         "streamInputFilenamesMode.";

  // When the mini-batch mode is enabled do not allow debug instrumentation.
  if (miniBatchMode) {
    CHECK(!instrumentDebug)
        << "The minibatch option is not compatible with debug instrumentation.";
  }

  CHECK(!preloadAllImages || (modelInputsOpt.size() == 1))
      << "Preloading all images doesn't support networks with multiple inputs.";

  CHECK(!iterationsOpt || (modelInputsOpt.size() == 1))
      << "Benchmark mode doesn't support networks with multiple inputs.";

  // Print the path of the model being run.
  llvm::outs() << "Model: " << Loader::getModelOptPath() << "\n";
  std::mutex ioMu;
  int numErrors = 0;

  if (runAllInputsOnAllDevices) {
    if (numDevices != miniBatchThreads) {
      llvm::outs() << "Setting " << miniBatchThreads.ArgStr << " to match "
                   << numDevices.ArgStr << " (" << numDevices
                   << ") as required by " << runAllInputsOnAllDevices.ArgStr
                   << "\n";
      miniBatchThreads.getValue() = numDevices;
    }
  }

  // If preloading, load and process all images here into
  // preloadedInputImageData.
  Tensor preloadedInputImageData;
  if (preloadAllImages) {
    Loader loader;
    PreProcessInputExecutor ppImageExecutor;
    addLoaderExtensions(loader);
    ppImageExecutor.registerInputDataPreProcessingExtension(
        ppInputDataExtensions_);

    if (!inputTensorListFile.empty()) {
      loadInputImageFromFileWithType(
          inputImageFilenames_[0], &preloadedInputImageData, imageLayoutOpt[0]);
    } else {
      // Load and process the image data into the preloaded input Tensor.
      loadImagesAndPreprocess(inputImageFilenames_, {&preloadedInputImageData});
      ppImageExecutor.processInputTensor({&preloadedInputImageData}, 0,
                                         inputImageFilenames_[0].size(),
                                         preloadedInputImageData.dims()[0]);
    }
  }
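
  // Note: preloadedInputImageData now holds every input image in a single
  // tensor; each worker thread below takes per-minibatch views of it via
  // getUnowned() instead of reloading images from disk.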

  // Process a set of minibatches with indices [startIndex, endIndex).
  auto processImageRange = [&](size_t startIndex, size_t endIndex, size_t TID) {
    std::unique_ptr<ExecutionContext> exContext =
        glow::make_unique<ExecutionContext>();
    PlaceholderBindings &bindings = *exContext->getPlaceholderBindings();
    if (traceContext) {
      exContext->setTraceContext(
          glow::make_unique<TraceContext>(TraceLevel::STANDARD));
    }
    // If runAllInputsOnAllDevices, then assign the thread with this TID to
    // device TID, e.g. TID 2 is assigned to device 2.
    Loader loader = runAllInputsOnAllDevices ? Loader(TID) : Loader();
    PostProcessExecutor ppResultExecutor;
    PreProcessInputExecutor ppImageExecutor;

    // Register all the extensions for this thread.
    addLoaderExtensions(loader);
    ppResultExecutor.registerPostProcessOutputExtensions(
        ppOutputDataExtensions_);
    ppImageExecutor.registerInputDataPreProcessingExtension(
        ppInputDataExtensions_);

    // Used to make sure we only compile once, and run only once if not
    // streaming.
    bool isFirstRun = true;

    // Perform graph profiling initialization if needed.
    // if (profilingGraph()) {
    //   loader.initGraphProfiling(
    //       bindings,
    //       miniBatch > 0 ? miniBatch : inputImageFilenames_[0].size(),
    //       inputImageFilenames_[0].size());
    // }

    // These will be set during the first run.
    llvm::StringMap<Placeholder *> iPHM;
    llvm::StringMap<Placeholder *> oPHM;
    std::vector<Placeholder *> inPHs;
    std::vector<Placeholder *> outPHs;

    size_t miniBatchIndex = startIndex;
    std::vector<Tensor> inputData(modelInputsOpt.size());
    if (preloadAllImages) {
      inputData[0] = preloadedInputImageData.getUnowned();
    }

    VecVec<std::string> inputImageBatchFilenames;
    if ((!miniBatchMode) &&
        (!streamInputFilenamesMode || singleBatchRepeatedMode)) {
      inputImageBatchFilenames = inputImageFilenames_;
    } else if (singleBatchRepeatedMode) {
      for (size_t i = 0, e = modelInputsOpt.size(); i < e; i++) {
        std::vector<std::string> names(inputImageFilenames_[0].begin(),
                                       inputImageFilenames_[0].begin() +
                                           miniBatch);
        inputImageBatchFilenames.push_back(names);
      }
    }
    if (!tracePath.empty()) {
      loader.getHostManager()->setTraceContext(
          glow::make_unique<TraceContext>(traceLevel));
      Error err = loader.getHostManager()->startDeviceTrace();
      if (err) {
        LOG(INFO) << "Failed to start device trace.";
        numErrors = 1;
        return;
      } else {
        llvm::outs() << "Device trace started.\n";
      }
    }

    // Pass input tensors around as an array of pointers.
    std::vector<Tensor *> inputImageData;
    for (auto &data : inputData) {
      inputImageData.push_back(&data);
    }

    unsigned repeatedLoopCountRemaining = repeatSingleBatchCount;

    auto loopCond = [&]() {
      // If in stream mode then get the next image filenames if they exist,
      // otherwise exit.
      if (streamInputFilenamesMode) {
        return getNextStdinImageFilenames(inputImageBatchFilenames);
      }

      // If a single batch is loaded once and then repeated, keep running
      // until repeatedLoopCountRemaining reaches zero.
      if (singleBatchRepeatedMode) {
        return repeatedLoopCountRemaining-- != 0;
      }

      // If in miniBatchMode then continue if we have already preloaded all
      // images (we will break inside the loop once done); otherwise get the
      // next miniBatch image filenames if they exist, and exit if they don't.
      if (miniBatchMode) {
        return getNextMiniBatch(inputImageBatchFilenames, inputImageFilenames_,
                                miniBatchIndex, miniBatch, endIndex);
      }

      // Enter at least once, e.g. to just dump a bundle.
      return isFirstRun;
    };
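
    // In summary: stream mode loops until stdin stops providing filenames;
    // single-batch-repeated mode loops repeatSingleBatchCount times (note
    // the post-decrement above); minibatch mode walks [startIndex, endIndex);
    // otherwise the body runs exactly once.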

    while (loopCond()) {
      if (!preloadAllImages && (!singleBatchRepeatedMode || isFirstRun)) {
        // Load and process the image data into the inputImageData Tensor.
        if (!inputTensorListFile.empty()) {
          loadInputImageFromFileWithType(inputImageBatchFilenames[0],
                                         inputImageData[0], imageLayoutOpt[0]);
        } else {
          loadImagesAndPreprocess(inputImageBatchFilenames, inputImageData);
          ppImageExecutor.processInputTensor(inputImageData, startIndex,
                                             endIndex,
                                             inputImageData[0]->dims()[0]);
        }
      }

      // Note: At this point miniBatchIndex is the end index, so subtract
      // miniBatch to get the start index.
      const dim_t startMiniBatchIndex = miniBatchIndex - miniBatch;

      ShapeVector imageShape(inputImageData[0]->getType().dims().begin(),
                             inputImageData[0]->getType().dims().end());
      if (miniBatch) {
        imageShape[0] = miniBatch;
      } else if (iterationsOpt) {
        imageShape[0] = iterationsOpt;
      }

      // If we are benchmarking, reset the image data to the batch size we
      // need.
      if (iterationsOpt) {
        auto resetTensor = [](Tensor *tensor) {
          ShapeVector imageSize(tensor->getType().dims().begin(),
                                tensor->getType().dims().end());
          imageSize[0] = miniBatch ? miniBatch : iterationsOpt;
          tensor->reset(ElemKind::FloatTy, imageSize);
        };
        std::for_each(inputImageData.begin(), inputImageData.end(),
                      resetTensor);
      }

      // If this is the first run, then we need to build and compile the model.
      if (isFirstRun) {
        isFirstRun = false;

        std::vector<TypeRef> types;
        auto preloadTy =
            Type::newShape(inputImageData[0]->getType(), imageShape);

        if (preloadAllImages) {
          types.push_back(&preloadTy);
        } else {
          // Get the types of all input tensors.
          for_each(inputImageData.begin(), inputImageData.end(),
                   [&](auto *t) { types.push_back(&t->getType()); });
        }

        // Build and compile the graph, then get input and output Placeholders.
        std::tie(iPHM, oPHM) =
            buildAndCompileAndGetInAndOutPair(loader, bindings, types);

        // If in bundle mode, the bundle has been saved by the above call, so
        // we can safely return.
        if (emittingBundle()) {
          LOG(INFO) << "Emit bundle mode is on. Network is compiled only.";
          return;
        }

        // Obtain the input/output placeholders from the input/output maps.
        // For the inputs we got a map, but we need an array, taking entries
        // from the map in the order specified by modelInputsOpt.
        for (size_t i = 0, e = modelInputsOpt.size(); i < e; i++) {
          auto it = iPHM.find(modelInputsOpt[i]);
          CHECK(it != iPHM.end())
              << "Couldn't find placeholder: " << modelInputsOpt[i];
          CHECK((*it).second) << "Placeholder in input map is NULL.";
          inPHs.push_back((*it).second);
        }
        for_each(oPHM.begin(), oPHM.end(), [&](auto &p) {
          CHECK(p.second) << "Placeholder in output map is NULL.";
          outPHs.push_back(p.second);
        });
      }

      // If preloading all images, set up a new Tensor that is a slice view
      // of the 1st (and only) input tensor, and point the inputImageData
      // array of tensor pointers at it for the rest of this iteration.
      Tensor inputImageDataBatch;
      if (preloadAllImages) {
        std::vector<dim_t> imgSliceStart(imageShape.size(), 0);
        imgSliceStart[0] = startMiniBatchIndex;
        inputImageDataBatch =
            inputImageData[0]->getUnowned(imageShape, imgSliceStart);
        inputImageData[0] = &inputImageDataBatch;
      }
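
      // For example, with miniBatch == 10 the batch whose miniBatchIndex is
      // 30 views rows [20, 30) of the preloaded tensor; no image data is
      // copied.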

      // Compile done.
      CHECK(!inPHs.empty()) << "Input must be valid.";
      CHECK(!outPHs.empty()) << "Output must be valid.";
      CHECK_EQ(inPHs.size(), inputImageData.size())
          << "The number of input placeholders and tensors must match.";
      for (size_t i = 0, e = inputImageData.size(); i < e; i++) {
        CHECK(inPHs[i]->dims() == inputImageData[i]->dims())
            << "New input shape does not match the compiled function: "
            << inPHs[i]->dims() << " vs " << inputImageData[i]->dims();
      }

      // Convert the raw input to fp16. This must be done every time we get
      // new image data.
      if (convertInAndOutToFp16) {
        for (auto &t : inputImageData) {
          t->convertToType(ElemKind::Float16Ty);
        }
      }

      // If we are benchmarking, we are done with the while loop.
      if (iterationsOpt) {
        break;
      }

      // Minibatch inference initialization of loader extensions.
      loader.inferInitMiniBatch(bindings, startMiniBatchIndex, miniBatch);

      // About to run inference, so update the input image Placeholders'
      // backing Tensors with the current input batch.
      updateInputPlaceholders(bindings, inPHs, inputImageData);

      // Perform the inference execution, updating the output tensors.
      auto batchSize = inputImageData[0]->dims()[0];
      loader.runInference(exContext.get(), batchSize);
      if (traceContext) {
        traceContext->merge(exContext->getTraceContext());
      }

      // Process the output of the network. Each app can do its own
      // post-processing depending on the type of the network.
      {
        std::lock_guard<std::mutex> lock(ioMu);
        numErrors += ppResultExecutor.processOutputs(oPHM, bindings,
                                                     inputImageBatchFilenames);
      }

      // Minibatch inference end-of-batch processing for loader extensions.
      loader.inferEndMiniBatch(bindings, startMiniBatchIndex, miniBatch);
    }

    if (iterationsOpt) {
      // All image tensors are loaded up to be run at once in benchmark mode.
      UniquePtrVec<ExecutionContext> contexts =
          setupContextPool(outPHs, inPHs[0], *inputImageData[0]);

      std::string name = loader.getFunctionName();
      std::unique_ptr<llvm::Timer> restRunsTimer = nullptr;
      std::unique_ptr<llvm::Timer> firstRunsTimer = nullptr;
      std::unique_ptr<double> bestRunTime = nullptr;
      if (timeOpt) {
        if (excludedFirstWarmupRuns) {
          firstRunsTimer.reset(
              new llvm::Timer("First Runs", "First inference runs"));
          restRunsTimer.reset(
              new llvm::Timer("Rest Inferences", "Rest of the inference runs"));
        } else {
          restRunsTimer.reset(
              new llvm::Timer("Inferences", "All inference runs"));
        }
        bestRunTime.reset(new double);
        *bestRunTime = DBL_MAX;
      }
      unsigned requestCount = miniBatch ? iterationsOpt / miniBatch : 1;
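      // E.g., iterationsOpt == 512 with miniBatch == 64 issues 8 requests of
      // 64 images each; without minibatching, the whole run is a single
      // request.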

      runBenchmark(name, loader, std::move(contexts), requestCount, warmup,
                   restRunsTimer.get(), firstRunsTimer.get(),
                   bestRunTime.get());
      if (timeOpt) {
        double wallTime = restRunsTimer->getTotalTime().getWallTime();
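        // The divisor counts every item restRunsTimer covered: iterationsOpt
        // benchmark items plus warmup items, minus the excluded first runs,
        // which were accumulated in firstRunsTimer instead.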
        llvm::outs() << llvm::formatv(
            "Average wall time per item (s): {0:f4}\n",
            wallTime / (iterationsOpt + warmup - excludedFirstWarmupRuns));
        llvm::outs() << llvm::formatv(
            "            Best wall time (s): {0:f4}\n", *bestRunTime);
      }
    }

    if (profilingGraph()) {
      loader.generateAndSerializeProfilingInfos(bindings);
    }

    if (!tracePath.empty()) {
      Error err = loader.getHostManager()->stopDeviceTrace();
      if (err) {
        LOG(INFO) << "Failed to stop device trace.";
        numErrors = 1;
        return;
      } else {
        traceContext->merge(loader.getHostManager()->getTraceContext());
      }
    }
  };

  // We will force single-threaded execution if:
  // - Minibatch mode and runAllInputsOnAllDevices are disabled;
  // - We are going to emit a bundle and not run inference;
  // - We are collecting an inference profile.
  // Otherwise, there can be several minibatches of equal size.
  const bool multiThreadingAllowed =
      (runAllInputsOnAllDevices || miniBatchMode) && !emittingBundle() &&
      !profilingGraph();
  const size_t numBatches =
      miniBatchMode ? inputImageFilenames_[0].size() / miniBatch : 1u;
  const size_t numThreads =
      runAllInputsOnAllDevices
          ? miniBatchThreads
          : (multiThreadingAllowed
                 ? std::min(size_t(miniBatchThreads), numBatches)
                 : 1u);
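  // E.g., 64 images with miniBatch == 8 give numBatches == 8; with
  // miniBatchThreads == 4 this yields 4 threads of 2 minibatches each.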
  if (miniBatchThreads > 1 && !multiThreadingAllowed) {
    llvm::outs() << "WARNING: multi-threaded execution is not possible. Make "
                    "sure that the minibatch size is specified and you are "
                    "not trying to dump a profile or emit a bundle.\n";
  }

  llvm::outs() << "Running " << numThreads << " thread(s).\n";
  // Avoid default-constructing placeholder threads; reserve capacity and
  // push_back only the real worker threads below.
  std::vector<std::thread> threads;
  threads.reserve(numThreads);
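  // Ceiling division: e.g., 7 minibatches across 3 threads give 3 minibatches
  // per thread, with the last thread taking the remainder (1 minibatch).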
  const size_t miniBatchesPerThread =
      (numBatches + numThreads - 1) / numThreads;
  for (size_t i = 0; i < numThreads; i++) {
    size_t startIndex, endIndex;
    if (!runAllInputsOnAllDevices && numThreads > 1) {
      startIndex = i * miniBatchesPerThread * miniBatch;
      endIndex = std::min((i + 1) * miniBatchesPerThread * miniBatch,
                          inputImageFilenames_[0].size());
    } else {
      startIndex = 0;
      endIndex = inputImageFilenames_[0].size();
    }
    auto worker = [&processImageRange, startIndex, endIndex, i]() {
      processImageRange(startIndex, endIndex, i);
    };
    threads.push_back(std::thread(worker));
  }

  for (auto &t : threads) {
    if (t.joinable()) {
      t.join();
    }
  }

  if (!tracePath.empty()) {
    traceContext->dump(tracePath, appName_);
  }

  return numErrors;
}