1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include "ExecutorCore.h" |
18 | |
19 | #include "ExecutorCoreHelperFunctions.h" |
20 | #include "Loader.h" |
21 | |
22 | #include "glow/Base/Image.h" |
23 | #include "glow/Base/TensorSerialization.h" |
24 | #include "glow/Converter/TypeAToTypeBFunctionConverter.h" |
25 | #include "glow/Importer/Caffe2ModelLoader.h" |
26 | #include "glow/Importer/ONNXModelLoader.h" |
27 | #include "glow/Optimizer/IROptimizer/CommandLine.h" |
28 | #include "glow/Support/Support.h" |
29 | |
30 | #include "llvm/ADT/StringSwitch.h" |
31 | #include "llvm/Support/CommandLine.h" |
32 | #include "llvm/Support/Format.h" |
33 | #include "llvm/Support/Timer.h" |
34 | #include "llvm/Support/raw_ostream.h" |
35 | |
#include <atomic>
#include <cfloat>
#include <fstream>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <sstream>
#include <thread>
#include <utility>
46 | |
47 | extern llvm::cl::opt<unsigned> traceLevel; |
48 | |
49 | using namespace glow; |
50 | |
51 | namespace { |
52 | |
/// Aggregates all registered post-processing extensions and runs them as one.
class PostProcessExecutor : public PostProcessOutputDataExtension {
public:
  /// Iterates over registered extensions for processing and printing results
  /// and executes them.
  /// \return accumulated errors. Value greater than 0 indicates one or more
  /// errors have occurred.
  int processOutputs(const llvm::StringMap<Placeholder *> &PHM,
                     PlaceholderBindings &bindings,
                     VecVecRef<std::string> inputImageBatchFilenames) override;

  /// Registers Post Processing Output extensions.
  void registerPostProcessOutputExtensions(
      const std::vector<PostProcessExtFuncPtr> &extVector);

private:
  /// Extensions executed in registration order by processOutputs().
  UniquePtrVec<PostProcessOutputDataExtension> extensions_;
};
70 | |
/// Aggregates all registered input pre-processing extensions and runs them as
/// one.
class PreProcessInputExecutor : public PreProcessInputDataExtension {
public:
  /// Iterates over PreProcessInputDataExtension extensions and executes them
  /// one by one.
  void processInputTensor(llvm::ArrayRef<Tensor *> inputImageData,
                          size_t startId, size_t endId,
                          size_t batchSz) override;

  /// Registers Input Data Preprocessing Extensions.
  void registerInputDataPreProcessingExtension(
      const std::vector<
          std::function<std::unique_ptr<PreProcessInputDataExtension>()>>
          &extVector);

private:
  /// Extensions executed in registration order by processInputTensor().
  UniquePtrVec<PreProcessInputDataExtension> extensions_;
};
88 | |
89 | void PostProcessExecutor::registerPostProcessOutputExtensions( |
90 | const std::vector<PostProcessExtFuncPtr> &extVector) { |
91 | for (auto &f : extVector) { |
92 | extensions_.push_back(f()); |
93 | } |
94 | } |
95 | |
96 | } // namespace |
97 | |
98 | /// Iterates over registered extensions for processing and Printing results and |
99 | /// executes them. |
100 | int PostProcessExecutor::processOutputs( |
101 | const llvm::StringMap<Placeholder *> &PHM, PlaceholderBindings &bindings, |
102 | VecVecRef<std::string> inputImageBatchFilenames) { |
103 | int numErrors = 0; |
104 | for (auto &f : extensions_) { |
105 | numErrors += f->processOutputs(PHM, bindings, inputImageBatchFilenames); |
106 | } |
107 | return numErrors; |
108 | } |
109 | |
110 | /// Iterates over PreProcessInputDataExtension extensions and execute them one |
111 | /// by one. |
112 | void PreProcessInputExecutor::processInputTensor( |
113 | llvm::ArrayRef<Tensor *> inputImageData, size_t startId, size_t endId, |
114 | size_t batchSz) { |
115 | for (auto &f : extensions_) { |
116 | f->processInputTensor(inputImageData, startId, endId, batchSz); |
117 | } |
118 | } |
119 | |
120 | void PreProcessInputExecutor::registerInputDataPreProcessingExtension( |
121 | const std::vector< |
122 | std::function<std::unique_ptr<PreProcessInputDataExtension>()>> |
123 | &extVector) { |
124 | for (auto &f : extVector) { |
125 | extensions_.push_back(f()); |
126 | } |
127 | } |
128 | |
129 | Executor::Executor(std::string appName, int argc, char **argv) { |
130 | appName_ = appName; |
131 | // Clear all external storage for command args set variables. This is |
132 | // necessary in order to support multiple calls to parse the command |
133 | // line; it seems that clearing the command line options is not possible, |
134 | // thus, we clear their external storage only. With each successive |
135 | // call to parse the arguments, arguments are pilling up in the ::cl |
136 | // argument, however, external storage will be set by the arguments from the |
137 | // current call only. |
138 | // NOTE: llvm::cl::ResetAllOptionOccurrences() or opt.reset() should do the |
139 | // job but they don't work. |
140 | // TODO: Loader should provide function to register callbacks. |
141 | initExecutorCoreCmdArgVars(); |
142 | initImageCmdArgVars(); |
143 | // Verify/initialize command line parameters, and then loader initializes |
144 | // the ExecutionEngine and Function. |
145 | |
146 | parseCommandLine(argc, argv); |
147 | processImageCmdArgVars(modelInputsOpt.size()); |
148 | } |
149 | |
150 | /// Registers a Loader Extension that will be invoked after model is loaded. |
151 | /// If multiple extensions are registered they will be executed in order they |
152 | /// were registered. |
153 | void Executor::registerLoaderExtension( |
154 | std::function<std::unique_ptr<LoaderExtension>()> func) { |
155 | loaderextensions_.push_back(func); |
156 | } |
157 | |
158 | /// Registers an extension that will be invoked on Tensor containing current |
159 | /// batch of input data. If multiple extensions are registered they will be |
160 | /// executed in order they were registered. |
161 | void Executor::registerInputDataPreProcessingExtension( |
162 | std::function<std::unique_ptr<PreProcessInputDataExtension>()> func) { |
163 | ppInputDataExtensions_.push_back(func); |
164 | } |
165 | |
166 | /// Registers extension that will be invoked for each execution of the |
167 | /// network. If multiple extensions are registered they will be executed in |
168 | /// order they were registered. |
169 | void Executor::registerPostProcessOutputExtension(PostProcessExtFuncPtr func) { |
170 | ppOutputDataExtensions_.push_back(func); |
171 | } |
172 | |
173 | /// Iterates over lambda expressions and registers them with each instance of a |
174 | /// loader in main dispatch loop. |
175 | void Executor::addLoaderExtensions(Loader &ld) { |
176 | for (auto &f : loaderextensions_) { |
177 | ld.registerExtension(f()); |
178 | } |
179 | } |
180 | |
/// Fills \p inputImageFiles with one list of input file paths per model
/// input, based on the command-line options. The four input sources
/// (positional image list, -input-image-dir, -input-image-list-file,
/// -input-tensor-list-file) are mutually exclusive; conflicting combinations
/// abort via LOG(FATAL)/CHECK. If no source is given at all, an error is
/// printed and \p inputImageFiles is left empty.
void parseInputFiles(VecVec<std::string> &inputImageFiles) {
  // No input source specified at all.
  if (inputImageListFileOpt.empty() && inputImageDirs.empty() &&
      inputTensorListFile.empty() && inputImageFilenamesOpt.size() == 0) {
    llvm::errs() << "Args: Either positional image list or "
                    "-input-image-dir or "
                    "-input-image-list-file or "
                    "-input-tensor-list-file "
                    "must be used to specify input images.\n";
    return;
  }

  if (!inputImageDirs.empty() &&
      (!inputImageListFileOpt.empty() || inputImageFilenamesOpt.size() != 0)) {
    LOG(FATAL) << "Args: Specifying image using input-image-dir cannot be "
                  "combined with "
                  "-input-image-list-file or the positional image list.\n";
  }

  if (!inputImageListFileOpt.empty() && inputImageFilenamesOpt.size() != 0) {
    LOG(FATAL) << "Args: positional image list cannot be combined with "
                  "-input-image-list-file to specify input images.\n";
  }

  int32_t numInputNames = modelInputsOpt.size();

  // if positional list of images, we support one input only. Assign 1st input
  // vector list.
  if (inputImageFilenamesOpt.size() != 0) {
    CHECK_EQ(numInputNames, 1) << "When using positional image list, single "
                                  "input networks are supported only.";
    inputImageFiles.push_back(inputImageFilenamesOpt);
    return;
  }

  if (!inputTensorListFile.empty()) {
    // Note: the positional-list case above has already returned, so the first
    // check is a defensive re-assertion that the sources were not combined.
    CHECK_EQ(inputImageFilenamesOpt.size(), 0)
        << "When using -input-tensor-list-file all Input images must be "
           "specified "
           "using -input-tensor-list-file option.";
    CHECK_EQ(inputImageListFileOpt.size(), 0)
        << "When using -input-tensor-list-file all Input images must be "
           "specified "
           "using -input-tensor-list-file option.";
    CHECK_EQ(numInputNames, 1) << "When using -input-tensor-list-file single "
                                  "input networks are supported only.";
    std::vector<std::string> imageFiles;
    parseInputList(inputTensorListFile, imageFiles);
    inputImageFiles.push_back(imageFiles);
    return;
  }

  if (!inputImageDirs.empty()) {
    CHECK_EQ(numInputNames, 1)
        << "When using image dir. single input networks are supported only.";
    // One filename list per directory given on the command line.
    for (const auto &inputImageDir : inputImageDirs) {
      std::vector<std::string> imageFiles;
      parseInputDir(inputImageDir, imageFiles);
      inputImageFiles.push_back(imageFiles);
    }
    return;
  }

  // If images are given using vector of lists of images
  CHECK_EQ(numInputNames, inputImageListFileOpt.size())
      << "Args: number of inputs and number of inputs image lists must match.";

  size_t numInputImages = 0;
  for (int i = 0; i < numInputNames; i++) {
    std::vector<std::string> imageFiles;
    parseInputList(inputImageListFileOpt[i], imageFiles);
    inputImageFiles.push_back(imageFiles);
    // All per-input lists must have the same length; remember the first
    // list's size and compare the rest against it.
    if (i > 0) {
      CHECK_EQ(numInputImages, inputImageFiles[i].size())
          << "Each image list file should have the same number of images.";
    } else {
      numInputImages = inputImageFiles[i].size();
    }
  }
}
260 | |
261 | /// This will parse command line, load, build and execute a network. |
262 | int Executor::executeNetwork() { |
263 | |
264 | parseInputFiles(inputImageFilenames_); |
265 | |
266 | if (excludedFirstWarmupRuns && excludedFirstWarmupRuns >= warmup) { |
267 | llvm::errs() << "Excluding all warmup runs does not make sense\n" ; |
268 | return 1; |
269 | } |
270 | // Stream input mode. |
271 | const bool streamInputFilenamesMode = inputImageFilenamesOpt.size() == 1 && |
272 | inputImageFilenamesOpt.front() == "-" ; |
273 | |
274 | CHECK(!(streamInputFilenamesMode && emittingBundle())) |
275 | << "Cannot emit a bundle and also stream inputs." ; |
276 | |
277 | // If tracing is enabled, create a TraceContext to merge each runs events |
278 | // into. |
279 | if (!tracePath.empty()) { |
280 | traceContext = glow::make_unique<TraceContext>(TraceLevel::STANDARD); |
281 | } |
282 | |
283 | // Mini-batch mode. |
284 | const bool miniBatchMode = miniBatch > 0; |
285 | CHECK(((!miniBatchMode) || (!streamInputFilenamesMode))) |
286 | << "The minibatch option is not compatible with the stream input " |
287 | "image mode." ; |
288 | CHECK(((!miniBatchMode) || (inputImageFilenames_[0].size() % miniBatch == 0))) |
289 | << "The number of input images must be a multiple of the mini-batch." ; |
290 | |
291 | CHECK(((!iterationsOpt) || (!miniBatchMode) || |
292 | (iterationsOpt % miniBatch == 0))) |
293 | << "Benchmark count must be a multiple of the mini-batch." ; |
294 | CHECK(!preloadAllImages || miniBatchMode) |
295 | << "preload-all-images can only be used with minibatch" ; |
296 | |
297 | const bool singleBatchRepeatedMode = repeatSingleBatchCount > 0; |
298 | CHECK(!(streamInputFilenamesMode && singleBatchRepeatedMode)) |
299 | << "singleBatchRepeatedMode is not compatible with " |
300 | "streamInputFilenamesMode" ; |
301 | |
302 | // When the mini-batch mode is enabled do not allow debug instrumentation. |
303 | if (miniBatchMode) { |
304 | CHECK(!instrumentDebug) |
305 | << "The minibatch option is not compatible with debug instrumentation." ; |
306 | } |
307 | |
308 | CHECK(!preloadAllImages || (modelInputsOpt.size() == 1)) |
309 | << "Preloading all images doesn't support networks with multiple inputs." ; |
310 | |
311 | CHECK(!iterationsOpt || (modelInputsOpt.size() == 1)) |
312 | << "Benchmark mode doesn't support networks with multiple inputs." ; |
313 | |
314 | // Print out the inferred image classification. |
315 | llvm::outs() << "Model: " << Loader::getModelOptPath() << "\n" ; |
316 | std::mutex ioMu; |
317 | int numErrors = 0; |
318 | |
319 | if (runAllInputsOnAllDevices) { |
320 | if (numDevices != miniBatchThreads) { |
321 | llvm::outs() << "Setting " << miniBatchThreads.ArgStr << " to match " |
322 | << numDevices.ArgStr << " (" << numDevices |
323 | << ") as required by " << runAllInputsOnAllDevices.ArgStr |
324 | << "\n" ; |
325 | miniBatchThreads.getValue() = numDevices; |
326 | } |
327 | } |
328 | |
329 | // If preloading then load+process all images here in preloadedInputImageData. |
330 | Tensor preloadedInputImageData; |
331 | if (preloadAllImages) { |
332 | Loader loader; |
333 | PreProcessInputExecutor ppImageExecutor; |
334 | addLoaderExtensions(loader); |
335 | ppImageExecutor.registerInputDataPreProcessingExtension( |
336 | ppInputDataExtensions_); |
337 | |
338 | if (!inputTensorListFile.empty()) { |
339 | loadInputImageFromFileWithType( |
340 | inputImageFilenames_[0], &preloadedInputImageData, imageLayoutOpt[0]); |
341 | } else { |
342 | // Load and process the image data into the inputImageData Tensor. |
343 | loadImagesAndPreprocess(inputImageFilenames_, {&preloadedInputImageData}); |
344 | ppImageExecutor.processInputTensor({&preloadedInputImageData}, 0, |
345 | inputImageFilenames_[0].size(), |
346 | preloadedInputImageData.dims()[0]); |
347 | } |
348 | } |
349 | |
350 | // Process a set of minibatches with indices [startIndex, endIndex). |
351 | auto processImageRange = [&](size_t startIndex, size_t endIndex, size_t TID) { |
352 | std::unique_ptr<ExecutionContext> exContext = |
353 | glow::make_unique<ExecutionContext>(); |
354 | PlaceholderBindings &bindings = *exContext->getPlaceholderBindings(); |
355 | if (traceContext) { |
356 | exContext->setTraceContext( |
357 | glow::make_unique<TraceContext>(TraceLevel::STANDARD)); |
358 | } |
359 | // If runAllInputsOnAllDevices, then assign this thread with TID to device |
360 | // TID. E.g. if this is TID 2 then this will be assigned to device 2. |
361 | Loader loader = runAllInputsOnAllDevices ? Loader(TID) : Loader(); |
362 | PostProcessExecutor ppResultExecutor; |
363 | PreProcessInputExecutor ppImageExecutor; |
364 | |
365 | // Registering all the extensions per thread. |
366 | addLoaderExtensions(loader); |
367 | ppResultExecutor.registerPostProcessOutputExtensions( |
368 | ppOutputDataExtensions_); |
369 | ppImageExecutor.registerInputDataPreProcessingExtension( |
370 | ppInputDataExtensions_); |
371 | |
372 | // Used to make sure we only compile once, and run only once if not |
373 | // streaming. |
374 | bool isFirstRun = true; |
375 | |
376 | // Perform graph profiling initialization if needed. |
377 | // if (profilingGraph()) { |
378 | // loader.initGraphProfiling( |
379 | // bindings, miniBatch > 0 ? miniBatch : |
380 | // inputImageFilenames_[0].size(), inputImageFilenames_[0].size()); |
381 | //} |
382 | |
383 | // These will be set during the first run. |
384 | llvm::StringMap<Placeholder *> iPHM; |
385 | llvm::StringMap<Placeholder *> oPHM; |
386 | std::vector<Placeholder *> inPHs; |
387 | std::vector<Placeholder *> outPHs; |
388 | |
389 | size_t miniBatchIndex = startIndex; |
390 | std::vector<Tensor> inputData(modelInputsOpt.size()); |
391 | if (preloadAllImages) { |
392 | inputData[0] = preloadedInputImageData.getUnowned(); |
393 | } |
394 | |
395 | VecVec<std::string> inputImageBatchFilenames; |
396 | if ((!miniBatchMode) && |
397 | (!streamInputFilenamesMode || singleBatchRepeatedMode)) { |
398 | inputImageBatchFilenames = inputImageFilenames_; |
399 | } else if (singleBatchRepeatedMode) { |
400 | for (size_t i = 0, e = modelInputsOpt.size(); i < e; i++) { |
401 | std::vector<std::string> names(inputImageFilenames_[0].begin(), |
402 | inputImageFilenames_[0].begin() + |
403 | miniBatch); |
404 | inputImageBatchFilenames.push_back(names); |
405 | } |
406 | } |
407 | if (!tracePath.empty()) { |
408 | loader.getHostManager()->setTraceContext( |
409 | glow::make_unique<TraceContext>(traceLevel)); |
410 | Error err = loader.getHostManager()->startDeviceTrace(); |
411 | if (err) { |
412 | LOG(INFO) << "Failed to start device trace." ; |
413 | numErrors = 1; |
414 | return; |
415 | } else { |
416 | llvm::outs() << "Device trace started." ; |
417 | } |
418 | } |
419 | |
420 | // Pass input tensors around as array of pointers. |
421 | std::vector<Tensor *> inputImageData; |
422 | for (auto &data : inputData) { |
423 | inputImageData.push_back(&data); |
424 | } |
425 | |
426 | unsigned repeatedLoopCountRemaining = repeatSingleBatchCount; |
427 | |
428 | auto loopCond = [&]() { |
429 | // If in stream mode then get the next image filenames if they exist, |
430 | // otherwise exit. |
431 | if (streamInputFilenamesMode) { |
432 | return getNextStdinImageFilenames(inputImageBatchFilenames); |
433 | } |
434 | |
435 | // If a single batch is going to be loaded once and repeated then keep |
436 | // running repeatedLoopCountRemaining mores times. |
437 | if (singleBatchRepeatedMode) { |
438 | return repeatedLoopCountRemaining-- != 0; |
439 | } |
440 | |
441 | // If in miniBatchMode then continue if we have already preloaded all |
442 | // images (will break inside loop once done), or otherwise get the next |
443 | // miniBatch image filenames if they exist, otherwise exit. |
444 | if (miniBatchMode) { |
445 | return getNextMiniBatch(inputImageBatchFilenames, inputImageFilenames_, |
446 | miniBatchIndex, miniBatch, endIndex); |
447 | } |
448 | |
449 | // At least enter once, e.g. to just dump a bundle. |
450 | return isFirstRun; |
451 | }; |
452 | |
453 | while (loopCond()) { |
454 | if (!preloadAllImages && (!singleBatchRepeatedMode || isFirstRun)) { |
455 | // Load and process the image data into the inputImageData Tensor. |
456 | if (!inputTensorListFile.empty()) { |
457 | loadInputImageFromFileWithType(inputImageBatchFilenames[0], |
458 | inputImageData[0], imageLayoutOpt[0]); |
459 | } else { |
460 | loadImagesAndPreprocess(inputImageBatchFilenames, inputImageData); |
461 | ppImageExecutor.processInputTensor(inputImageData, startIndex, |
462 | endIndex, |
463 | inputImageData[0]->dims()[0]); |
464 | } |
465 | } |
466 | |
467 | // Note: At this point miniBatchIndex is the end index, so subtract |
468 | // miniBatch to get the start index. |
469 | const dim_t startMiniBatchIndex = miniBatchIndex - miniBatch; |
470 | |
471 | ShapeVector imageShape(inputImageData[0]->getType().dims().begin(), |
472 | inputImageData[0]->getType().dims().end()); |
473 | if (miniBatch) { |
474 | imageShape[0] = miniBatch; |
475 | } else if (iterationsOpt) { |
476 | imageShape[0] = iterationsOpt; |
477 | } |
478 | |
479 | // If we are benchmarking reset the image data to the batch size we need. |
480 | if (iterationsOpt) { |
481 | auto resetTensor = [](Tensor *tensor) { |
482 | ShapeVector imageSize(tensor->getType().dims().begin(), |
483 | tensor->getType().dims().end()); |
484 | imageSize[0] = miniBatch ? miniBatch : iterationsOpt; |
485 | tensor->reset(ElemKind::FloatTy, imageSize); |
486 | }; |
487 | std::for_each(inputImageData.begin(), inputImageData.end(), |
488 | resetTensor); |
489 | } |
490 | |
491 | // If this is the first run, then we need to build and compile the model. |
492 | if (isFirstRun) { |
493 | isFirstRun = false; |
494 | |
495 | std::vector<TypeRef> types; |
496 | auto preloadTy = |
497 | Type::newShape(inputImageData[0]->getType(), imageShape); |
498 | |
499 | if (preloadAllImages) { |
500 | types.push_back(&preloadTy); |
501 | } else { |
502 | // get types of all input tensors. |
503 | for_each(inputImageData.begin(), inputImageData.end(), |
504 | [&](auto *t) { types.push_back(&t->getType()); }); |
505 | } |
506 | |
507 | // Build and compile the graph, then get input and output Placeholders. |
508 | std::tie(iPHM, oPHM) = |
509 | buildAndCompileAndGetInAndOutPair(loader, bindings, types); |
510 | |
511 | // If in bundle mode, the bundle has been saved by the above call, so we |
512 | // can safely return. |
513 | if (emittingBundle()) { |
514 | LOG(INFO) << "Emit bundle mode is on. Network is compiled only." ; |
515 | return; |
516 | } |
517 | |
518 | // Obtain input/output placeholders from input/output map. |
519 | // For inputs, we got map but need to convert to array - need to |
520 | // take from map in order specified by modelInputsOpt. |
521 | for (size_t i = 0, e = modelInputsOpt.size(); i < e; i++) { |
522 | auto it = iPHM.find(modelInputsOpt[i]); |
523 | CHECK(it != iPHM.end()) |
524 | << "Couldn't find placeholder: " << modelInputsOpt[i]; |
525 | CHECK((*it).second) << "Placeholder in input map is NULL." ; |
526 | inPHs.push_back((*it).second); |
527 | }; |
528 | for_each(oPHM.begin(), oPHM.end(), [&](auto &p) { |
529 | CHECK(p.second) << "Placeholder in output map is NULL." ; |
530 | outPHs.push_back(p.second); |
531 | }); |
532 | } |
533 | |
534 | // preloadAllImages - set a new Tensor that takes a slice from the 1st |
535 | // (and only) input tensor. Assign this new Tensor the tensor array of |
536 | // pointers, inputImageData, used further. |
537 | Tensor inputImageDataBatch; |
538 | if (preloadAllImages) { |
539 | std::vector<dim_t> imgSliceStart(imageShape.size(), 0); |
540 | imgSliceStart[0] = startMiniBatchIndex; |
541 | inputImageDataBatch = |
542 | inputImageData[0]->getUnowned(imageShape, imgSliceStart); |
543 | inputImageData[0] = &inputImageDataBatch; |
544 | } |
545 | |
546 | // Compile done. |
547 | CHECK(!inPHs.empty()) << "Input must be valid." ; |
548 | CHECK(!outPHs.empty()) << "Output must be valid." ; |
549 | CHECK_EQ(inPHs.size(), inputImageData.size()) |
550 | << "Number of input placeholders and tensors must match" ; |
551 | for (size_t i = 0, e = inputImageData.size(); i < e; i++) { |
552 | CHECK(inPHs[i]->dims() == inputImageData[i]->dims()) |
553 | << "New input shape does not match the compiled function: " |
554 | << inPHs[i]->dims() << " vs " << inputImageData[i]->dims(); |
555 | } |
556 | |
557 | // Convert the raw input to fp16. This must be done every time we get new |
558 | // image data. |
559 | // Convert the raw input to fp16. |
560 | if (convertInAndOutToFp16) { |
561 | for (auto &t : inputImageData) { |
562 | t->convertToType(ElemKind::Float16Ty); |
563 | } |
564 | } |
565 | |
566 | // If we are benchmarking we are done with the while loop. |
567 | if (iterationsOpt) { |
568 | break; |
569 | } |
570 | |
571 | // Minibatch inference initialization of loader extensions |
572 | loader.inferInitMiniBatch(bindings, startMiniBatchIndex, miniBatch); |
573 | |
574 | // About to run inference, so update the input image Placeholder's backing |
575 | // Tensor with inputImageDataBatch. |
576 | updateInputPlaceholders(bindings, inPHs, inputImageData); |
577 | |
578 | // Perform the inference execution, updating output tensors. |
579 | auto batchSize = inputImageData[0]->dims()[0]; |
580 | loader.runInference(exContext.get(), batchSize); |
581 | if (traceContext) { |
582 | traceContext->merge(exContext->getTraceContext()); |
583 | } |
584 | |
585 | // Process output of the network. Each app cand do its own post-processing |
586 | // depending on type of the network. |
587 | { |
588 | std::lock_guard<std::mutex> lock(ioMu); |
589 | numErrors += ppResultExecutor.processOutputs(oPHM, bindings, |
590 | inputImageBatchFilenames); |
591 | } |
592 | |
593 | // Minibatch inference initialization of loader extensions. |
594 | loader.inferEndMiniBatch(bindings, startMiniBatchIndex, miniBatch); |
595 | } |
596 | |
597 | if (iterationsOpt) { |
598 | // Image tensors loaded up to be run at once for benchmark mode. |
599 | UniquePtrVec<ExecutionContext> contexts = |
600 | setupContextPool(outPHs, inPHs[0], *inputImageData[0]); |
601 | |
602 | std::string name = loader.getFunctionName(); |
603 | std::unique_ptr<llvm::Timer> restRunsTimer = nullptr; |
604 | std::unique_ptr<llvm::Timer> firstRunsTimer = nullptr; |
605 | std::unique_ptr<double> bestRunTime = nullptr; |
606 | if (timeOpt) { |
607 | if (excludedFirstWarmupRuns) { |
608 | firstRunsTimer.reset( |
609 | new llvm::Timer("First Runs" , "First inference runs" )); |
610 | restRunsTimer.reset( |
611 | new llvm::Timer("Rest Inferences" , "Rest of the inference runs" )); |
612 | } else { |
613 | restRunsTimer.reset( |
614 | new llvm::Timer("Inferences" , "All inference runs" )); |
615 | } |
616 | bestRunTime.reset(new double); |
617 | *bestRunTime = DBL_MAX; |
618 | } |
619 | unsigned requestCount = miniBatch ? iterationsOpt / miniBatch : 1; |
620 | |
621 | runBenchmark(name, loader, std::move(contexts), requestCount, warmup, |
622 | restRunsTimer.get(), firstRunsTimer.get(), |
623 | bestRunTime.get()); |
624 | if (timeOpt) { |
625 | double wallTime = restRunsTimer->getTotalTime().getWallTime(); |
626 | llvm::outs() << llvm::formatv( |
627 | "Average wall time per item (s): {0:f4}\n" , |
628 | wallTime / (iterationsOpt + warmup - excludedFirstWarmupRuns)); |
629 | llvm::outs() << llvm::formatv( |
630 | " Best wall time (s): {0:f4}\n" , *bestRunTime); |
631 | } |
632 | } |
633 | |
634 | if (profilingGraph()) { |
635 | loader.generateAndSerializeProfilingInfos(bindings); |
636 | } |
637 | |
638 | if (!tracePath.empty()) { |
639 | Error err = loader.getHostManager()->stopDeviceTrace(); |
640 | if (err) { |
641 | LOG(INFO) << "Failed to stop device trace:" ; |
642 | numErrors = 1; |
643 | return; |
644 | } else { |
645 | traceContext->merge(loader.getHostManager()->getTraceContext()); |
646 | } |
647 | } |
648 | }; |
649 | |
650 | // We will force single-threaded execution if: |
651 | // - Minibatch mode and runAllInputsOnAllDevices are disabled; |
652 | // - We are going to emit bundle and do not do inference; |
653 | // - We are collecting inference profile. |
654 | // Otherwise, there can be several minibatches of equal size. |
655 | const bool multiThreadingAllowed = |
656 | (runAllInputsOnAllDevices || miniBatchMode) && !emittingBundle() && |
657 | !profilingGraph(); |
658 | const size_t numBatches = |
659 | miniBatchMode ? inputImageFilenames_[0].size() / miniBatch : 1u; |
660 | const size_t numThreads = |
661 | runAllInputsOnAllDevices |
662 | ? miniBatchThreads |
663 | : (multiThreadingAllowed |
664 | ? std::min(size_t(miniBatchThreads), numBatches) |
665 | : 1u); |
666 | if (miniBatchThreads > 1 && !multiThreadingAllowed) { |
667 | llvm::outs() << "WARNING: multi-threaded execution is not possible. Make " |
668 | "sure that minibatch size is specified and you are not " |
669 | "trying to dump profile or emit bundle.\n" ; |
670 | } |
671 | |
672 | llvm::outs() << "Running " << numThreads << " thread(s).\n" ; |
673 | std::vector<std::thread> threads(numThreads); |
674 | const size_t miniBatchesPerThread = |
675 | (numBatches + numThreads - 1) / numThreads; |
676 | for (size_t i = 0; i < numThreads; i++) { |
677 | size_t startIndex, endIndex; |
678 | if (!runAllInputsOnAllDevices && numThreads > 1) { |
679 | startIndex = i * miniBatchesPerThread * miniBatch; |
680 | endIndex = std::min((i + 1) * miniBatchesPerThread * miniBatch, |
681 | inputImageFilenames_[0].size()); |
682 | } else { |
683 | startIndex = 0; |
684 | endIndex = inputImageFilenames_[0].size(); |
685 | } |
686 | auto worker = [&processImageRange, startIndex, endIndex, i]() { |
687 | processImageRange(startIndex, endIndex, i); |
688 | }; |
689 | threads.push_back(std::thread(worker)); |
690 | } |
691 | |
692 | for (auto &t : threads) { |
693 | if (t.joinable()) { |
694 | t.join(); |
695 | } |
696 | } |
697 | |
698 | if (!tracePath.empty()) { |
699 | traceContext->dump(tracePath, appName_); |
700 | } |
701 | |
702 | return numErrors; |
703 | } |
704 | |