/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "glow/Base/Image.h"
#include "glow/Converter/TypeAToTypeBFunctionConverter.h"
#include "glow/Graph/Nodes.h"
#include "glow/Importer/Caffe2ModelLoader.h"
#include "glow/Importer/ONNXModelLoader.h"
#include "glow/Support/Support.h"

#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/Timer.h"
#include "llvm/Support/raw_ostream.h"

#include <algorithm> // For std::sort in parseInputDir.
#include <atomic>
#include <cfloat>
#include <fstream>
#include <future>
#include <iostream>
#include <memory>
#include <mutex>
#include <queue>
#include <sstream>
#include <thread>

#include "ExecutorCoreHelperFunctions.h"

using namespace glow;

/// Image loader options.
llvm::cl::OptionCategory executorCat("Executor Options");

// The positional image list, -input-image-dir, and -input-image-list-file
// options all ultimately populate this double vector, which stores one list
// of image filenames per model input.
VecVec<std::string> inputImageFilenames_;

llvm::cl::list<std::string> inputImageFilenamesOpt(
    llvm::cl::Positional,
    llvm::cl::desc("<input files> (note: specifying '-' enables streaming "
                   "mode, where the model is compiled once and then can be run "
                   "many times with new input filenames passed via stdin)"),
    llvm::cl::ZeroOrMore);

llvm::cl::list<std::string> inputImageDirs(
    "input-image-dir",
    llvm::cl::desc(
        "Name of directory containing images. Can be used multiple times."),
    llvm::cl::value_desc("dir_name"), llvm::cl::Optional, llvm::cl::ZeroOrMore,
    llvm::cl::cat(executorCat));

std::vector<std::string> inputImageListFileOpt;
static llvm::cl::list<std::string, std::vector<std::string>>
    inputImageListFileF(
        "input-image-list-file",
        llvm::cl::desc("List of files, each containing a list of images (one "
                       "image per line)"),
        llvm::cl::value_desc("string_name"), llvm::cl::ZeroOrMore,
        llvm::cl::CommaSeparated, llvm::cl::cat(executorCat),
        llvm::cl::location(inputImageListFileOpt));

llvm::cl::opt<std::string> inputTensorListFile(
    "input-tensor-list-file",
    llvm::cl::desc(
        "Name of the file containing list of tensors (one tensor per line)"),
    llvm::cl::value_desc("string_name"), llvm::cl::Optional,
    llvm::cl::cat(executorCat));

llvm::cl::opt<unsigned> miniBatch(
    "minibatch",
    llvm::cl::desc(
        "Size of mini-batches. Split the input image list into a set of "
        "mini-batches. The input model is compiled for an input tensor batch "
        "size equal to the specified mini-batch size and mini-batches of "
        "images are inferred separately. The number of input images must be a "
        "multiple of the mini-batch size. By default, mini-batch is set to 1."),
    llvm::cl::Optional, llvm::cl::init(1), llvm::cl::cat(executorCat));
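// A worked example (hypothetical numbers): with -minibatch=2 and 8 input
// images, the model is compiled for a batch size of 2 and inference is run 4
// times, once per mini-batch.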

llvm::cl::opt<unsigned> miniBatchThreads(
    "minibatch-threads",
    llvm::cl::desc(
        "Max number of threads used to process mini-batches. If "
        "minibatch-threads is greater than 1, and we are working in minibatch "
        "mode, then several worker threads are created to process the "
        "minibatches. The minibatches are then distributed between these "
        "threads, and each thread processes its set of minibatches "
        "independently. By default, the number of threads is 1, and no "
        "parallelization happens. These are things to be aware of:\n"
        "\t- The actual number of worker threads can be less than specified "
        "by this option (for example, if the specified number of threads is "
        "greater than the number of minibatches to process). Their number may "
        "also be forced to 1 in some cases (see below);\n"
        "\t- Currently, dumping a profile and emitting a bundle force "
        "single-threaded mode;\n"
        "\t- If a model has operations that perform reductions across images "
        "in the batch, it is the user's responsibility to make sure that this "
        "model is not processed in multi-threaded mode. Otherwise, the "
        "correctness of results is not guaranteed."),
    llvm::cl::Optional, llvm::cl::init(1), llvm::cl::cat(executorCat));

llvm::cl::opt<unsigned> poolSize(
    "pool-size",
    llvm::cl::desc("Size of context pool for the benchmark; default: 10"),
    llvm::cl::Optional, llvm::cl::init(10), llvm::cl::cat(executorCat));

llvm::cl::opt<bool> convertInAndOutToFp16(
    "convert-inout-to-fp16",
    llvm::cl::desc(
        "Convert the input and output tensors of the network to fp16"),
    llvm::cl::cat(executorCat));

llvm::cl::opt<std::string> tracePath("trace-path",
                                     llvm::cl::desc("Write trace logs to disk"),
                                     llvm::cl::init(""),
                                     llvm::cl::cat(executorCat));

llvm::cl::opt<bool>
    autoInstrument("auto-instrument",
                   llvm::cl::desc("Add instrumentation for operator tracing"),
                   llvm::cl::Optional, llvm::cl::init(false),
                   llvm::cl::cat(executorCat));

llvm::cl::opt<unsigned> traceLevel(
    "trace-level",
    llvm::cl::desc(
        "Set tracing level (bit-field, see TraceEvents.h for details)"),
    llvm::cl::Optional, llvm::cl::init((unsigned)TraceLevel::NONE),
    llvm::cl::cat(executorCat));

llvm::cl::opt<unsigned> warmup(
    "warmup", llvm::cl::desc("How many passes to do to warm everything up"),
    llvm::cl::init(0), llvm::cl::value_desc("W"), llvm::cl::cat(executorCat));

llvm::cl::opt<unsigned> excludedFirstWarmupRuns(
    "excluded-first-warmup-runs",
    llvm::cl::desc("Exclude the time of the given number of first warmup runs "
                   "from the total time"),
    llvm::cl::Optional, llvm::cl::init(0), llvm::cl::cat(executorCat));

llvm::cl::opt<bool>
    preloadAllImages("preload-all-images",
                     llvm::cl::desc("Pre-load all images before inference"),
                     llvm::cl::init(false), llvm::cl::cat(executorCat));

llvm::cl::opt<std::string> modelOutputName(
    "output-name",
    llvm::cl::desc("The name of the variable for the model's output."),
    llvm::cl::value_desc("string_name"), llvm::cl::Optional,
    llvm::cl::cat(executorCat));

llvm::cl::opt<unsigned> repeatSingleBatchCount(
    "repeat-single-batch-count",
    llvm::cl::desc(
        "Repeat a single batch input n times. Used for testing purposes. If "
        "used without minibatch then the whole input set is used as the batch "
        "size and repeated n times. Otherwise the first minibatch is repeated "
        "and all other inputs are ignored."),
    llvm::cl::init(0), llvm::cl::cat(executorCat));

/// Read all images from \p inputImageDir into \p imageFilenames.
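/// Example usage (a sketch; the path is hypothetical):
/// \code
///   std::vector<std::string> files;
///   parseInputDir("images/", files);
///   // `files` now holds every regular file under images/, sorted by name.
/// \endcode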
void parseInputDir(const std::string &inputImageDir,
                   std::vector<std::string> &imageFilenames) {
  CHECK(llvm::sys::fs::is_directory(inputImageDir))
      << strFormat("Path '%s' is not a directory!", inputImageDir.data());
  std::error_code code;
  llvm::sys::fs::directory_iterator dirIt(inputImageDir, code);
  std::vector<std::string> imageFiles;
  while (!code && dirIt != llvm::sys::fs::directory_iterator()) {
    auto path = dirIt->path();
    if (llvm::sys::fs::is_regular_file(path)) {
      imageFiles.emplace_back(path);
    }
    dirIt.increment(code);
  }
  // The paths retrieved by the directory iterator are not sorted.
  // Sort the paths alphabetically in increasing order and add them
  // to the overall list of image filenames.
  std::sort(imageFiles.begin(), imageFiles.end());
  for (auto &imageFile : imageFiles) {
    imageFilenames.push_back(imageFile);
  }
}

/// Clear external storage for cmd args defined in Loader.
void initExecutorCoreCmdArgVars() {
  inputImageFilenames_.clear();
  inputImageListFileOpt.clear();
  modelInputsOpt.clear();
}

/// Do any special processing for cmd args defined in ExecutorCore.
void processExecutorCoreCmdArgVars() {}

/// Read all images from \p inputListFile into \p imageFilenames.
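/// The list file is expected to hold one image path per line, e.g.
/// (hypothetical contents):
/// \code
///   images/cat.png
///   images/dog.png
/// \endcode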
void parseInputList(const std::string &inputListFile,
                    std::vector<std::string> &imageFilenames) {

  std::ifstream inFile;
  inFile.open(inputListFile);
  if (!inFile.good()) {
    llvm::outs() << "Could not open input-image-list-file: " << inputListFile
                 << ", exiting.\n";
    std::exit(1);
  }

  // Use getline() as the loop condition: looping on !eof() relies on the
  // empty-string check to discard the failed final read and can spin forever
  // if the stream enters an error state.
  std::string img;
  while (std::getline(inFile, img)) {
    if (!img.empty()) {
      imageFilenames.push_back(img);
    }
  }
  inFile.close();
}

/// Write a prompt to stdout asking for filenames for classification. Read in
/// those filenames and add them to \p filenamesVec, which is cleared before
/// the new set of filenames from stdin is added. \returns false if the line
/// read in was empty.
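/// A streaming-mode driver sketch (hypothetical caller):
/// \code
///   VecVec<std::string> names;
///   while (getNextStdinImageFilenames(names)) {
///     // ... load the images in `names` and run inference ...
///   }
/// \endcode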
bool getNextStdinImageFilenames(VecVec<std::string> &filenamesVec) {
  std::vector<std::string> filenames;
  // Clear out old filenames before adding new ones.
  filenamesVec.clear();

  llvm::outs() << "Enter image filenames to classify: ";

  // Add in each filename to the vector.
  std::string filenamesRaw;
  getline(std::cin, filenamesRaw);
  std::istringstream iss(filenamesRaw);
  std::string filename;
  while (iss >> filename) {
    filenames.push_back(filename);
  }
  if (!filenames.empty()) {
    filenamesVec.push_back(filenames);
  }
  return !filenames.empty();
}

/// Generate in \p imageLists the list of lists (one per input) of filenames
/// corresponding to the next mini-batch of size \p miniBatchSize extracted
/// from \p totalImageLists at index \p miniBatchIndex. \returns true if the
/// index is valid, false otherwise. When the function returns true,
/// \p miniBatchIndex is incremented by \p miniBatchSize. Stops upon reaching
/// \p miniBatchLimit.
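/// Typical iteration pattern (a sketch; assumes the per-input image count is
/// the intended limit):
/// \code
///   size_t idx = 0;
///   VecVec<std::string> batch;
///   while (getNextMiniBatch(batch, inputImageFilenames_, idx, miniBatch,
///                           inputImageFilenames_[0].size())) {
///     // ... load and run inference on `batch` ...
///   }
/// \endcode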
bool getNextMiniBatch(VecVec<std::string> &imageLists,
                      VecVecRef<std::string> totalImageLists,
                      size_t &miniBatchIndex, size_t miniBatchSize,
                      size_t miniBatchLimit) {
  if (miniBatchIndex >= miniBatchLimit) {
    return false;
  }

  imageLists.clear();
  for (const auto &totalImageList : totalImageLists) {
    size_t endIndex = miniBatchIndex + miniBatchSize;
    std::vector<std::string> imageList;
    for (size_t index = miniBatchIndex; index < endIndex; index++) {
      imageList.push_back(totalImageList[index]);
    }
    imageLists.push_back(imageList);
  }
  miniBatchIndex += miniBatchSize;
  return true;
}

/// \returns the single output Placeholder from \p PHM, or the Placeholder
/// selected via the -output-name option. \returns nullptr (with a one-time
/// warning) when multiple outputs exist and none was selected, or when the
/// selected name is not found.
Placeholder *
getOutputForPostProcessing(const llvm::StringMap<Placeholder *> &PHM) {
  if (PHM.size() == 1) {
    return PHM.begin()->second;
  }
  if (modelOutputName.empty()) {
    static bool warningPrinted = false;
    if (!warningPrinted) {
      warningPrinted = true;
      llvm::outs()
          << "WARNING: Multiple outputs found and none is selected. "
             "Any postprocessing will be DISABLED!\n"
             "Use '-output-name' to select output for postprocessing\n";
    }
    return nullptr;
  }
  auto ph = PHM.find(modelOutputName);
  if (ph == PHM.end()) {
    static bool warningPrinted = false;
    if (!warningPrinted) {
      warningPrinted = true;
      llvm::outs() << "WARNING: Name specified not found in outputs. "
                      "Any postprocessing will be DISABLED: "
                   << modelOutputName << "\n";
    }
    return nullptr;
  }
  return ph->second;
}

/// Given \p loader, the \p bindings, and \p inputImageType, build the graph
/// from the protobuf file found via \p loader, then compile it. \returns a
/// pair of maps from names to the model's input and output Placeholders.
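/// Example call (a sketch; assumes a configured `loader` and `bindings`, and
/// an `inputType` describing the input image tensor):
/// \code
///   auto ioPair =
///       buildAndCompileAndGetInAndOutPair(loader, bindings, {inputType});
///   Placeholder *outPH = getOutputForPostProcessing(ioPair.second);
/// \endcode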
std::pair<llvm::StringMap<Placeholder *>, llvm::StringMap<Placeholder *>>
buildAndCompileAndGetInAndOutPair(Loader &loader, PlaceholderBindings &bindings,
                                  llvm::ArrayRef<TypeRef> inputImageType) {
  // Load model.
  loader.loadModel(&bindings, inputImageType);

  // Allocate tensors to back all inputs and outputs.
  bindings.allocate(loader.getModule()->getPlaceholders());

  // Convert the placeholders for now. The backing Tensor's data will be
  // converted later.
  if (convertInAndOutToFp16) {
    PrecisionConfiguration precConfig;
    TypeAToTypeBFunctionConverter converter(*loader.getFunction(),
                                            ElemKind::FloatTy,
                                            ElemKind::Float16Ty, precConfig);
    for (auto *placeholder : loader.getModule()->getPlaceholders()) {
      converter.convertPlaceholder(*placeholder, &bindings);
    }
  }

  // Compile the model, and perform quantization/emit a bundle/dump debug info
  // if requested from command line.
  CompilationContext cctx = loader.getCompilationContext();
  cctx.bindings = &bindings;
  cctx.backendOpts.autoInstrument = autoInstrument;
  loader.compile(cctx);

  // Get input/output placeholder maps.
  llvm::StringMap<Placeholder *> inpMap = loader.getInputPlaceholderMap();
  llvm::StringMap<Placeholder *> outMap = loader.getOutputPlaceholderMap();
  return std::make_pair(inpMap, outMap);
}

/// Set up the pool of contexts needed for a benchmark run.
UniquePtrVec<ExecutionContext>
setupContextPool(const std::vector<Placeholder *> outputPHV,
                 Placeholder *inputImagePH, Tensor &inputImageData) {
  UniquePtrVec<ExecutionContext> contexts;
  // Size of the pool: the smaller of poolSize and the actual number of
  // requests.
  unsigned iterations =
      miniBatch ? std::min(int(poolSize), int(iterationsOpt / miniBatch)) : 1;
  // Set up the pool of inference requests to be run.
  for (unsigned i = 0; i < iterations; i++) {
    auto newContext = glow::make_unique<ExecutionContext>();
    newContext->setTraceContext(glow::make_unique<TraceContext>(traceLevel));
    auto ph = newContext->getPlaceholderBindings();
    ph->insert(inputImagePH, Tensor(inputImageData.getType()));
    for (auto *outputPH : outputPHV) {
      ph->allocate(outputPH);
    }
    contexts.push_back(std::move(newContext));
  }
  return contexts;
}

std::mutex eventLock;
std::unique_ptr<TraceContext> traceContext;

/// Run an inference request on the HostManager. This method builds a
/// runNetwork request for the \p hostManager. The call is effectively
/// recursive: in the callback provided to the HostManager, this function
/// calls itself again if the desired number of warmups and requests has not
/// yet been dispatched.
static void runInference(runtime::HostManager *hostManager, std::string name,
                         std::unique_ptr<ExecutionContext> batch,
                         std::promise<void> &runPromise,
                         std::atomic<unsigned> &inflight,
                         std::atomic<int> &dispatched, unsigned warmUp,
                         llvm::Timer *restRunsTimer = nullptr,
                         llvm::Timer *firstRunsTimer = nullptr,
                         double *bestRunTime = nullptr) {
  static std::atomic<unsigned> firstRunsDone(0);
  auto start = TraceEvent::now();
  if (firstRunsTimer != nullptr && !firstRunsTimer->isRunning() &&
      firstRunsDone < excludedFirstWarmupRuns) {
    firstRunsTimer->startTimer();
  } else if (restRunsTimer != nullptr &&
             firstRunsDone >= excludedFirstWarmupRuns &&
             !restRunsTimer->hasTriggered()) {
    restRunsTimer->startTimer();
  }

  llvm::Timer *bestRunTimer = nullptr;
  if (bestRunTime != nullptr) {
    bestRunTimer = new llvm::Timer("Best Run", "Best Inference Run");
    bestRunTimer->startTimer();
  }

  hostManager->runNetwork(
      name, std::move(batch),
      [&runPromise, &inflight, &dispatched, hostManager, name, warmUp,
       restRunsTimer, firstRunsTimer, bestRunTime, bestRunTimer,
       start](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
        EXIT_ON_ERR(std::move(err));
        if (!tracePath.empty()) {
          if (!warmUp) {
            std::lock_guard<std::mutex> l(eventLock);
            // Temporary (AIBench relies on inference_e2e metric).
            // Later we switch AIBench to the metric from
            // HostManager::dispatchNextRun().
            traceContext->logCompleteTraceEvent("inference_e2e",
                                                TraceLevel::RUNTIME, start);
            // Merge this run's TraceEvents into the global
            // TraceContext.
            traceContext->merge(contextPtr->getTraceContext());
          } else {
            contextPtr->getTraceContext()->getTraceEvents().clear();
          }
        }
        firstRunsDone++;
        if (firstRunsTimer != nullptr && firstRunsTimer->isRunning() &&
            firstRunsDone == excludedFirstWarmupRuns) {
          firstRunsTimer->stopTimer();
        }
        if (bestRunTime != nullptr) {
          bestRunTimer->stopTimer();
          double wallTime = bestRunTimer->getTotalTime().getWallTime();
          if (wallTime < *bestRunTime) {
            *bestRunTime = wallTime;
          }
          bestRunTimer->clear();
          delete bestRunTimer;
        }

        // Kick off another run.
        if (dispatched.fetch_sub(1) > 0) {
          inflight++;
          runInference(hostManager, name, std::move(contextPtr), runPromise,
                       inflight, dispatched, warmUp > 0 ? warmUp - 1 : 0,
                       restRunsTimer, firstRunsTimer, bestRunTime);
        } else if (restRunsTimer != nullptr) {
          restRunsTimer->stopTimer();
        }

        if (--inflight == 0) {
          runPromise.set_value();
        }
      });
}

/// Run the requested number of benchmark requests \p requestCount, prepended
/// by \p warmUp cycles, through the HostManager from the \p loader using the
/// provided context pool \p contexts, and wait for all runs to complete.
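/// A benchmark driver sketch (assumes `loader`, `inputPH`, `inputData`, and
/// `outPHs` were prepared by the caller):
/// \code
///   auto pool = setupContextPool(outPHs, inputPH, inputData);
///   runBenchmark("network", loader, std::move(pool), iterationsOpt, warmup);
/// \endcode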
void runBenchmark(std::string name, Loader &loader,
                  std::vector<std::unique_ptr<ExecutionContext>> contexts,
                  unsigned requestCount, unsigned warmUp,
                  llvm::Timer *restRunsTimer = nullptr,
                  llvm::Timer *firstRunsTimer = nullptr,
                  double *bestRunTime = nullptr) {
  runtime::HostManager *hostManager = loader.getHostManager();
  std::atomic<unsigned> inflight(0);
  std::atomic<int> dispatched(requestCount + warmUp * contexts.size());
  std::promise<void> runPromise;
  auto fut = runPromise.get_future();

  // Kick off initial pool of requests.
  for (size_t i = 0, e = contexts.size(); i < e; i++) {
    auto batch = std::move(contexts[i]);
    inflight++;
    dispatched--;
    runInference(hostManager, name, std::move(batch), runPromise, inflight,
                 dispatched, warmUp, restRunsTimer, firstRunsTimer,
                 bestRunTime);
  }

  // Wait for all to finish.
  fut.wait();
}