1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | |
17 | #include "glow/Base/Image.h" |
18 | #include "glow/Converter/TypeAToTypeBFunctionConverter.h" |
19 | #include "glow/Graph/Nodes.h" |
20 | #include "glow/Importer/Caffe2ModelLoader.h" |
21 | #include "glow/Importer/ONNXModelLoader.h" |
22 | #include "glow/Support/Support.h" |
23 | |
24 | #include "llvm/ADT/StringSwitch.h" |
25 | #include "llvm/Support/CommandLine.h" |
26 | #include "llvm/Support/FileSystem.h" |
27 | #include "llvm/Support/Format.h" |
28 | #include "llvm/Support/Timer.h" |
29 | #include "llvm/Support/raw_ostream.h" |
30 | |
31 | #include <atomic> |
32 | #include <cfloat> |
33 | #include <fstream> |
34 | #include <future> |
35 | #include <iostream> |
36 | #include <memory> |
37 | #include <mutex> |
38 | #include <queue> |
39 | #include <sstream> |
40 | #include <thread> |
41 | |
42 | #include "ExecutorCoreHelperFunctions.h" |
43 | |
using namespace glow;

/// Image loader options.
llvm::cl::OptionCategory executorCat("Executor Options");

// Either positional or list file or input-image-list-file will ultimately
// create a double vector containing an image list per input, stored here.
VecVec<std::string> inputImageFilenames_;

/// Positional image filenames; a single '-' enables streaming mode where new
/// filename lists are read from stdin after a one-time compile.
llvm::cl::list<std::string> inputImageFilenamesOpt(
    llvm::cl::Positional,
    llvm::cl::desc("<input files> (note: specifying '-' enables streaming "
                   "mode, where the model is compiled once and then can be run "
                   "many times with new input filenames passed via stdin)"),
    llvm::cl::ZeroOrMore);

/// Directories whose regular files are all treated as input images
/// (repeatable; see parseInputDir below).
llvm::cl::list<std::string> inputImageDirs(
    "input-image-dir",
    llvm::cl::desc(
        "Name of directory containing images. Can be used multiple times."),
    llvm::cl::value_desc("dir_name"), llvm::cl::Optional, llvm::cl::ZeroOrMore,
    llvm::cl::cat(executorCat));

/// External storage backing the -input-image-list-file option so other
/// translation units can read (and initExecutorCoreCmdArgVars can reset) it.
std::vector<std::string> inputImageListFileOpt;
static llvm::cl::list<std::string, std::vector<std::string>>
    inputImageListFileF(
        "input-image-list-file",
        llvm::cl::desc(
            "List of files containing list of images (one image per line)"),
        llvm::cl::value_desc("string_name"), llvm::cl::ZeroOrMore,
        llvm::cl::CommaSeparated, llvm::cl::cat(executorCat),
        llvm::cl::location(inputImageListFileOpt));

/// File listing raw input tensors, one per line.
llvm::cl::opt<std::string> inputTensorListFile(
    "input-tensor-list-file",
    llvm::cl::desc(
        "Name of the file containing list of tensors (one tensor per line)"),
    llvm::cl::value_desc("string_name"), llvm::cl::Optional,
    llvm::cl::cat(executorCat));

/// Mini-batch size; the model is compiled for this batch size and the input
/// image list must be a multiple of it.
llvm::cl::opt<unsigned> miniBatch(
    "minibatch",
    llvm::cl::desc(
        "Size of mini-batches. Split the input image list into a set of "
        "mini-batches. The input model is compiled for an input tensor batch "
        "size equal to the specified mini-batch size and mini-batches of "
        "images are inferred separately. The number of input images must be a "
        "multiple of the mini-batch size. By default, mini-batch is set to 1."),
    llvm::cl::Optional, llvm::cl::init(1), llvm::cl::cat(executorCat));

/// Upper bound on worker threads used to process mini-batches in parallel.
llvm::cl::opt<unsigned> miniBatchThreads(
    "minibatch-threads",
    llvm::cl::desc(
        "Max number of threads used to process mini-batches. If "
        "minibatch-threads is greater than 1, and we are working in minibatch "
        "mode, then several worker threads are created to process the "
        "minibatches. Then the minibatches are distributed between these "
        "threads, and each thread processes its set of minibatches "
        "independently."
        " By default, the number of threads is 1, and no parallelization is "
        "happening. These are things to be aware of:\n"
        "\t- The actual number of worker threads can be less than specified by "
        "this option (for example, if specified number of threads is greater "
        "than number of minibatches to process). Their number may also be "
        "forced to 1 in some cases (see below);\n"
        "\t- Currently, dumping profile and emitting bundle force "
        "single-threaded mode;\n"
        "\t- If a model has operations that make reduction across images in "
        "the batch, it is a user's responsibility to make sure that this model "
        "is not processed in multi-threaded mode. Otherwise, the correctness "
        "of results is not guaranteed."),
    llvm::cl::Optional, llvm::cl::init(1), llvm::cl::cat(executorCat));

/// Number of pre-allocated ExecutionContexts used by setupContextPool.
llvm::cl::opt<unsigned> poolSize(
    "pool-size",
    llvm::cl::desc("Size of context pool for the benchmark; default:10"),
    llvm::cl::Optional, llvm::cl::init(10), llvm::cl::cat(executorCat));

/// When set, the network's input/output placeholders are converted to fp16
/// (see buildAndCompileAndGetInAndOutPair).
llvm::cl::opt<bool> convertInAndOutToFp16(
    "convert-inout-to-fp16",
    llvm::cl::desc(
        "Convert the input and output tensors of the network to fp16"),
    llvm::cl::cat(executorCat));

/// Destination for trace logs; tracing is active iff this is non-empty.
llvm::cl::opt<std::string> tracePath("trace-path",
                                     llvm::cl::desc("Write trace logs to disk"),
                                     llvm::cl::init(""),
                                     llvm::cl::cat(executorCat));

/// Adds per-operator instrumentation at compile time (see cctx.backendOpts).
llvm::cl::opt<bool>
    autoInstrument("auto-instrument",
                   llvm::cl::desc("Add instrumentation for operator tracing"),
                   llvm::cl::Optional, llvm::cl::init(false),
                   llvm::cl::cat(executorCat));

/// Bit-field trace verbosity, forwarded to each context's TraceContext.
llvm::cl::opt<unsigned> traceLevel(
    "trace-level",
    llvm::cl::desc(
        "Set tracing level (bit-field, see TraceEvents.h for details)"),
    llvm::cl::Optional, llvm::cl::init((unsigned)TraceLevel::NONE),
    llvm::cl::cat(executorCat));

/// Number of warm-up passes run before timed benchmark passes.
llvm::cl::opt<unsigned> warmup(
    "warmup", llvm::cl::desc("How many passes to do to warm everything up"),
    llvm::cl::init(0), llvm::cl::value_desc("W"), llvm::cl::cat(executorCat));

/// How many initial warm-up runs are excluded from the reported total time
/// (consumed by the timers in runInference).
llvm::cl::opt<unsigned> excludedFirstWarmupRuns(
    "excluded-first-warmup-runs",
    llvm::cl::desc("Exclude the time of the given number of first warmup runs "
                   "from the total time"),
    llvm::cl::Optional, llvm::cl::init(0), llvm::cl::cat(executorCat));

/// Load every input image into memory before inference starts.
llvm::cl::opt<bool>
    preloadAllImages("preload-all-images",
                     llvm::cl::desc("Pre-load all images before inference"),
                     llvm::cl::init(false), llvm::cl::cat(executorCat));

/// Name of the output placeholder to postprocess when the model has several
/// outputs (see getOutputForPostProcessing).
llvm::cl::opt<std::string> modelOutputName(
    "output-name",
    llvm::cl::desc("The name of the variable for the model's output."),
    llvm::cl::value_desc("string_name"), llvm::cl::Optional,
    llvm::cl::cat(executorCat));

/// Testing aid: repeat one batch n times instead of consuming the input set.
llvm::cl::opt<unsigned> repeatSingleBatchCount(
    "repeat-single-batch-count",
    llvm::cl::desc(
        "Repeat a single batch input n times. Used for testing purposes. If "
        "used without minibatch then the whole input set is used as the batch "
        "size and repeated n times. Otherwise the first minibatch is repeated "
        "and all other inputs are ignored."),
    llvm::cl::init(0), llvm::cl::cat(executorCat));
175 | |
176 | /// Read all images from \p inputImageDir into \p imageFilenames. |
177 | void parseInputDir(const std::string &inputImageDir, |
178 | std::vector<std::string> &imageFilenames) { |
179 | CHECK(llvm::sys::fs::is_directory(inputImageDir)) |
180 | << strFormat("Path '%s' is not a directory!" , inputImageDir.data()); |
181 | std::error_code code; |
182 | llvm::sys::fs::directory_iterator dirIt(inputImageDir, code); |
183 | std::vector<std::string> imageFiles; |
184 | while (!code && dirIt != llvm::sys::fs::directory_iterator()) { |
185 | auto path = dirIt->path(); |
186 | if (llvm::sys::fs::is_regular_file(path)) { |
187 | imageFiles.emplace_back(path); |
188 | } |
189 | dirIt.increment(code); |
190 | } |
191 | // The paths retrieved by the directory iterator are not sorted. |
192 | // Sort the paths alphabetically in increasing order and add them |
193 | // to the overall list of image filenames. |
194 | std::sort(imageFiles.begin(), imageFiles.end()); |
195 | for (auto &imageFile : imageFiles) { |
196 | imageFilenames.push_back(imageFile); |
197 | } |
198 | } |
199 | |
200 | /// Clear external storage for cmd args defined in Loader. |
201 | void initExecutorCoreCmdArgVars() { |
202 | inputImageFilenames_.clear(); |
203 | inputImageListFileOpt.clear(); |
204 | modelInputsOpt.clear(); |
205 | } |
206 | |
/// Do any special processing for cmd args defined in ExecutorCore.
/// Intentionally empty: no post-parse fixups are currently needed here —
/// presumably kept as a hook so callers have a uniform init sequence
/// (TODO confirm against the callers in ExecutorCore).
void processExecutorCoreCmdArgVars() {}
209 | |
210 | /// Read all images from \p inputListFile in to \p imageFilenames. |
211 | void parseInputList(const std::string &inputListFile, |
212 | std::vector<std::string> &imageFilenames) { |
213 | |
214 | std::ifstream inFile; |
215 | inFile.open(inputListFile); |
216 | if (!inFile.good()) { |
217 | llvm::outs() << "Could not open input-image-list-file: " << inputListFile |
218 | << ", exiting.\n" ; |
219 | std::exit(1); |
220 | } |
221 | |
222 | while (!inFile.eof()) { |
223 | std::string img; |
224 | getline(inFile, img); |
225 | if (!img.empty()) { |
226 | imageFilenames.push_back(img); |
227 | } |
228 | } |
229 | inFile.close(); |
230 | } |
231 | |
232 | /// Write a prompt to stdout asking for filenames for classification. Read in |
233 | /// those filenames and add them to \p filenames. \p filenames is cleared before |
234 | /// adding the new set of filenames from stdin. \returns false if the passed in |
235 | /// line was empty. |
236 | bool getNextStdinImageFilenames(VecVec<std::string> &filenamesVec) { |
237 | std::vector<std::string> filenames; |
238 | // Clear out old filenames before adding new ones. |
239 | filenamesVec.clear(); |
240 | |
241 | llvm::outs() << "Enter image filenames to classify: " ; |
242 | |
243 | // Add in each filename to the vector. |
244 | std::string filenamesRaw; |
245 | getline(std::cin, filenamesRaw); |
246 | std::istringstream iss(filenamesRaw); |
247 | std::string filename; |
248 | while (iss >> filename) { |
249 | filenames.push_back(filename); |
250 | } |
251 | if (!filenames.empty()) { |
252 | filenamesVec.push_back(filenames); |
253 | } |
254 | return !filenames.empty(); |
255 | } |
256 | |
257 | /// Generate in \p imageLists the list of lists (for each input) of filenames |
258 | /// corresponding to the next mini-batch of size \p miniBatchSize extracted from |
259 | /// \p totalImageLists at index \p minibatchIndex. /returns true if the index is |
260 | /// valid, false otherwise. In case the function returns true, \p minibatchIndex |
261 | /// is incremented by \p miniBatchSize. Stop upon reaching \p miniBatchLimit. |
262 | bool getNextMiniBatch(VecVec<std::string> &imageLists, |
263 | VecVecRef<std::string> totalImageLists, |
264 | size_t &miniBatchIndex, size_t miniBatchSize, |
265 | size_t miniBatchLimit) { |
266 | if (miniBatchIndex >= miniBatchLimit) { |
267 | return false; |
268 | } |
269 | |
270 | imageLists.clear(); |
271 | for (const auto &totalImageList : totalImageLists) { |
272 | size_t batchIdx = miniBatchIndex; |
273 | size_t batchSize = miniBatchSize; |
274 | size_t endIndex = batchIdx + batchSize; |
275 | std::vector<std::string> imageList; |
276 | for (size_t index = batchIdx; index < endIndex; index++) { |
277 | imageList.push_back(totalImageList[index]); |
278 | } |
279 | imageLists.push_back(imageList); |
280 | } |
281 | miniBatchIndex += miniBatchSize; |
282 | return true; |
283 | } |
284 | |
285 | Placeholder * |
286 | getOutputForPostProcessing(const llvm::StringMap<Placeholder *> &PHM) { |
287 | if (PHM.size() == 1) { |
288 | return PHM.begin()->second; |
289 | } |
290 | if (modelOutputName.empty()) { |
291 | static bool warningPrinted = false; |
292 | if (!warningPrinted) { |
293 | warningPrinted = true; |
294 | llvm::outs() |
295 | << "WARNING: Multiple outputs found and none is selected. " |
296 | "Any postprocessing will be DISABLED!\n" |
297 | "Use '-output-name' to select output for postprocessing\n" ; |
298 | } |
299 | return nullptr; |
300 | } |
301 | auto ph = PHM.find(modelOutputName); |
302 | if (ph == PHM.end()) { |
303 | static bool warning_printed = false; |
304 | if (!warning_printed) { |
305 | warning_printed = true; |
306 | llvm::outs() << "WARNING: Name specified not found in outputs. " |
307 | "Any postprocessing will be DISABLED: " |
308 | << modelOutputName << "\n" ; |
309 | } |
310 | return nullptr; |
311 | } |
312 | return ph->second; |
313 | } |
314 | |
315 | /// Given \p loader, the \p bindings, and \p inputImageType, build the graph |
316 | /// from the provided protobuf file found via \p loader. Then compiles and |
317 | /// \returns a pair of pointers to the input Placeholder and output Nodes Map. |
318 | std::pair<llvm::StringMap<Placeholder *>, llvm::StringMap<Placeholder *>> |
319 | buildAndCompileAndGetInAndOutPair(Loader &loader, PlaceholderBindings &bindings, |
320 | llvm::ArrayRef<TypeRef> inputImageType) { |
321 | // Load model. |
322 | loader.loadModel(&bindings, inputImageType); |
323 | |
324 | // Allocate tensors to back all inputs and outputs. |
325 | bindings.allocate(loader.getModule()->getPlaceholders()); |
326 | |
327 | // Convert the placeholders for now. The backing Tensor's data will be |
328 | // converted later. |
329 | if (convertInAndOutToFp16) { |
330 | PrecisionConfiguration precConfig; |
331 | TypeAToTypeBFunctionConverter converter(*loader.getFunction(), |
332 | ElemKind::FloatTy, |
333 | ElemKind::Float16Ty, precConfig); |
334 | for (auto *placeholder : loader.getModule()->getPlaceholders()) { |
335 | converter.convertPlaceholder(*placeholder, &bindings); |
336 | } |
337 | } |
338 | |
339 | // Compile the model, and perform quantization/emit a bundle/dump debug info |
340 | // if requested from command line. |
341 | CompilationContext cctx = loader.getCompilationContext(); |
342 | cctx.bindings = &bindings; |
343 | cctx.backendOpts.autoInstrument = autoInstrument; |
344 | loader.compile(cctx); |
345 | |
346 | // Get input/output placeholder maps. |
347 | llvm::StringMap<Placeholder *> inpMap = loader.getInputPlaceholderMap(); |
348 | llvm::StringMap<Placeholder *> outMap = loader.getOutputPlaceholderMap(); |
349 | return std::make_pair(inpMap, outMap); |
350 | } |
351 | |
/// Setup the pool of contexts needed for a benchmark run.
/// \p outputPHV lists the output placeholders each context must allocate;
/// \p inputImagePH and \p inputImageData give the input placeholder and a
/// tensor whose type sizes each context's private input tensor.
/// NOTE(review): \p outputPHV is taken by value, copying the vector on every
/// call; a const reference would avoid that — confirm against the header
/// declaration before changing the signature.
UniquePtrVec<ExecutionContext>
setupContextPool(const std::vector<Placeholder *> outputPHV,
                 Placeholder *inputImagePH, Tensor &inputImageData) {
  UniquePtrVec<ExecutionContext> contexts;
  // Size of the pool, the smaller of poolSize or the actual number of
  // requests. (If miniBatch were 0, a single context is created.)
  unsigned iterations =
      miniBatch ? std::min(int(poolSize), int(iterationsOpt / miniBatch)) : 1;
  // Setup pool of inference requests to be run.
  for (unsigned i = 0; i < iterations; i++) {
    auto newContext = glow::make_unique<ExecutionContext>();
    newContext->setTraceContext(glow::make_unique<TraceContext>(traceLevel));
    auto ph = newContext->getPlaceholderBindings();
    // Each context owns an input tensor of the same type as the sample data.
    ph->insert(inputImagePH, Tensor(inputImageData.getType()));
    for (auto *outputPH : outputPHV) {
      ph->allocate(outputPH);
    }
    contexts.push_back(std::move(newContext));
  }
  return contexts;
}
374 | |
/// Serializes merges of per-run trace events into the global \p traceContext.
std::mutex eventLock;
/// Global accumulator for trace events across all runs (consumed elsewhere).
std::unique_ptr<TraceContext> traceContext;

/// Run inference request on HostManager. This method builds a runNetwork
/// request for the \p hostManager, this is a recursive call, in the callback
/// provided to the HostManager this function can call itself if the desired
/// number of warmups and requests has not yet been dispatched.
/// \p inflight counts requests currently in the pipeline; the last one to
/// finish fulfills \p runPromise. \p dispatched counts requests still to be
/// issued. \p firstRunsTimer / \p restRunsTimer split timing around the first
/// `excludedFirstWarmupRuns` runs; \p bestRunTime, if non-null, receives the
/// smallest single-run wall time observed.
static void runInference(runtime::HostManager *hostManager, std::string name,
                         std::unique_ptr<ExecutionContext> batch,
                         std::promise<void> &runPromise,
                         std::atomic<unsigned> &inflight,
                         std::atomic<int> &dispatched, unsigned warmUp,
                         llvm::Timer *restRunsTimer = nullptr,
                         llvm::Timer *firstRunsTimer = nullptr,
                         double *bestRunTime = nullptr) {
  // Counts completed runs across ALL threads/requests (function-local static).
  static std::atomic<unsigned> firstRunsDone(0);
  auto start = TraceEvent::now();
  // Start the "first runs" timer while the excluded warmup runs are still in
  // progress; afterwards start the "rest" timer exactly once (hasTriggered).
  if (firstRunsTimer != nullptr && !firstRunsTimer->isRunning() &&
      firstRunsDone < excludedFirstWarmupRuns) {
    firstRunsTimer->startTimer();
  } else if (restRunsTimer != nullptr &&
             firstRunsDone >= excludedFirstWarmupRuns &&
             !restRunsTimer->hasTriggered()) {
    restRunsTimer->startTimer();
  }

  // Per-request timer used to track the best (fastest) single run.
  // NOTE(review): raw new/delete — the timer leaks if the callback below is
  // never invoked; consider RAII if HostManager can drop requests.
  llvm::Timer *bestRunTimer = nullptr;
  if (bestRunTime != nullptr) {
    bestRunTimer = new llvm::Timer("Best Run", "Best Inference Run");
    bestRunTimer->startTimer();
  }

  hostManager->runNetwork(
      name, std::move(batch),
      // The callback owns re-dispatching: it may call runInference again.
      [&runPromise, &inflight, &dispatched, hostManager, name, warmUp,
       restRunsTimer, firstRunsTimer, bestRunTime, bestRunTimer,
       start](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
        EXIT_ON_ERR(std::move(err));
        if (!tracePath.empty()) {
          if (!warmUp) {
            // Guard the shared global traceContext.
            std::lock_guard<std::mutex> l(eventLock);
            // Temporary (AIBench relies on inference_e2e metric)
            // Later we switch AIBench to the metric from
            // HostManager::dispatchNextRun()
            traceContext->logCompleteTraceEvent("inference_e2e",
                                                TraceLevel::RUNTIME, start);
            // Merge this run's TraceEvents into the global
            // TraceContext.
            traceContext->merge(contextPtr->getTraceContext());
          } else {
            // Warmup runs: discard their trace events entirely.
            contextPtr->getTraceContext()->getTraceEvents().clear();
          }
        }
        firstRunsDone++;
        // Stop the "first runs" timer when the last excluded warmup finishes.
        if (firstRunsTimer != nullptr && firstRunsTimer->isRunning() &&
            firstRunsDone == excludedFirstWarmupRuns) {
          firstRunsTimer->stopTimer();
        }
        // Record this run's wall time and keep the minimum seen so far.
        if (bestRunTime != nullptr) {
          bestRunTimer->stopTimer();
          double wallTime = bestRunTimer->getTotalTime().getWallTime();
          if (wallTime < *bestRunTime)
            *bestRunTime = wallTime;
          bestRunTimer->clear();
          delete bestRunTimer;
        }

        // Kick off another run.
        if (dispatched.fetch_sub(1) > 0) {
          inflight++;
          // Recurse with one fewer warmup credit (clamped at 0).
          runInference(hostManager, name, std::move(contextPtr), runPromise,
                       inflight, dispatched, warmUp > 0 ? warmUp - 1 : 0,
                       restRunsTimer, firstRunsTimer, bestRunTime);
        } else if (restRunsTimer != nullptr) {
          restRunsTimer->stopTimer();
        }

        // Last in-flight request releases the waiter in runBenchmark.
        if (--inflight == 0) {
          runPromise.set_value();
        }
      });
}
458 | |
459 | /// Run the requested number of benchmark requests \p requestCount prepended by |
460 | /// \p warmUp cycles |
461 | /// through the HostManager from the \p loader using the provided context pool |
462 | /// \p contexts and wait for all runs to complete. |
463 | void runBenchmark(std::string name, Loader &loader, |
464 | std::vector<std::unique_ptr<ExecutionContext>> contexts, |
465 | unsigned requestCount, unsigned warmUp, |
466 | llvm::Timer *restRunsTimer = nullptr, |
467 | llvm::Timer *firstRunsTimer = nullptr, |
468 | double *bestRunTime = nullptr) { |
469 | runtime::HostManager *hostManager = loader.getHostManager(); |
470 | std::atomic<unsigned> inflight(0); |
471 | std::atomic<int> dispatched(requestCount + warmUp * contexts.size()); |
472 | std::promise<void> runPromise; |
473 | auto fut = runPromise.get_future(); |
474 | |
475 | // Kick off initial pool of requests. |
476 | for (size_t i = 0, e = contexts.size(); i < e; i++) { |
477 | auto batch = std::move(contexts[i]); |
478 | inflight++; |
479 | dispatched--; |
480 | runInference(hostManager, name, std::move(batch), runPromise, inflight, |
481 | dispatched, warmUp, restRunsTimer, firstRunsTimer, |
482 | bestRunTime); |
483 | } |
484 | |
485 | // Wait for all to finish. |
486 | fut.wait(); |
487 | } |
488 | |