1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#include "BackendTestUtils.h"
17
18#include "glow/ExecutionContext/ExecutionContext.h"
19#include "glow/Flags/Flags.h"
20#include "glow/Runtime/HostManager/HostManager.h"
21
22#include "gtest/gtest.h"
23
24#include <future>
25#include <thread>
26
27using namespace glow;
28using namespace glow::runtime;
29using DAGNodePairTy = std::pair<std::vector<std::unique_ptr<DAGNode>>,
30 std::vector<std::unique_ptr<DAGNode>>>;
31
32class HostManagerTest : public ::testing::TestWithParam<std::string> {
33public:
34 void SetUp() override { backendName_ = GetParam(); }
35 std::string backendName_;
36};
37
38std::vector<std::unique_ptr<DeviceConfig>>
39generateConfigs(std::string backendName, unsigned numConfigs = 1) {
40 std::vector<std::unique_ptr<DeviceConfig>> configs;
41 for (unsigned i = 0; i < numConfigs; i++) {
42 auto deviceConfig = glow::make_unique<DeviceConfig>(backendName);
43 deviceConfig->deviceID = i;
44 configs.push_back(std::move(deviceConfig));
45 }
46 return configs;
47}
48
49std::unique_ptr<Module> setupModule(unsigned functionCount) {
50 std::unique_ptr<Module> module = glow::make_unique<Module>();
51 for (unsigned int i = 0; i < functionCount; i++) {
52 Function *F = module->createFunction("function" + std::to_string(i));
53 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3},
54 "X" + std::to_string(i), false);
55 auto *pow = F->createPow("Pow" + std::to_string(i), X, 2.0);
56 F->createSave("save" + std::to_string(i), pow);
57 }
58 return module;
59}
60
61std::unique_ptr<HostManager>
62createHostManager(llvm::StringRef backendName,
63 HostConfig hostConfig = HostConfig()) {
64 std::vector<std::unique_ptr<DeviceConfig>> configs =
65 generateConfigs(std::string(backendName), 1);
66 std::unique_ptr<HostManager> hostManager =
67 glow::make_unique<HostManager>(std::move(configs), hostConfig);
68 return hostManager;
69}
70
71Error addNetwork(HostManager *manager, std::string name) {
72 std::unique_ptr<Module> module = glow::make_unique<Module>();
73 Function *F = module->createFunction(name);
74 auto *X =
75 module->createPlaceholder(ElemKind::FloatTy, {3}, "X_" + name, false);
76 auto *pow = F->createPow("Pow_" + name, X, 2.0);
77 F->createSave("save" + name, pow);
78
79 // Expect this to be an Error because multiple networks with the same name
80 // have been added to HostManager
81 CompilationContext cctx;
82 return manager->addNetwork(std::move(module), cctx);
83}
84
85void addAndRemoveNetwork(HostManager *manager, unsigned int functionNumber) {
86 std::string name = "function" + std::to_string(functionNumber);
87 ERR_TO_BOOL(addNetwork(manager, name));
88 // Removal can return an error if the network is in the process of being
89 // added. That is fine we expect it in this test.
90 ERR_TO_BOOL(manager->removeNetwork(name));
91}
92
93TEST_P(HostManagerTest, newHostManager) {
94 CHECK_IF_ENABLED();
95 createHostManager(backendName_);
96}
97
98TEST_P(HostManagerTest, addNetwork) {
99 CHECK_IF_ENABLED();
100 auto module = setupModule(6);
101 auto hostManager = createHostManager(backendName_);
102 CompilationContext cctx;
103 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
104}
105
106TEST_P(HostManagerTest, queueOverflow) {
107 CHECK_IF_ENABLED();
108 std::unique_ptr<Module> module = glow::make_unique<Module>();
109
110 Function *F = module->createFunction("main");
111 auto *X = module->createPlaceholder(ElemKind::FloatTy, {10}, "X", false);
112 auto *pow = F->createPow("Pow1", X, 2.0);
113 pow = F->createPow("Pow1", pow, 2.0);
114 auto *save = F->createSave("save", pow);
115 std::vector<std::unique_ptr<ExecutionContext>> contexts;
116 for (int i = 0; i < 100; ++i) {
117 std::unique_ptr<ExecutionContext> context =
118 glow::make_unique<ExecutionContext>();
119 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
120 XTensor->getHandle() = {1., 2., 3., 1., 2., 3., 1., 2., 3., 1.};
121 context->getPlaceholderBindings()->allocate(save->getPlaceholder());
122 contexts.emplace_back(std::move(context));
123 }
124
125 HostConfig hostConfig;
126 hostConfig.maxQueueSize = 1;
127 hostConfig.maxActiveRequests = 1;
128 auto hostManager = createHostManager(backendName_, hostConfig);
129 CompilationContext cctx;
130 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
131
132 std::vector<std::promise<void>> requests(100);
133 std::list<std::future<void>> futures;
134 for (auto &r : requests) {
135 futures.emplace_back(r.get_future());
136 }
137
138 for (int i = 0; i < 100; ++i) {
139 auto &context = contexts[i];
140 auto &request = requests[i];
141 hostManager->runNetwork(
142 "main", std::move(context),
143 [&request](RunIdentifierTy runID, Error err,
144 std::unique_ptr<ExecutionContext> context_) {
145 TRACE_EVENT_SCOPE(context_->getTraceContext(), TraceLevel::RUNTIME,
146 "HostManager::runNetwork");
147 ERR_TO_BOOL(std::move(err));
148 request.set_value();
149 });
150 }
151
152 for (auto &f : futures) {
153 f.wait();
154 }
155}
156
157TEST_P(HostManagerTest, runNetwork) {
158 CHECK_IF_ENABLED();
159 std::unique_ptr<Module> module = glow::make_unique<Module>();
160 std::unique_ptr<ExecutionContext> context =
161 glow::make_unique<ExecutionContext>();
162
163 Function *F = module->createFunction("main");
164 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
165 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
166 XTensor->getHandle() = {1., 2., 3.};
167 auto *pow = F->createPow("Pow1", X, 2.0);
168 auto *save = F->createSave("save", pow);
169 auto *saveTensor =
170 context->getPlaceholderBindings()->allocate(save->getPlaceholder());
171
172 auto hostManager = createHostManager(backendName_);
173 CompilationContext cctx;
174 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
175
176 std::promise<void> runNetwork;
177 auto ready = runNetwork.get_future();
178
179 std::unique_ptr<Error> runErr;
180 hostManager->runNetwork("main", std::move(context),
181 [&runNetwork, &saveTensor, &context, &runErr](
182 RunIdentifierTy runID, Error err,
183 std::unique_ptr<ExecutionContext> context_) {
184 auto HX = saveTensor->getHandle();
185 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
186 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
187 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
188 context = std::move(context_);
189 runErr = glow::make_unique<Error>(std::move(err));
190 runNetwork.set_value();
191 });
192
193 ready.wait();
194 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
195
196 // reset runErr
197 runErr = nullptr;
198
199 std::promise<void> newRun;
200 ready = newRun.get_future();
201 hostManager->runNetwork("main", std::move(context),
202 [&newRun, &saveTensor, &runErr](
203 RunIdentifierTy runID, Error err,
204 std::unique_ptr<ExecutionContext> context_) {
205 auto HX = saveTensor->getHandle();
206 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
207 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
208 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
209 runErr = glow::make_unique<Error>(std::move(err));
210 newRun.set_value();
211 });
212
213 ready.wait();
214 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
215}
216
217/// Test that HostManager properly handles concurrent add/remove requests with
218/// unique network names.
219TEST_P(HostManagerTest, ConcurrentAddRemoveUnique) {
220 CHECK_IF_ENABLED();
221 constexpr auto numThreads = 6;
222 constexpr auto numItersPerThread = 20;
223 auto hostManager = createHostManager(backendName_);
224 std::atomic<unsigned> counter{0};
225 std::vector<std::thread> threads;
226 for (auto i = 0; i < numThreads; ++i) {
227 threads.emplace_back([&]() {
228 for (auto j = 0; j < numItersPerThread; ++j) {
229 addAndRemoveNetwork(hostManager.get(), ++counter);
230 }
231 });
232 }
233
234 for (auto &t : threads) {
235 t.join();
236 }
237}
238
239/// Test that HostManager properly handles concurrent add/remove requests with a
240/// duplicate network name.
241TEST_P(HostManagerTest, ConcurrentAddRemoveDuplicate) {
242 CHECK_IF_ENABLED();
243 constexpr auto numThreads = 6;
244 constexpr auto numItersPerThread = 20;
245 auto hostManager = createHostManager(backendName_);
246 std::vector<std::thread> threads;
247 for (auto i = 0; i < numThreads; ++i) {
248 threads.emplace_back([&]() {
249 for (auto j = 0; j < numItersPerThread; ++j) {
250 addAndRemoveNetwork(hostManager.get(), 0);
251 }
252 });
253 }
254
255 for (auto &t : threads) {
256 t.join();
257 }
258}
259
260/// Run several requests concurrently.
261TEST_P(HostManagerTest, runNetworkConcurrent) {
262 CHECK_IF_ENABLED();
263 std::unique_ptr<Module> module = glow::make_unique<Module>();
264
265 Function *F = module->createFunction("main");
266 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
267 auto *pow = F->createPow("Pow1", X, 2.0);
268 F->createSave("save", pow);
269 auto *savePH = module->getPlaceholderByNameSlow("save");
270
271 auto hostManager = createHostManager(backendName_);
272 CompilationContext cctx;
273
274 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
275
276 std::vector<std::future<void>> ready;
277 for (int i = 0; i < 50; i++) {
278 auto runNetwork = std::make_shared<std::promise<void>>();
279 ready.push_back(runNetwork->get_future());
280 std::unique_ptr<ExecutionContext> context =
281 glow::make_unique<ExecutionContext>();
282 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
283 XTensor->getHandle() = {1., 2., 3.};
284 auto *saveTensor = context->getPlaceholderBindings()->allocate(savePH);
285 hostManager->runNetwork(
286 "main", std::move(context),
287 [runNetwork, saveTensor](RunIdentifierTy runID, Error err,
288 std::unique_ptr<ExecutionContext> context_) {
289 auto HX = saveTensor->getHandle();
290 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
291 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
292 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
293 EXPECT_FALSE(std::move(err));
294 runNetwork->set_value();
295 });
296 }
297
298 for (auto &r : ready) {
299 r.wait();
300 }
301}
302
303TEST_P(HostManagerTest, testSaturateHost) {
304 CHECK_IF_ENABLED();
305 std::unique_ptr<Module> module = glow::make_unique<Module>();
306
307 Function *F = module->createFunction("main");
308 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
309 auto *pow = F->createPow("Pow1", X, 2.0);
310 F->createSave("save", pow);
311 auto *savePH = module->getPlaceholderByNameSlow("save");
312
313 std::vector<std::unique_ptr<DeviceConfig>> configs =
314 generateConfigs(backendName_, 2);
315 std::unique_ptr<HostManager> hostManager =
316 glow::make_unique<HostManager>(std::move(configs), HostConfig());
317
318 CompilationContext cctx;
319 cctx.saturateHost = true;
320 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
321
322 std::vector<std::future<void>> ready;
323 for (int i = 0; i < 50; i++) {
324 auto runNetwork = std::make_shared<std::promise<void>>();
325 ready.push_back(runNetwork->get_future());
326 std::unique_ptr<ExecutionContext> context =
327 glow::make_unique<ExecutionContext>();
328 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
329 XTensor->getHandle() = {1., 2., 3.};
330 auto *saveTensor = context->getPlaceholderBindings()->allocate(savePH);
331 hostManager->runNetwork(
332 "main", std::move(context),
333 [runNetwork, saveTensor](RunIdentifierTy, Error err,
334 std::unique_ptr<ExecutionContext>) {
335 auto HX = saveTensor->getHandle();
336 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
337 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
338 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
339 EXPECT_FALSE(std::move(err));
340 runNetwork->set_value();
341 });
342 }
343
344 for (auto &r : ready) {
345 r.wait();
346 }
347}
348
349/// Test that the HostManager respects it's configuration parameters.
350TEST_P(HostManagerTest, ConfigureHostManager) {
351 CHECK_IF_ENABLED();
352 HostConfig config;
353 config.maxActiveRequests = 1;
354 config.maxQueueSize = 0;
355 auto hostManager = createHostManager("Interpreter", std::move(config));
356
357 EXPECT_FALSE(ERR_TO_BOOL(addNetwork(hostManager.get(), "main")));
358
359 auto context = glow::make_unique<ExecutionContext>();
360 auto context2 = glow::make_unique<ExecutionContext>();
361
362 std::unique_ptr<Error> runErr;
363
364 std::shared_ptr<std::mutex> lock = std::make_shared<std::mutex>();
365 std::unique_lock<std::mutex> guard(*lock);
366
367 /// Don't care a about the first one.
368 hostManager->runNetwork("main", std::move(context),
369 [lock](RunIdentifierTy runID, Error err,
370 std::unique_ptr<ExecutionContext> context_) {
371 ERR_TO_BOOL(std::move(err));
372 });
373
374 hostManager->runNetwork(
375 "main", std::move(context2),
376 [&runErr](RunIdentifierTy runID, Error err,
377 std::unique_ptr<ExecutionContext> context_) {
378 runErr = glow::make_unique<Error>(std::move(err));
379 });
380 guard.unlock();
381 // Don't need a future, error CB called inline.
382 EXPECT_TRUE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
383}
384
385/// Test that the HostManager properly enqueues requests.
386TEST_P(HostManagerTest, QueueTest) {
387 CHECK_IF_ENABLED();
388 HostConfig config;
389 // Setup the hostmanager to allow 1 active and 2 queued requests for a total
390 // of 3 requests in the system.
391 config.maxActiveRequests = 1;
392 auto hostManager = createHostManager("Interpreter", std::move(config));
393
394 EXPECT_FALSE(ERR_TO_BOOL(addNetwork(hostManager.get(), "main")));
395
396 auto context = glow::make_unique<ExecutionContext>();
397 auto context2 = glow::make_unique<ExecutionContext>();
398 auto context3 = glow::make_unique<ExecutionContext>();
399 auto context4 = glow::make_unique<ExecutionContext>();
400 std::promise<unsigned> run1p, run2p, run3p, dispatched;
401 auto dispatchDone = dispatched.get_future();
402 auto run1f = run1p.get_future();
403 auto run2f = run2p.get_future();
404 auto run3f = run3p.get_future();
405 std::atomic<unsigned> counter{0};
406
407 // The first will go right to dispatch since there will be no inflight
408 // requests.
409 hostManager->runNetwork("main", std::move(context),
410 [&run1p, &counter, &dispatchDone](
411 RunIdentifierTy runID, Error err,
412 std::unique_ptr<ExecutionContext> context) {
413 EXIT_ON_ERR(std::move(err));
414 run1p.set_value(counter++);
415 dispatchDone.wait();
416 });
417 // Set the priority of the second to 1.
418 hostManager->runNetwork(
419 "main", std::move(context2),
420 [&run2p, &counter](RunIdentifierTy runID, Error err,
421 std::unique_ptr<ExecutionContext> context) {
422 EXIT_ON_ERR(std::move(err));
423 run2p.set_value(counter++);
424 },
425 1);
426
427 // Set the priority of the run3 to 0 so it should be first in the queue
428 // after run1.
429 hostManager->runNetwork(
430 "main", std::move(context3),
431 [&run3p, &counter](RunIdentifierTy runID, Error err,
432 std::unique_ptr<ExecutionContext> context) {
433 EXIT_ON_ERR(std::move(err));
434 run3p.set_value(counter++);
435 },
436 0);
437 /// Wait for all three to finish.
438 dispatched.set_value(0);
439 auto res1 = run1f.get();
440 auto res2 = run2f.get();
441 auto res3 = run3f.get();
442 // Should expect them to finish in order: 1, 3, 2. Check atomic value
443 EXPECT_GT(res3, res1);
444 EXPECT_GT(res2, res3);
445}
446
447/// Test that the enabling partition replication through user defined
448/// partitioning works.
449TEST_P(HostManagerTest, testPartitionConfigReplication) {
450 CHECK_IF_ENABLED();
451 std::unique_ptr<Module> module = glow::make_unique<Module>();
452 std::unique_ptr<ExecutionContext> context =
453 glow::make_unique<ExecutionContext>();
454
455 Function *F = module->createFunction("main");
456 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
457 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
458 XTensor->getHandle() = {1., 2., 3.};
459 auto *pow = F->createPow("Pow", X, 2.0);
460 auto *save = F->createSave("save", pow);
461 auto savePH = save->getPlaceholder();
462
463 std::vector<std::unique_ptr<DeviceConfig>> configs =
464 generateConfigs(backendName_, 2);
465 std::unique_ptr<HostManager> hostManager =
466 glow::make_unique<HostManager>(std::move(configs), HostConfig());
467 CompilationContext cctx;
468
469 // Setup forced partitioning.
470 PartitionConfig partitionConfig;
471 partitionConfig.funcName = "main";
472 partitionConfig.numOfPartitions = 2;
473 partitionConfig.backendNames = {backendName_, backendName_};
474 partitionConfig.partitionNames = {"p0", "p1"};
475 partitionConfig.nodeToPartition = {{"Pow", 0}, {"save", 3}};
476 partitionConfig.logicalIDs = {{0}, {1}};
477 partitionConfig.replicationCount[0] = 2;
478 cctx.partitionConfig = &partitionConfig;
479
480 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
481
482 std::vector<std::future<void>> ready;
483 for (int i = 0; i < 50; i++) {
484 auto runNetwork = std::make_shared<std::promise<void>>();
485 ready.push_back(runNetwork->get_future());
486 std::unique_ptr<ExecutionContext> context =
487 glow::make_unique<ExecutionContext>();
488 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
489 XTensor->getHandle() = {1., 2., 3.};
490 auto *saveTensor = context->getPlaceholderBindings()->allocate(savePH);
491 hostManager->runNetwork(
492 "main", std::move(context),
493 [runNetwork, saveTensor](RunIdentifierTy runID, Error err,
494 std::unique_ptr<ExecutionContext> context_) {
495 auto HX = saveTensor->getHandle();
496 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
497 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
498 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
499 EXPECT_FALSE(std::move(err));
500 runNetwork->set_value();
501 });
502 }
503
504 for (auto &r : ready) {
505 r.wait();
506 }
507}
508
509/// Test replication for a single partition network.
510TEST_P(HostManagerTest, testSinglePartitionReplication) {
511 CHECK_IF_ENABLED();
512 std::unique_ptr<Module> module = glow::make_unique<Module>();
513 std::unique_ptr<ExecutionContext> context =
514 glow::make_unique<ExecutionContext>();
515
516 Function *F = module->createFunction("main");
517 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
518 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
519 XTensor->getHandle() = {1., 2., 3.};
520 auto *pow = F->createPow("Pow1", X, 2.0);
521 auto *save = F->createSave("save", pow);
522 auto *savePH = save->getPlaceholder();
523
524 auto hostManager = createHostManager(backendName_);
525 CompilationContext cctx;
526 cctx.replicationCount = 2;
527 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
528
529 std::vector<std::future<void>> ready;
530 for (int i = 0; i < 50; i++) {
531 auto runNetwork = std::make_shared<std::promise<void>>();
532 ready.push_back(runNetwork->get_future());
533 std::unique_ptr<ExecutionContext> context =
534 glow::make_unique<ExecutionContext>();
535 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
536 XTensor->getHandle() = {1., 2., 3.};
537 auto *saveTensor = context->getPlaceholderBindings()->allocate(savePH);
538 hostManager->runNetwork(
539 "main", std::move(context),
540 [runNetwork, saveTensor](RunIdentifierTy runID, Error err,
541 std::unique_ptr<ExecutionContext> context_) {
542 auto HX = saveTensor->getHandle();
543 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
544 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
545 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
546 EXPECT_FALSE(std::move(err));
547 runNetwork->set_value();
548 });
549 }
550
551 for (auto &r : ready) {
552 r.wait();
553 }
554}
555
556// This test creates a network that is split into four partitions. P0,P1,P2,P3
557// and three devices D0,D1,D2. P0 is loaded on D0, P1 and P2 are loaded on D2
558// and P3 is loaded on D2. This test then enables both DRT and P2P
559// optimizations. We then run the network twice to test the alternating static
560// assignments.
561TEST_P(HostManagerTest, testStaticAssignmentP2PandDRT) {
562 CHECK_IF_ENABLED();
563 std::unique_ptr<Module> module = glow::make_unique<Module>();
564 std::unique_ptr<ExecutionContext> context =
565 glow::make_unique<ExecutionContext>();
566
567 Function *F = module->createFunction("main");
568 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
569 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
570 XTensor->getHandle() = {1., 2., 3.};
571 auto *pow = F->createPow("Pow1", X, 2.0);
572 auto pow2 = F->createPow("Pow2", pow, 2.0);
573 auto pow3 = F->createPow("Pow3", pow2, 1.0);
574 auto *save = F->createSave("save", pow3);
575 auto *saveTensor =
576 context->getPlaceholderBindings()->allocate(save->getPlaceholder());
577
578 std::vector<std::unique_ptr<DeviceConfig>> configs =
579 generateConfigs(backendName_, 3);
580 std::unique_ptr<HostManager> hostManager =
581 glow::make_unique<HostManager>(std::move(configs), HostConfig());
582 CompilationContext cctx;
583 cctx.enableP2P = true;
584 cctx.enableDRT = true;
585
586 // Setup forced partitioning.
587 PartitionConfig partitionConfig;
588 partitionConfig.funcName = "main";
589 partitionConfig.numOfPartitions = 4;
590 partitionConfig.backendNames = {backendName_, backendName_, backendName_,
591 backendName_};
592 partitionConfig.partitionNames = {"p0", "p1", "p2", "p3"};
593 partitionConfig.nodeToPartition = {
594 {"Pow1", 0}, {"Pow2", 1}, {"Pow3", 2}, {"Pow1__1", 0},
595 {"Pow2__1", 1}, {"Pow3__1", 2}, {"save", 3}, {"save_save", 3}};
596 partitionConfig.logicalIDs = {{0}, {1}, {1}, {2}};
597 cctx.partitionConfig = &partitionConfig;
598
599 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
600
601 std::promise<void> runNetwork;
602 auto ready = runNetwork.get_future();
603
604 std::unique_ptr<Error> runErr;
605 hostManager->runNetwork("main", std::move(context),
606 [&runNetwork, &saveTensor, &context, &runErr](
607 RunIdentifierTy runID, Error err,
608 std::unique_ptr<ExecutionContext> context_) {
609 auto HX = saveTensor->getHandle();
610 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
611 EXPECT_NEAR(HX.at({1}), 16, 1E-5);
612 EXPECT_NEAR(HX.at({2}), 81, 1E-5);
613 context = std::move(context_);
614 runErr = glow::make_unique<Error>(std::move(err));
615 runNetwork.set_value();
616 });
617
618 ready.wait();
619 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
620
621 // reset runErr
622 runErr = nullptr;
623
624 std::promise<void> newRun;
625 ready = newRun.get_future();
626 hostManager->runNetwork("main", std::move(context),
627 [&newRun, &saveTensor, &runErr](
628 RunIdentifierTy runID, Error err,
629 std::unique_ptr<ExecutionContext> context_) {
630 auto HX = saveTensor->getHandle();
631 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
632 EXPECT_NEAR(HX.at({1}), 16, 1E-5);
633 EXPECT_NEAR(HX.at({2}), 81, 1E-5);
634 runErr = glow::make_unique<Error>(std::move(err));
635 newRun.set_value();
636 });
637
638 ready.wait();
639 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
640}
641
642// This test creates a network that is split into four partitions. P0,P1,P2,P3
643// and three devices D0,D1,D2. P0 is loaded on D0, P1 and P2 are loaded on D2
644// and P3 is loaded on D2. This test then enables the DRT optimization without
645// P2P. We then run the network twice to test the alternating static
646// assignments.
647TEST_P(HostManagerTest, testStaticAssignmentDeviceResidentTensorOnly) {
648 CHECK_IF_ENABLED();
649 std::unique_ptr<Module> module = glow::make_unique<Module>();
650 std::unique_ptr<ExecutionContext> context =
651 glow::make_unique<ExecutionContext>();
652
653 Function *F = module->createFunction("main");
654 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
655 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
656 XTensor->getHandle() = {1., 2., 3.};
657 auto *pow = F->createPow("Pow1", X, 2.0);
658 auto pow2 = F->createPow("Pow2", pow, 2.0);
659 auto pow3 = F->createPow("Pow3", pow2, 1.0);
660 auto *save = F->createSave("save", pow3);
661 auto *saveTensor =
662 context->getPlaceholderBindings()->allocate(save->getPlaceholder());
663
664 std::vector<std::unique_ptr<DeviceConfig>> configs =
665 generateConfigs(backendName_, 3);
666 std::unique_ptr<HostManager> hostManager =
667 glow::make_unique<HostManager>(std::move(configs), HostConfig());
668 CompilationContext cctx;
669 cctx.enableDRT = true;
670
671 // Setup forced partitioning.
672 PartitionConfig partitionConfig;
673 partitionConfig.funcName = "main";
674 partitionConfig.numOfPartitions = 4;
675 partitionConfig.backendNames = {backendName_, backendName_, backendName_,
676 backendName_};
677 partitionConfig.partitionNames = {"p0", "p1", "p2", "p3"};
678 partitionConfig.nodeToPartition = {
679 {"Pow1", 0}, {"Pow2", 1}, {"Pow3", 2}, {"Pow1__1", 0},
680 {"Pow2__1", 1}, {"Pow3__1", 2}, {"save", 3}, {"save_save", 3}};
681 partitionConfig.logicalIDs = {{0}, {1}, {1}, {2}};
682 cctx.partitionConfig = &partitionConfig;
683
684 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
685
686 std::promise<void> runNetwork;
687 auto ready = runNetwork.get_future();
688
689 std::unique_ptr<Error> runErr;
690 hostManager->runNetwork("main", std::move(context),
691 [&runNetwork, &saveTensor, &context, &runErr](
692 RunIdentifierTy runID, Error err,
693 std::unique_ptr<ExecutionContext> context_) {
694 auto HX = saveTensor->getHandle();
695 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
696 EXPECT_NEAR(HX.at({1}), 16, 1E-5);
697 EXPECT_NEAR(HX.at({2}), 81, 1E-5);
698 context = std::move(context_);
699 runErr = glow::make_unique<Error>(std::move(err));
700 runNetwork.set_value();
701 });
702
703 ready.wait();
704 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
705
706 // reset runErr
707 runErr = nullptr;
708
709 std::promise<void> newRun;
710 ready = newRun.get_future();
711 hostManager->runNetwork("main", std::move(context),
712 [&newRun, &saveTensor, &runErr](
713 RunIdentifierTy runID, Error err,
714 std::unique_ptr<ExecutionContext> context_) {
715 auto HX = saveTensor->getHandle();
716 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
717 EXPECT_NEAR(HX.at({1}), 16, 1E-5);
718 EXPECT_NEAR(HX.at({2}), 81, 1E-5);
719 runErr = glow::make_unique<Error>(std::move(err));
720 newRun.set_value();
721 });
722
723 ready.wait();
724 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
725}
726
727// This test creates a network that is split into four partitions. P0,P1,P2,P3
728// and three devices D0,D1,D2. P0 is loaded on D0, P1 and P2 are loaded on D2
729// and P3 is loaded on D2. This test then enables the P2P optimization without
730// DRT. We then run the network twice to test the alternating static
731// assignments.
732TEST_P(HostManagerTest, testStaticAssignmentP2POnly) {
733 CHECK_IF_ENABLED();
734 std::unique_ptr<Module> module = glow::make_unique<Module>();
735 std::unique_ptr<ExecutionContext> context =
736 glow::make_unique<ExecutionContext>();
737
738 Function *F = module->createFunction("main");
739 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
740 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
741 XTensor->getHandle() = {1., 2., 3.};
742 auto *pow = F->createPow("Pow1", X, 2.0);
743 auto pow2 = F->createPow("Pow2", pow, 2.0);
744 auto pow3 = F->createPow("Pow3", pow2, 1.0);
745 auto *save = F->createSave("save", pow3);
746 auto *saveTensor =
747 context->getPlaceholderBindings()->allocate(save->getPlaceholder());
748
749 std::vector<std::unique_ptr<DeviceConfig>> configs =
750 generateConfigs(backendName_, 3);
751 std::unique_ptr<HostManager> hostManager =
752 glow::make_unique<HostManager>(std::move(configs), HostConfig());
753 CompilationContext cctx;
754 cctx.enableP2P = true;
755
756 // Setup forced partitioning.
757 PartitionConfig partitionConfig;
758 partitionConfig.funcName = "main";
759 partitionConfig.numOfPartitions = 4;
760 partitionConfig.backendNames = {backendName_, backendName_, backendName_,
761 backendName_};
762 partitionConfig.partitionNames = {"p0", "p1", "p2", "p3"};
763 partitionConfig.nodeToPartition = {
764 {"Pow1", 0}, {"Pow2", 1}, {"Pow3", 2}, {"Pow1__1", 0},
765 {"Pow2__1", 1}, {"Pow3__1", 2}, {"save", 3}, {"save_save", 3}};
766 partitionConfig.logicalIDs = {{0}, {1}, {1}, {2}};
767 cctx.partitionConfig = &partitionConfig;
768
769 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
770
771 std::promise<void> runNetwork;
772 auto ready = runNetwork.get_future();
773
774 std::unique_ptr<Error> runErr;
775 hostManager->runNetwork("main", std::move(context),
776 [&runNetwork, &saveTensor, &context, &runErr](
777 RunIdentifierTy runID, Error err,
778 std::unique_ptr<ExecutionContext> context_) {
779 auto HX = saveTensor->getHandle();
780 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
781 EXPECT_NEAR(HX.at({1}), 16, 1E-5);
782 EXPECT_NEAR(HX.at({2}), 81, 1E-5);
783 context = std::move(context_);
784 runErr = glow::make_unique<Error>(std::move(err));
785 runNetwork.set_value();
786 });
787
788 ready.wait();
789 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
790
791 // reset runErr
792 runErr = nullptr;
793
794 std::promise<void> newRun;
795 ready = newRun.get_future();
796 hostManager->runNetwork("main", std::move(context),
797 [&newRun, &saveTensor, &runErr](
798 RunIdentifierTy runID, Error err,
799 std::unique_ptr<ExecutionContext> context_) {
800 auto HX = saveTensor->getHandle();
801 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
802 EXPECT_NEAR(HX.at({1}), 16, 1E-5);
803 EXPECT_NEAR(HX.at({2}), 81, 1E-5);
804 runErr = glow::make_unique<Error>(std::move(err));
805 newRun.set_value();
806 });
807
808 ready.wait();
809 EXPECT_FALSE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
810}
811
812// This test creates a network that is split into two partitions. P0,P1. P0 is
813// loaded on one device, P1 is loaded on two devices. This test then enables
814// static assignment which allows for P2P testing. We then run the network
815// multiple requests concurrently.
816TEST_P(HostManagerTest, testStaticAssignmentP2PandDRTConcurrent) {
817 CHECK_IF_ENABLED();
818 std::unique_ptr<Module> module = glow::make_unique<Module>();
819
820 Function *F = module->createFunction("main");
821 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
822 auto *pow = F->createPow("Pow1", X, 2.0);
823 F->createSave("save", pow);
824 auto *savePH = module->getPlaceholderByNameSlow("save");
825
826 std::vector<std::unique_ptr<DeviceConfig>> configs =
827 generateConfigs(backendName_, 3);
828 std::unique_ptr<HostManager> hostManager =
829 glow::make_unique<HostManager>(std::move(configs), HostConfig());
830 CompilationContext cctx;
831 cctx.enableDRT = true;
832 cctx.enableP2P = true;
833
834 // Setup forced partitioning.
835 PartitionConfig partitionConfig;
836 partitionConfig.funcName = "main";
837 partitionConfig.numOfPartitions = 2;
838 partitionConfig.backendNames = {backendName_, backendName_};
839 partitionConfig.partitionNames = {"p0", "p1"};
840 partitionConfig.nodeToPartition = {{"Pow1", 0}, {"save", 1}};
841 partitionConfig.logicalIDs = {{0}, {1, 2}};
842 cctx.partitionConfig = &partitionConfig;
843
844 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
845
846 std::vector<std::future<void>> ready;
847 for (int i = 0; i < 50; i++) {
848 auto runNetwork = std::make_shared<std::promise<void>>();
849 ready.push_back(runNetwork->get_future());
850 std::unique_ptr<ExecutionContext> context =
851 glow::make_unique<ExecutionContext>();
852 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
853 XTensor->getHandle() = {1., 2., 3.};
854 auto *saveTensor = context->getPlaceholderBindings()->allocate(savePH);
855 hostManager->runNetwork(
856 "main", std::move(context),
857 [runNetwork, saveTensor](RunIdentifierTy runID, Error err,
858 std::unique_ptr<ExecutionContext> context_) {
859 auto HX = saveTensor->getHandle();
860 EXPECT_NEAR(HX.at({0}), 1, 1E-5);
861 EXPECT_NEAR(HX.at({1}), 4, 1E-5);
862 EXPECT_NEAR(HX.at({2}), 9, 1E-5);
863 EXPECT_FALSE(std::move(err));
864 runNetwork->set_value();
865 });
866 }
867
868 for (auto &r : ready) {
869 r.wait();
870 }
871}
872
873/// This tests that the HostMangaer registry works and is able to report what
874/// devices a network is loaded on.
875TEST_P(HostManagerTest, testHostManagerRegistry) {
876 CHECK_IF_ENABLED();
877 std::unique_ptr<Module> module = glow::make_unique<Module>();
878
879 Function *F = module->createFunction("main");
880 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
881 auto *pow = F->createPow("Pow1", X, 2.0);
882 F->createSave("save", pow);
883 module->getPlaceholderByNameSlow("save");
884
885 std::unique_ptr<Module> module2 = glow::make_unique<Module>();
886
887 Function *F2 = module2->createFunction("main2");
888 auto *X2 = module2->createPlaceholder(ElemKind::FloatTy, {3}, "X2", false);
889 auto *pow2 = F2->createPow("Pow2", X2, 2.0);
890 F2->createSave("save2", pow2);
891 module2->getPlaceholderByNameSlow("save2");
892
893 std::vector<std::unique_ptr<DeviceConfig>> configs =
894 generateConfigs(backendName_, 2);
895 std::unique_ptr<HostManager> hostManager =
896 glow::make_unique<HostManager>(std::move(configs), HostConfig());
897
898 CompilationContext cctx;
899 cctx.saturateHost = true;
900 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
901 cctx.saturateHost = false;
902 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module2), cctx)));
903 glow::runtime::ManagerRegistry()->registerHostManager(hostManager.get());
904 auto testHM = glow::runtime::ManagerRegistry()->getHostManager();
905 auto loading = testHM->getDevicePartitionMapping("main");
906 auto loading2 = testHM->getDevicePartitionMapping("main2");
907 EXPECT_EQ(loading["main"].size(), 2);
908 EXPECT_EQ(loading2["main2"].size(), 1);
909}
910
911TEST_P(HostManagerTest, testTimeout) {
912 CHECK_IF_ENABLED();
913
914 if (backendName_ == "NNPI") {
915 // Skip this test if running on ICEREF, since we want to test the device
916 // timeout.
917 auto useInfAPI = getenv("USE_INF_API");
918 if (!useInfAPI || strcmp(useInfAPI, "1")) {
919 GTEST_SKIP();
920 }
921 // Set the timeout to very short so we fail intentionally.
922 glow::runtime::flags::NNPITimeoutMs = 1;
923 }
924
925 std::unique_ptr<Module> module = glow::make_unique<Module>();
926 std::unique_ptr<ExecutionContext> context =
927 glow::make_unique<ExecutionContext>();
928
929 Function *F = module->createFunction("main");
930 auto *X = module->createPlaceholder(ElemKind::FloatTy, {3}, "X", false);
931 auto *XTensor = context->getPlaceholderBindings()->allocate(X);
932 XTensor->getHandle() = {1., 2., 3.};
933 auto *pow = F->createPow("Poww", X, 2.0);
934 for (unsigned i = 0; i < 1000; i++) {
935 pow = F->createPow("pow" + std::to_string(i), pow, 1.0);
936 }
937 auto *save = F->createSave("save", pow);
938 context->getPlaceholderBindings()->allocate(save->getPlaceholder());
939
940 auto hostManager = createHostManager(backendName_);
941
942 CompilationContext cctx;
943 ASSERT_FALSE(ERR_TO_BOOL(hostManager->addNetwork(std::move(module), cctx)));
944
945 std::promise<void> runNetwork;
946 auto ready = runNetwork.get_future();
947
948 std::unique_ptr<Error> runErr;
949 hostManager->runNetwork("main", std::move(context),
950 [&runNetwork, &context, &runErr](
951 RunIdentifierTy runID, Error err,
952 std::unique_ptr<ExecutionContext> context_) {
953 context = std::move(context_);
954 runErr = glow::make_unique<Error>(std::move(err));
955 runNetwork.set_value();
956 });
957
958 ready.wait();
959 EXPECT_TRUE(ERR_TO_BOOL(std::move(*DCHECK_NOTNULL(runErr.get()))));
960}
961
962INSTANTIATE_BACKEND_TEST(HostManagerTest);
963