1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#include <algorithm>
17#include <array>
18#include <cstdlib>
19#include <future>
20#include <random>
21
22#include "Bench.h"
23
24#include "glow/ExecutionEngine/ExecutionEngine.h"
25#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26
27using namespace glow;
28
29/*
30 * This class implements an add microbenchmark. There are a number of
31 * parallel Add nodes which are created, one per core. Then these are
32 * chained together in multiple layers.
33 *
34 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
36 * end-to-end workloads.
37 */
38class AddBench : public Benchmark {
39 dim_t n_;
40 dim_t numLayers_;
41 PlaceholderBindings bindings_;
42 std::unique_ptr<runtime::HostManager> hostManager_;
43 size_t asyncLaunchSize_;
44 size_t numCores_;
45 const char *backendStr_;
46 ElemKind dtype_;
47 size_t elementSize_;
48 const char *devId_;
49
50public:
51 AddBench(dim_t n_, dim_t numLayers_, dim_t asyncLaunchSize_, dim_t numCores_,
52 const char *backendStr_, const char *dtypeStr_,
53 const char *devId_ = nullptr)
54 : n_(n_), numLayers_(numLayers_), asyncLaunchSize_(asyncLaunchSize_),
55 numCores_(numCores_), backendStr_(backendStr_), devId_(devId_) {
56
57 dtype_ = ElemKind::Float16Ty;
58 elementSize_ = 2;
59 if (std::string(dtypeStr_) == "Float16") {
60 dtype_ = ElemKind::Float16Ty;
61 elementSize_ = 2;
62 } else if (std::string(dtypeStr_) == "Float32") {
63 dtype_ = ElemKind::FloatTy;
64 elementSize_ = 4;
65 }
66 }
67
68 void setup() override {
69
70 // Setup host manager
71 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
72 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
73 if (devId_ != nullptr) {
74 config->parameters["DeviceID"] = devId_;
75 }
76 configs.push_back(std::move(config));
77 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
78
79 std::unique_ptr<Module> mod(new Module);
80 auto fn = mod->createFunction("singleNode");
81
82 // Create multiple chains of Add nodes
83 std::vector<Placeholder *> A(numCores_);
84 std::vector<Placeholder *> B(numCores_);
85 std::vector<Placeholder *> output(numCores_);
86 std::vector<Node *> cur(numCores_);
87 for (size_t core = 0; core < numCores_; core++) {
88 A[core] = mod->createPlaceholder(dtype_, {n_}, "A" + std::to_string(core),
89 false);
90 B[core] = mod->createPlaceholder(dtype_, {n_}, "B" + std::to_string(core),
91 false);
92 output[core] = mod->createPlaceholder(
93 dtype_, {n_}, "output" + std::to_string(core), false);
94 cur[core] = A[core];
95 }
96
97 std::vector<Node *> eltwise(numCores_);
98 for (size_t layer = 0; layer < numLayers_; layer++) {
99 for (size_t core = 0; core < numCores_; core++) {
100 eltwise[core] = fn->createAdd("eltwise" + std::to_string(core) + "_" +
101 std::to_string(layer),
102 cur[core], B[core]);
103 cur[core] = eltwise[core];
104 }
105 }
106 for (size_t core = 0; core < numCores_; core++) {
107 fn->createSave("save" + std::to_string(core), cur[core], output[core]);
108 }
109
110 CompilationContext ctx;
111 EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
112 }
113
114 void run() override {
115 std::vector<std::promise<void>> promises(asyncLaunchSize_);
116 std::vector<std::future<void>> futures;
117
118 // Launch a number of independent requests
119 for (auto &runPromise : promises) {
120 std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
121 futures.push_back(runPromise.get_future());
122 hostManager_->runNetwork(
123 "singleNode", std::move(contextPtr),
124 [&runPromise](runtime::RunIdentifierTy, Error err,
125 std::unique_ptr<ExecutionContext> /* contextPtr */) {
126 EXIT_ON_ERR(std::move(err));
127 runPromise.set_value();
128 });
129 }
130 for (auto &fut : futures) {
131 fut.wait();
132 }
133 }
134
135 void teardown() override {}
136
137 // Two inputs per layer and one output
138 double gbytes() const { return elementSize_ * n_ * (3 * numLayers_) / 1e9; }
139};
140
141int main(int argc, char *argv[]) {
142 printf("Add Microbenchmark\n");
143 printf("Usage: AddBench n(Int) numLayers(Int) numReps(Int) "
144 "numAsyncLaunches(Int) numAddChains(Int) backendStr(String) "
145 "dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n");
146 printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
147 "environment variable\n");
148 benchParseGlowOpts(argc, argv);
149 assert(argc == 8 || argc == 9);
150 size_t n = atoi(argv[1]);
151 size_t numLayers = atoi(argv[2]);
152 size_t reps = atoi(argv[3]);
153 size_t asyncLaunches = atoi(argv[4]);
154 size_t numCores = atoi(argv[5]);
155 const char *backendStr = argv[6];
156 const char *dtypeStr = argv[7];
157 char *dev_id = nullptr;
158
159 if (argc > 8) {
160 dev_id = argv[8];
161 printf("Setting backend device: \"%s\"\n", dev_id);
162 }
163
164 assert(reps > 0);
165
166 AddBench b(n, numLayers, asyncLaunches, numCores, backendStr, dtypeStr,
167 dev_id);
168 auto times = bench(&b, reps);
169 printf("_,benchName,_,n,numLayers,numReps,numAsyncLaunches,numAddChains,"
170 "backendStr,dtypeStr,runtime,gbytesPerSecPerChain\n");
171 for (auto t : times) {
172 printf("BenchResult,AddBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%2.6lf,"
173 "%5.2lf\n",
174 n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr,
175 t / asyncLaunches, b.gbytes() * asyncLaunches / t);
176 }
177 double min = *(std::min_element(times.begin(), times.end()));
178 size_t midElt = times.size() / 2;
179 std::nth_element(times.begin(), times.begin() + midElt, times.end());
180 double median = times[midElt];
181 double median_runtime = median / ((double)asyncLaunches);
182 double min_runtime = min / ((double)asyncLaunches);
183 printf("_,benchName,_,n,numLayers,numReps,numAsyncLaunches,numAddChains,"
184 "backendStr,dtypeStr,medianRuntime,minRuntime,"
185 "medianGbytesPerSecPerChain,maxGbytesPerSecPerChain\n");
186 printf(
187 "BenchSummary,AddBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%2.6lf,%2.6lf,%"
188 "5.2lf, %5.2lf\n",
189 n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr,
190 median_runtime, min_runtime, b.gbytes() / median_runtime,
191 b.gbytes() / min_runtime);
192}
193