1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26 | |
27 | using namespace glow; |
28 | |
29 | /* |
30 | * This class implements an add microbenchmark. There are a number of |
31 | * parallel Add nodes which are created, one per core. Then these are |
32 | * chained together in multiple layers. |
33 | * |
34 | * Microbenchmarks are generally useful for understanding performance |
 * through targeted experimentation and are not representative of
36 | * end-to-end workloads. |
37 | */ |
38 | class AddBench : public Benchmark { |
39 | dim_t n_; |
40 | dim_t numLayers_; |
41 | PlaceholderBindings bindings_; |
42 | std::unique_ptr<runtime::HostManager> hostManager_; |
43 | size_t asyncLaunchSize_; |
44 | size_t numCores_; |
45 | const char *backendStr_; |
46 | ElemKind dtype_; |
47 | size_t elementSize_; |
48 | const char *devId_; |
49 | |
50 | public: |
51 | AddBench(dim_t n_, dim_t numLayers_, dim_t asyncLaunchSize_, dim_t numCores_, |
52 | const char *backendStr_, const char *dtypeStr_, |
53 | const char *devId_ = nullptr) |
54 | : n_(n_), numLayers_(numLayers_), asyncLaunchSize_(asyncLaunchSize_), |
55 | numCores_(numCores_), backendStr_(backendStr_), devId_(devId_) { |
56 | |
57 | dtype_ = ElemKind::Float16Ty; |
58 | elementSize_ = 2; |
59 | if (std::string(dtypeStr_) == "Float16" ) { |
60 | dtype_ = ElemKind::Float16Ty; |
61 | elementSize_ = 2; |
62 | } else if (std::string(dtypeStr_) == "Float32" ) { |
63 | dtype_ = ElemKind::FloatTy; |
64 | elementSize_ = 4; |
65 | } |
66 | } |
67 | |
68 | void setup() override { |
69 | |
70 | // Setup host manager |
71 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
72 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
73 | if (devId_ != nullptr) { |
74 | config->parameters["DeviceID" ] = devId_; |
75 | } |
76 | configs.push_back(std::move(config)); |
77 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
78 | |
79 | std::unique_ptr<Module> mod(new Module); |
80 | auto fn = mod->createFunction("singleNode" ); |
81 | |
82 | // Create multiple chains of Add nodes |
83 | std::vector<Placeholder *> A(numCores_); |
84 | std::vector<Placeholder *> B(numCores_); |
85 | std::vector<Placeholder *> output(numCores_); |
86 | std::vector<Node *> cur(numCores_); |
87 | for (size_t core = 0; core < numCores_; core++) { |
88 | A[core] = mod->createPlaceholder(dtype_, {n_}, "A" + std::to_string(core), |
89 | false); |
90 | B[core] = mod->createPlaceholder(dtype_, {n_}, "B" + std::to_string(core), |
91 | false); |
92 | output[core] = mod->createPlaceholder( |
93 | dtype_, {n_}, "output" + std::to_string(core), false); |
94 | cur[core] = A[core]; |
95 | } |
96 | |
97 | std::vector<Node *> eltwise(numCores_); |
98 | for (size_t layer = 0; layer < numLayers_; layer++) { |
99 | for (size_t core = 0; core < numCores_; core++) { |
100 | eltwise[core] = fn->createAdd("eltwise" + std::to_string(core) + "_" + |
101 | std::to_string(layer), |
102 | cur[core], B[core]); |
103 | cur[core] = eltwise[core]; |
104 | } |
105 | } |
106 | for (size_t core = 0; core < numCores_; core++) { |
107 | fn->createSave("save" + std::to_string(core), cur[core], output[core]); |
108 | } |
109 | |
110 | CompilationContext ctx; |
111 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
112 | } |
113 | |
114 | void run() override { |
115 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
116 | std::vector<std::future<void>> futures; |
117 | |
118 | // Launch a number of independent requests |
119 | for (auto &runPromise : promises) { |
120 | std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext); |
121 | futures.push_back(runPromise.get_future()); |
122 | hostManager_->runNetwork( |
123 | "singleNode" , std::move(contextPtr), |
124 | [&runPromise](runtime::RunIdentifierTy, Error err, |
125 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
126 | EXIT_ON_ERR(std::move(err)); |
127 | runPromise.set_value(); |
128 | }); |
129 | } |
130 | for (auto &fut : futures) { |
131 | fut.wait(); |
132 | } |
133 | } |
134 | |
135 | void teardown() override {} |
136 | |
137 | // Two inputs per layer and one output |
138 | double gbytes() const { return elementSize_ * n_ * (3 * numLayers_) / 1e9; } |
139 | }; |
140 | |
141 | int main(int argc, char *argv[]) { |
142 | printf("Add Microbenchmark\n" ); |
143 | printf("Usage: AddBench n(Int) numLayers(Int) numReps(Int) " |
144 | "numAsyncLaunches(Int) numAddChains(Int) backendStr(String) " |
145 | "dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n" ); |
146 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
147 | "environment variable\n" ); |
148 | benchParseGlowOpts(argc, argv); |
149 | assert(argc == 8 || argc == 9); |
150 | size_t n = atoi(argv[1]); |
151 | size_t numLayers = atoi(argv[2]); |
152 | size_t reps = atoi(argv[3]); |
153 | size_t asyncLaunches = atoi(argv[4]); |
154 | size_t numCores = atoi(argv[5]); |
155 | const char *backendStr = argv[6]; |
156 | const char *dtypeStr = argv[7]; |
157 | char *dev_id = nullptr; |
158 | |
159 | if (argc > 8) { |
160 | dev_id = argv[8]; |
161 | printf("Setting backend device: \"%s\"\n" , dev_id); |
162 | } |
163 | |
164 | assert(reps > 0); |
165 | |
166 | AddBench b(n, numLayers, asyncLaunches, numCores, backendStr, dtypeStr, |
167 | dev_id); |
168 | auto times = bench(&b, reps); |
169 | printf("_,benchName,_,n,numLayers,numReps,numAsyncLaunches,numAddChains," |
170 | "backendStr,dtypeStr,runtime,gbytesPerSecPerChain\n" ); |
171 | for (auto t : times) { |
172 | printf("BenchResult,AddBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%2.6lf," |
173 | "%5.2lf\n" , |
174 | n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr, |
175 | t / asyncLaunches, b.gbytes() * asyncLaunches / t); |
176 | } |
177 | double min = *(std::min_element(times.begin(), times.end())); |
178 | size_t midElt = times.size() / 2; |
179 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
180 | double median = times[midElt]; |
181 | double median_runtime = median / ((double)asyncLaunches); |
182 | double min_runtime = min / ((double)asyncLaunches); |
183 | printf("_,benchName,_,n,numLayers,numReps,numAsyncLaunches,numAddChains," |
184 | "backendStr,dtypeStr,medianRuntime,minRuntime," |
185 | "medianGbytesPerSecPerChain,maxGbytesPerSecPerChain\n" ); |
186 | printf( |
187 | "BenchSummary,AddBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%2.6lf,%2.6lf,%" |
188 | "5.2lf, %5.2lf\n" , |
189 | n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr, |
190 | median_runtime, min_runtime, b.gbytes() / median_runtime, |
191 | b.gbytes() / min_runtime); |
192 | } |
193 | |