1 | /** |
2 | * Copyright (c) 2017-present, Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #include <algorithm> |
17 | #include <array> |
18 | #include <cstdlib> |
19 | #include <future> |
20 | #include <random> |
21 | |
22 | #include "Bench.h" |
23 | |
24 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
25 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
26 | |
27 | using namespace glow; |
28 | |
29 | /* |
30 | * This class implements a transpose microbenchmark. There are multiple |
31 | * layers of transpose, followed by an Add with the tensor from the previous |
32 | * layer. |
33 | * |
34 | * Microbenchmarks are generally useful for understanding performance |
 * through targeted experimentation and are not representative of
36 | * end-to-end workloads. |
37 | */ |
38 | class TransposeBench : public Benchmark { |
39 | dim_t batchSize_; |
40 | dim_t n_; |
41 | dim_t numLayers_; |
42 | std::unique_ptr<runtime::HostManager> hostManager_; |
43 | std::vector<std::unique_ptr<ExecutionContext>> contexts_; |
44 | dim_t asyncLaunchSize_; |
45 | dim_t numCores_; |
46 | const char *backendStr_; |
47 | ElemKind dtype_; |
48 | dim_t elementSize_; |
49 | const char *devId_; |
50 | |
51 | public: |
52 | TransposeBench(dim_t batchSize_, dim_t n_, dim_t numLayers_, |
53 | dim_t asyncLaunchSize_, dim_t numCores_, |
54 | const char *backendStr_, const char *dtypeStr_, |
55 | const char *devId_ = nullptr) |
56 | : batchSize_(batchSize_), n_(n_), numLayers_(numLayers_), |
57 | asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_), |
58 | backendStr_(backendStr_), devId_(devId_) { |
59 | |
60 | dtype_ = ElemKind::Float16Ty; |
61 | elementSize_ = 2; |
62 | if (std::string(dtypeStr_) == "Float16" ) { |
63 | dtype_ = ElemKind::Float16Ty; |
64 | elementSize_ = 2; |
65 | } else if (std::string(dtypeStr_) == "Float32" ) { |
66 | dtype_ = ElemKind::FloatTy; |
67 | elementSize_ = 4; |
68 | } |
69 | } |
70 | |
71 | void setup() override { |
72 | |
73 | // Create execution contexts here |
74 | for (dim_t i = 0; i < asyncLaunchSize_; i++) { |
75 | std::unique_ptr<ExecutionContext> context(new ExecutionContext); |
76 | contexts_.push_back(std::move(context)); |
77 | } |
78 | |
79 | // Setup host manager |
80 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
81 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
82 | if (devId_ != nullptr) { |
83 | config->parameters["DeviceID" ] = devId_; |
84 | } |
85 | configs.push_back(std::move(config)); |
86 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
87 | |
88 | std::unique_ptr<Module> mod(new Module); |
89 | auto fn = mod->createFunction("singleNode" ); |
90 | |
91 | std::vector<Placeholder *> input(numCores_); |
92 | std::vector<SaveNode *> S(numCores_); |
93 | auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_); |
94 | |
95 | for (dim_t core = 0; core < numCores_; core++) { |
96 | if (batchSizePerCore[core] == 0) |
97 | continue; |
98 | input[core] = |
99 | mod->createPlaceholder(dtype_, {batchSizePerCore[core], n_, n_}, |
100 | "A" + std::to_string(core), false); |
101 | } |
102 | |
103 | // Create multiple chains of Transpose and Add nodes |
104 | for (dim_t core = 0; core < numCores_; core++) { |
105 | if (batchSizePerCore[core] == 0) |
106 | continue; |
107 | // for each context, add input bindings |
108 | for (dim_t i = 0; i < asyncLaunchSize_; i++) { |
109 | if (dtype_ == ElemKind::FloatTy) { |
110 | contexts_[i] |
111 | ->getPlaceholderBindings() |
112 | ->allocate(input[core]) |
113 | ->getHandle<float>() |
114 | .randomize(0.0f, 1.0f, mod->getPRNG()); |
115 | } else if (dtype_ == ElemKind::Float16Ty) { |
116 | contexts_[i] |
117 | ->getPlaceholderBindings() |
118 | ->allocate(input[core]) |
119 | ->getHandle<float16_t>() |
120 | .randomize(0.0f, 1.0f, mod->getPRNG()); |
121 | } |
122 | } |
123 | |
124 | Node *cur = input[core]; |
125 | for (dim_t layer = 0; layer < numLayers_; layer++) { |
126 | auto *xp = fn->createTranspose("transpose_" + std::to_string(layer) + |
127 | "_" + std::to_string(core), |
128 | cur, {0, 2, 1}); |
129 | auto *ad = fn->createAdd("add_" + std::to_string(layer) + "_" + |
130 | std::to_string(core), |
131 | cur, xp); |
132 | cur = ad; |
133 | } |
134 | |
135 | S[core] = fn->createSave("save" , cur); |
136 | |
137 | // for each context, allocate output |
138 | for (dim_t i = 0; i < asyncLaunchSize_; i++) { |
139 | contexts_[i]->getPlaceholderBindings()->allocate( |
140 | S[core]->getPlaceholder()); |
141 | } |
142 | } |
143 | |
144 | CompilationContext ctx; |
145 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
146 | } |
147 | |
148 | void run() override { |
149 | std::vector<std::unique_ptr<ExecutionContext>> localContexts( |
150 | asyncLaunchSize_); |
151 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
152 | std::vector<std::future<void>> futures; |
153 | |
154 | // Launch a number of independent requests |
155 | int i = 0; |
156 | for (auto &promise : promises) { |
157 | futures.push_back(promise.get_future()); |
158 | hostManager_->runNetwork( |
159 | "singleNode" , std::move(contexts_[i]), |
160 | [&localContexts, &promise, |
161 | i](runtime::RunIdentifierTy, Error err, |
162 | std::unique_ptr<ExecutionContext> contextPtr) { |
163 | EXIT_ON_ERR(std::move(err)); |
164 | localContexts[i] = std::move(contextPtr); |
165 | promise.set_value(); |
166 | }); |
167 | i++; |
168 | } |
169 | for (auto &fut : futures) { |
170 | fut.wait(); |
171 | } |
172 | for (dim_t j = 0; j < asyncLaunchSize_; j++) { |
173 | contexts_[j] = std::move(localContexts[j]); |
174 | } |
175 | } |
176 | |
177 | void teardown() override {} |
178 | |
179 | // Each layer reads the tensor thrice, and writes the tensor twice |
180 | double gbytes() const { |
181 | return (5.0 * numLayers_ * batchSize_ * n_ * n_ * elementSize_) / 1e9; |
182 | } |
183 | }; |
184 | |
185 | int main(int argc, char *argv[]) { |
186 | printf("Transpose Microbenchmark\n" ); |
187 | printf("Usage: TransposeBench batchSize(Int) n(Int) numLayers(Int) " |
188 | "numReps(Int) numAsyncLaunches(Int) numTransposeChains(Int) " |
189 | "backendStr(String) dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n" ); |
190 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
191 | "environment variable\n" ); |
192 | benchParseGlowOpts(argc, argv); |
193 | assert(argc == 9 || argc == 10); |
194 | size_t batchSize = atoi(argv[1]); |
195 | size_t n = atoi(argv[2]); |
196 | size_t numLayers = atoi(argv[3]); |
197 | size_t numReps = atoi(argv[4]); |
198 | size_t numAsyncLaunches = atoi(argv[5]); |
199 | size_t numCores = atoi(argv[6]); |
200 | const char *backendStr = argv[7]; |
201 | const char *dtypeStr = argv[8]; |
202 | char *dev_id = nullptr; |
203 | |
204 | if (argc > 9) { |
205 | dev_id = argv[9]; |
206 | printf("Setting backend device: \"%s\"\n" , dev_id); |
207 | } |
208 | |
209 | assert(numReps > 0); |
210 | |
211 | TransposeBench b(batchSize, n, numLayers, numAsyncLaunches, numCores, |
212 | backendStr, dtypeStr, dev_id); |
213 | |
214 | auto times = bench(&b, numReps); |
215 | printf("_,benchName,_,batchSize,n,numLayers,numReps,numAsyncLaunches," |
216 | "numTransposeChains,backendStr,dtypeStr,runtime,gbytesPerSec\n" ); |
217 | for (auto t : times) { |
218 | printf( |
219 | "BenchResult,TransposeBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%f\n" , |
220 | batchSize, n, numLayers, numReps, numAsyncLaunches, numCores, |
221 | backendStr, dtypeStr, t / numAsyncLaunches, |
222 | b.gbytes() * numAsyncLaunches / t); |
223 | } |
224 | double min = *(std::min_element(times.begin(), times.end())); |
225 | size_t midElt = times.size() / 2; |
226 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
227 | double median = times[midElt]; |
228 | double median_runtime = median / ((double)numAsyncLaunches); |
229 | double min_runtime = min / ((double)numAsyncLaunches); |
230 | printf("_,benchName,_,batchSize,n,numLayers,numReps,numAsyncLaunches," |
231 | "numTransposeChains,backendStr,dtypeStr,medianRuntime,minRuntime," |
232 | "medianGbytesPerSec,maxGbytesPerSec\n" ); |
233 | printf( |
234 | "BenchSummary,TransposeBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%f,%f,%" |
235 | "f\n" , |
236 | batchSize, n, numLayers, numReps, numAsyncLaunches, numCores, backendStr, |
237 | dtypeStr, median_runtime, min_runtime, b.gbytes() / median_runtime, |
238 | b.gbytes() / min_runtime); |
239 | } |
240 | |