1 | /** |
2 | * Copyright (c) 2017-present, Facebook, Inc. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26 | |
27 | using namespace glow; |
28 | |
/*
 * This class implements a batch GEMM microbenchmark. Each layer contains a
 * batch of (m x m) * (m x n) matrix multiplications. There are a number of
 * layers which do successive GEMMs on the intermediate outputs (RHS) and
 * inputs (LHS).
 *
 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
 * end-to-end workloads.
 */
class BatchGemmBench : public Benchmark {
  // Total number of (m x m) * (m x n) multiplications per layer; split
  // across numCores_ independent chains by getBatchSizePerCore().
  dim_t batchSize_;
  dim_t m_;
  dim_t n_;
  // Number of chained BatchMatMul layers per core.
  dim_t numLayers_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  // One ExecutionContext per concurrent (async) in-flight request.
  std::vector<std::unique_ptr<ExecutionContext>> contexts_;
  dim_t asyncLaunchSize_;
  dim_t numCores_;
  const char *backendStr_;
  ElemKind dtype_;
  // Bytes per element of dtype_ (2 for Float16, 4 for Float32).
  // NOTE(review): assigned in the constructor but never read in this file.
  dim_t elementSize_;
  const char *devId_;

public:
  /// Constructor parameters deliberately shadow the member names; members
  /// are initialized through the mem-initializer list.
  /// \p dtypeStr_ selects "Float16" or "Float32"; anything else silently
  /// falls back to the Float16 default set below.
  BatchGemmBench(dim_t batchSize_, dim_t m_, dim_t n_, dim_t numLayers_,
                 dim_t asyncLaunchSize_, dim_t numCores_,
                 const char *backendStr_, const char *dtypeStr_,
                 const char *devId_ = nullptr)
      : batchSize_(batchSize_), m_(m_), n_(n_), numLayers_(numLayers_),
        asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
        backendStr_(backendStr_), devId_(devId_) {

    // Default: Float16 (also the fallback for unrecognized dtype strings).
    dtype_ = ElemKind::Float16Ty;
    elementSize_ = 2;
    if (std::string(dtypeStr_) == "Float16") {
      dtype_ = ElemKind::Float16Ty;
      elementSize_ = 2;
    } else if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      elementSize_ = 4;
    }
  }

  /// Builds the module (numCores_ independent chains of numLayers_
  /// BatchMatMul nodes), randomizes inputs for every context, allocates
  /// output tensors, and compiles the network on the host manager.
  void setup() override {

    // Create execution contexts here
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      std::unique_ptr<ExecutionContext> context(new ExecutionContext);
      contexts_.push_back(std::move(context));
    }

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    if (devId_ != nullptr) {
      config->parameters["DeviceID"] = devId_;
    }
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

    // Per-core LHS placeholders (A), RHS placeholders (B), and the save
    // node terminating each chain.
    std::vector<Placeholder *> A(numCores_);
    std::vector<Placeholder *> B(numCores_);
    std::vector<SaveNode *> S(numCores_);

    // Calculate the batch size per core
    auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);

    for (dim_t core = 0; core < numCores_; core++) {
      // Cores whose share of the batch is empty get no nodes at all.
      if (batchSizePerCore[core] == 0)
        continue;
      A[core] = mod->createPlaceholder(dtype_, {batchSizePerCore[core], m_, m_},
                                       "A" + std::to_string(core), false);
      B[core] = mod->createPlaceholder(dtype_, {batchSizePerCore[core], m_, n_},
                                       "B" + std::to_string(core), false);
    }

    // for each context, add input bindings
    for (dim_t core = 0; core < numCores_; core++) {
      if (batchSizePerCore[core] == 0)
        continue;
      for (dim_t i = 0; i < asyncLaunchSize_; i++) {
        // Fill A and B with uniform random values in [0, 1], using the
        // handle type that matches dtype_.
        if (dtype_ == ElemKind::FloatTy) {
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(A[core])
              ->getHandle<float>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(B[core])
              ->getHandle<float>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
        } else if (dtype_ == ElemKind::Float16Ty) {
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(A[core])
              ->getHandle<float16_t>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(B[core])
              ->getHandle<float16_t>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
        }
      }

      // Chain numLayers_ batched GEMMs: each layer multiplies the fixed
      // LHS A[core] by the previous layer's output (initially B[core]).
      Node *cur = B[core];
      for (dim_t layer = 0; layer < numLayers_; layer++) {
        auto *bmm = fn->createBatchMatMul(
            "batchmatmul" + std::to_string(layer) + "_" + std::to_string(core),
            A[core], cur);
        cur = bmm;
      }

      S[core] = fn->createSave("save" + std::to_string(core), cur);

      // for each context, add output bindings
      for (dim_t i = 0; i < asyncLaunchSize_; i++) {
        contexts_[i]->getPlaceholderBindings()->allocate(
            S[core]->getPlaceholder());
      }
    }

    CompilationContext ctx;
    ctx.dumpFinalGraph = true;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

  /// Launches asyncLaunchSize_ concurrent inference requests and blocks
  /// until all complete. Contexts are moved into runNetwork() and handed
  /// back to contexts_ by the completion callbacks.
  void run() override {
    std::vector<std::unique_ptr<ExecutionContext>> localContexts(
        asyncLaunchSize_);
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch a number of independent requests
    int i = 0;
    for (auto &promise : promises) {
      futures.push_back(promise.get_future());
      // Capturing `promise` and `localContexts` by reference is safe: both
      // outlive the callback because we block on the futures below. `i` is
      // captured by value so each callback writes its own slot.
      hostManager_->runNetwork(
          "singleNode", std::move(contexts_[i]),
          [&localContexts, &promise,
           i](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
            EXIT_ON_ERR(std::move(err));
            localContexts[i] = std::move(contextPtr);
            promise.set_value();
          });
      i++;
    }
    // Wait for all in-flight requests to finish.
    for (auto &fut : futures) {
      fut.wait();
    }
    // Reclaim the contexts so the next run() can reuse them.
    for (dim_t j = 0; j < asyncLaunchSize_; j++) {
      contexts_[j] = std::move(localContexts[j]);
    }
  }

  void teardown() override {}

  // GFLOPs per request: numLayers_ batched GEMMs, each performing
  // batchSize_ (m x m) * (m x n) multiplications at 2*m*m*n FLOPs apiece.
  // (2.0 leads the expression so the arithmetic is done in double.)
  double gflops() const {
    return 2.0 * m_ * m_ * n_ * numLayers_ * batchSize_ / 1e9;
  }
};
197 | |
198 | int main(int argc, char *argv[]) { |
199 | printf("BatchGEMM Microbenchmark\n" ); |
200 | printf("Usage: BatchGemmBench batchSize(Int) m(Int) n(Int) numLayers(Int) " |
201 | "numReps(Int) numAsyncLaunches(Int) numBatchGEMMChains(Int) " |
202 | "backendStr(String) dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n" ); |
203 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
204 | "environment variable\n" ); |
205 | benchParseGlowOpts(argc, argv); |
206 | |
207 | assert(argc == 10 || argc == 11); |
208 | size_t batchSize = atoi(argv[1]); |
209 | size_t m = atoi(argv[2]); |
210 | size_t n = atoi(argv[3]); |
211 | size_t numLayers = atoi(argv[4]); |
212 | size_t numReps = atoi(argv[5]); |
213 | size_t numAsyncLaunches = atoi(argv[6]); |
214 | size_t numCores = atoi(argv[7]); |
215 | const char *backendStr = argv[8]; |
216 | const char *dtypeStr = argv[9]; |
217 | char *dev_id = nullptr; |
218 | |
219 | if (argc > 10) { |
220 | dev_id = argv[10]; |
221 | printf("Setting backend device: \"%s\"\n" , dev_id); |
222 | } |
223 | |
224 | assert(numReps > 0); |
225 | |
226 | BatchGemmBench b(batchSize, m, n, numLayers, numAsyncLaunches, numCores, |
227 | backendStr, dtypeStr, dev_id); |
228 | |
229 | auto times = bench(&b, numReps); |
230 | printf("_,benchName,_,batchSize,m,n,numLayers,numReps,numAsyncLaunches," |
231 | "numBatchGEMMChains,backendStr,dtypeStr,runtime,gflopsPerSec\n" ); |
232 | for (auto t : times) { |
233 | printf("BenchResult,BatchGemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f," |
234 | "%f\n" , |
235 | batchSize, m, n, numLayers, numReps, numAsyncLaunches, numCores, |
236 | backendStr, dtypeStr, t / numAsyncLaunches, |
237 | b.gflops() * numAsyncLaunches / t); |
238 | } |
239 | double min = *(std::min_element(times.begin(), times.end())); |
240 | size_t midElt = times.size() / 2; |
241 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
242 | double median = times[midElt]; |
243 | double median_runtime = median / ((double)numAsyncLaunches); |
244 | double min_runtime = min / ((double)numAsyncLaunches); |
245 | printf("_,benchName,_,batchSize,m,n,numLayers,numReps,numAsyncLaunches," |
246 | "numBatchGEMMChains,backendStr,dtypeStr,medianRuntime,minRuntime," |
247 | "medianGflopsPerSec,maxGflopsPerSec\n" ); |
248 | printf("BenchSummary,BatchGemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%" |
249 | "f,%f,%" |
250 | "f\n" , |
251 | batchSize, m, n, numLayers, numReps, numAsyncLaunches, numCores, |
252 | backendStr, dtypeStr, median_runtime, min_runtime, |
253 | b.gflops() / median_runtime, b.gflops() / min_runtime); |
254 | } |
255 | |