1/**
2 * Copyright (c) 2017-present, Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#include <algorithm>
17#include <array>
18#include <cstdlib>
19#include <future>
20#include <random>
21
22#include "Bench.h"
23
24#include "glow/ExecutionEngine/ExecutionEngine.h"
25#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26
27using namespace glow;
28
29/*
30 * This class implements a batch GEMM microbenchmark. Each layer contains a
31 * batch of (m x m) * (m x n) matrix multiplications. There are a number of
32 * layers which do successive GEMMs on the intermediate outputs (RHS) and
33 * inputs (LHS)
34 *
35 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
37 * end-to-end workloads.
38 */
class BatchGemmBench : public Benchmark {
  dim_t batchSize_; // Total GEMMs per layer, split across numCores_ chains.
  dim_t m_;         // LHS matrices are (m x m).
  dim_t n_;         // RHS matrices are (m x n).
  dim_t numLayers_; // Number of chained batch GEMMs per core.
  std::unique_ptr<runtime::HostManager> hostManager_;
  // One ExecutionContext per concurrent request; reused across run() calls.
  std::vector<std::unique_ptr<ExecutionContext>> contexts_;
  dim_t asyncLaunchSize_; // Number of requests kept in flight per run().
  dim_t numCores_;        // Number of independent batch-GEMM chains.
  const char *backendStr_;
  ElemKind dtype_;
  dim_t elementSize_; // Bytes per element of dtype_ (2 or 4).
  const char *devId_;

public:
  /// \p dtypeStr_ selects "Float16" or "Float32"; any other value silently
  /// falls back to Float16. \p devId_, if non-null, is forwarded to the
  /// device config as the "DeviceID" parameter.
  BatchGemmBench(dim_t batchSize_, dim_t m_, dim_t n_, dim_t numLayers_,
                 dim_t asyncLaunchSize_, dim_t numCores_,
                 const char *backendStr_, const char *dtypeStr_,
                 const char *devId_ = nullptr)
      : batchSize_(batchSize_), m_(m_), n_(n_), numLayers_(numLayers_),
        asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
        backendStr_(backendStr_), devId_(devId_) {

    // Default to Float16 when dtypeStr_ is not a recognized value.
    dtype_ = ElemKind::Float16Ty;
    elementSize_ = 2;
    if (std::string(dtypeStr_) == "Float16") {
      dtype_ = ElemKind::Float16Ty;
      elementSize_ = 2;
    } else if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      elementSize_ = 4;
    }
  }

  /// Builds one chain of numLayers_ batch GEMMs per core, allocates and
  /// randomizes the input tensors in every execution context, and compiles
  /// the module on the host manager under the network name "singleNode".
  void setup() override {

    // Create execution contexts here
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      std::unique_ptr<ExecutionContext> context(new ExecutionContext);
      contexts_.push_back(std::move(context));
    }

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    if (devId_ != nullptr) {
      config->parameters["DeviceID"] = devId_;
    }
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

    std::vector<Placeholder *> A(numCores_);
    std::vector<Placeholder *> B(numCores_);
    std::vector<SaveNode *> S(numCores_);

    // Calculate the batch size per core
    auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);

    // Cores whose share of the batch is zero get no placeholders/graph; the
    // same zero check below skips them consistently.
    for (dim_t core = 0; core < numCores_; core++) {
      if (batchSizePerCore[core] == 0)
        continue;
      A[core] = mod->createPlaceholder(dtype_, {batchSizePerCore[core], m_, m_},
                                       "A" + std::to_string(core), false);
      B[core] = mod->createPlaceholder(dtype_, {batchSizePerCore[core], m_, n_},
                                       "B" + std::to_string(core), false);
    }

    // for each context, add input bindings
    for (dim_t core = 0; core < numCores_; core++) {
      if (batchSizePerCore[core] == 0)
        continue;
      for (dim_t i = 0; i < asyncLaunchSize_; i++) {
        if (dtype_ == ElemKind::FloatTy) {
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(A[core])
              ->getHandle<float>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(B[core])
              ->getHandle<float>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
        } else if (dtype_ == ElemKind::Float16Ty) {
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(A[core])
              ->getHandle<float16_t>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(B[core])
              ->getHandle<float16_t>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
        }
      }

      // Chain the GEMMs: each layer multiplies A[core] by the previous
      // layer's output (the first layer uses B[core] as the RHS).
      Node *cur = B[core];
      for (dim_t layer = 0; layer < numLayers_; layer++) {
        auto *bmm = fn->createBatchMatMul(
            "batchmatmul" + std::to_string(layer) + "_" + std::to_string(core),
            A[core], cur);
        cur = bmm;
      }

      S[core] = fn->createSave("save" + std::to_string(core), cur);

      // for each context, add output bindings
      for (dim_t i = 0; i < asyncLaunchSize_; i++) {
        contexts_[i]->getPlaceholderBindings()->allocate(
            S[core]->getPlaceholder());
      }
    }

    CompilationContext ctx;
    ctx.dumpFinalGraph = true;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

  /// Launches asyncLaunchSize_ concurrent requests and blocks until all
  /// complete. Each context is moved into runNetwork and recovered from the
  /// completion callback so the contexts can be reused by the next run().
  void run() override {
    // Callbacks deposit the returned contexts here, indexed by request.
    std::vector<std::unique_ptr<ExecutionContext>> localContexts(
        asyncLaunchSize_);
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch a number of independent requests
    int i = 0;
    for (auto &promise : promises) {
      futures.push_back(promise.get_future());
      // Capturing &promise is safe: promises is not resized while requests
      // are in flight, and we wait on every future before returning.
      hostManager_->runNetwork(
          "singleNode", std::move(contexts_[i]),
          [&localContexts, &promise,
           i](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
            EXIT_ON_ERR(std::move(err));
            localContexts[i] = std::move(contextPtr);
            promise.set_value();
          });
      i++;
    }
    for (auto &fut : futures) {
      fut.wait();
    }
    // Restore ownership of the contexts for the next iteration.
    for (dim_t j = 0; j < asyncLaunchSize_; j++) {
      contexts_[j] = std::move(localContexts[j]);
    }
  }

  void teardown() override {}

  // Total GFLOPs for one request: each (m x m) * (m x n) GEMM costs
  // 2*m*m*n FLOPs, times numLayers_ layers and batchSize_ GEMMs per layer.
  double gflops() const {
    return 2.0 * m_ * m_ * n_ * numLayers_ * batchSize_ / 1e9;
  }
};
197
198int main(int argc, char *argv[]) {
199 printf("BatchGEMM Microbenchmark\n");
200 printf("Usage: BatchGemmBench batchSize(Int) m(Int) n(Int) numLayers(Int) "
201 "numReps(Int) numAsyncLaunches(Int) numBatchGEMMChains(Int) "
202 "backendStr(String) dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n");
203 printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
204 "environment variable\n");
205 benchParseGlowOpts(argc, argv);
206
207 assert(argc == 10 || argc == 11);
208 size_t batchSize = atoi(argv[1]);
209 size_t m = atoi(argv[2]);
210 size_t n = atoi(argv[3]);
211 size_t numLayers = atoi(argv[4]);
212 size_t numReps = atoi(argv[5]);
213 size_t numAsyncLaunches = atoi(argv[6]);
214 size_t numCores = atoi(argv[7]);
215 const char *backendStr = argv[8];
216 const char *dtypeStr = argv[9];
217 char *dev_id = nullptr;
218
219 if (argc > 10) {
220 dev_id = argv[10];
221 printf("Setting backend device: \"%s\"\n", dev_id);
222 }
223
224 assert(numReps > 0);
225
226 BatchGemmBench b(batchSize, m, n, numLayers, numAsyncLaunches, numCores,
227 backendStr, dtypeStr, dev_id);
228
229 auto times = bench(&b, numReps);
230 printf("_,benchName,_,batchSize,m,n,numLayers,numReps,numAsyncLaunches,"
231 "numBatchGEMMChains,backendStr,dtypeStr,runtime,gflopsPerSec\n");
232 for (auto t : times) {
233 printf("BenchResult,BatchGemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,"
234 "%f\n",
235 batchSize, m, n, numLayers, numReps, numAsyncLaunches, numCores,
236 backendStr, dtypeStr, t / numAsyncLaunches,
237 b.gflops() * numAsyncLaunches / t);
238 }
239 double min = *(std::min_element(times.begin(), times.end()));
240 size_t midElt = times.size() / 2;
241 std::nth_element(times.begin(), times.begin() + midElt, times.end());
242 double median = times[midElt];
243 double median_runtime = median / ((double)numAsyncLaunches);
244 double min_runtime = min / ((double)numAsyncLaunches);
245 printf("_,benchName,_,batchSize,m,n,numLayers,numReps,numAsyncLaunches,"
246 "numBatchGEMMChains,backendStr,dtypeStr,medianRuntime,minRuntime,"
247 "medianGflopsPerSec,maxGflopsPerSec\n");
248 printf("BenchSummary,BatchGemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%"
249 "f,%f,%"
250 "f\n",
251 batchSize, m, n, numLayers, numReps, numAsyncLaunches, numCores,
252 backendStr, dtypeStr, median_runtime, min_runtime,
253 b.gflops() / median_runtime, b.gflops() / min_runtime);
254}
255