1/**
2 * Copyright (c) 2017-present, Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>
21
22#include "Bench.h"
23
24#include "glow/ExecutionEngine/ExecutionEngine.h"
25#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26
27using namespace glow;
28
29/*
30 * This class implements a transpose microbenchmark. There are multiple
31 * layers of transpose, followed by an Add with the tensor from the previous
32 * layer.
33 *
34 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
36 * end-to-end workloads.
37 */
38class TransposeBench : public Benchmark {
39 dim_t batchSize_;
40 dim_t n_;
41 dim_t numLayers_;
42 std::unique_ptr<runtime::HostManager> hostManager_;
43 std::vector<std::unique_ptr<ExecutionContext>> contexts_;
44 dim_t asyncLaunchSize_;
45 dim_t numCores_;
46 const char *backendStr_;
47 ElemKind dtype_;
48 dim_t elementSize_;
49 const char *devId_;
50
51public:
52 TransposeBench(dim_t batchSize_, dim_t n_, dim_t numLayers_,
53 dim_t asyncLaunchSize_, dim_t numCores_,
54 const char *backendStr_, const char *dtypeStr_,
55 const char *devId_ = nullptr)
56 : batchSize_(batchSize_), n_(n_), numLayers_(numLayers_),
57 asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
58 backendStr_(backendStr_), devId_(devId_) {
59
60 dtype_ = ElemKind::Float16Ty;
61 elementSize_ = 2;
62 if (std::string(dtypeStr_) == "Float16") {
63 dtype_ = ElemKind::Float16Ty;
64 elementSize_ = 2;
65 } else if (std::string(dtypeStr_) == "Float32") {
66 dtype_ = ElemKind::FloatTy;
67 elementSize_ = 4;
68 }
69 }
70
71 void setup() override {
72
73 // Create execution contexts here
74 for (dim_t i = 0; i < asyncLaunchSize_; i++) {
75 std::unique_ptr<ExecutionContext> context(new ExecutionContext);
76 contexts_.push_back(std::move(context));
77 }
78
79 // Setup host manager
80 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
81 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
82 if (devId_ != nullptr) {
83 config->parameters["DeviceID"] = devId_;
84 }
85 configs.push_back(std::move(config));
86 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
87
88 std::unique_ptr<Module> mod(new Module);
89 auto fn = mod->createFunction("singleNode");
90
91 std::vector<Placeholder *> input(numCores_);
92 std::vector<SaveNode *> S(numCores_);
93 auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);
94
95 for (dim_t core = 0; core < numCores_; core++) {
96 if (batchSizePerCore[core] == 0)
97 continue;
98 input[core] =
99 mod->createPlaceholder(dtype_, {batchSizePerCore[core], n_, n_},
100 "A" + std::to_string(core), false);
101 }
102
103 // Create multiple chains of Transpose and Add nodes
104 for (dim_t core = 0; core < numCores_; core++) {
105 if (batchSizePerCore[core] == 0)
106 continue;
107 // for each context, add input bindings
108 for (dim_t i = 0; i < asyncLaunchSize_; i++) {
109 if (dtype_ == ElemKind::FloatTy) {
110 contexts_[i]
111 ->getPlaceholderBindings()
112 ->allocate(input[core])
113 ->getHandle<float>()
114 .randomize(0.0f, 1.0f, mod->getPRNG());
115 } else if (dtype_ == ElemKind::Float16Ty) {
116 contexts_[i]
117 ->getPlaceholderBindings()
118 ->allocate(input[core])
119 ->getHandle<float16_t>()
120 .randomize(0.0f, 1.0f, mod->getPRNG());
121 }
122 }
123
124 Node *cur = input[core];
125 for (dim_t layer = 0; layer < numLayers_; layer++) {
126 auto *xp = fn->createTranspose("transpose_" + std::to_string(layer) +
127 "_" + std::to_string(core),
128 cur, {0, 2, 1});
129 auto *ad = fn->createAdd("add_" + std::to_string(layer) + "_" +
130 std::to_string(core),
131 cur, xp);
132 cur = ad;
133 }
134
135 S[core] = fn->createSave("save", cur);
136
137 // for each context, allocate output
138 for (dim_t i = 0; i < asyncLaunchSize_; i++) {
139 contexts_[i]->getPlaceholderBindings()->allocate(
140 S[core]->getPlaceholder());
141 }
142 }
143
144 CompilationContext ctx;
145 EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
146 }
147
148 void run() override {
149 std::vector<std::unique_ptr<ExecutionContext>> localContexts(
150 asyncLaunchSize_);
151 std::vector<std::promise<void>> promises(asyncLaunchSize_);
152 std::vector<std::future<void>> futures;
153
154 // Launch a number of independent requests
155 int i = 0;
156 for (auto &promise : promises) {
157 futures.push_back(promise.get_future());
158 hostManager_->runNetwork(
159 "singleNode", std::move(contexts_[i]),
160 [&localContexts, &promise,
161 i](runtime::RunIdentifierTy, Error err,
162 std::unique_ptr<ExecutionContext> contextPtr) {
163 EXIT_ON_ERR(std::move(err));
164 localContexts[i] = std::move(contextPtr);
165 promise.set_value();
166 });
167 i++;
168 }
169 for (auto &fut : futures) {
170 fut.wait();
171 }
172 for (dim_t j = 0; j < asyncLaunchSize_; j++) {
173 contexts_[j] = std::move(localContexts[j]);
174 }
175 }
176
177 void teardown() override {}
178
179 // Each layer reads the tensor thrice, and writes the tensor twice
180 double gbytes() const {
181 return (5.0 * numLayers_ * batchSize_ * n_ * n_ * elementSize_) / 1e9;
182 }
183};
184
185int main(int argc, char *argv[]) {
186 printf("Transpose Microbenchmark\n");
187 printf("Usage: TransposeBench batchSize(Int) n(Int) numLayers(Int) "
188 "numReps(Int) numAsyncLaunches(Int) numTransposeChains(Int) "
189 "backendStr(String) dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n");
190 printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
191 "environment variable\n");
192 benchParseGlowOpts(argc, argv);
193 assert(argc == 9 || argc == 10);
194 size_t batchSize = atoi(argv[1]);
195 size_t n = atoi(argv[2]);
196 size_t numLayers = atoi(argv[3]);
197 size_t numReps = atoi(argv[4]);
198 size_t numAsyncLaunches = atoi(argv[5]);
199 size_t numCores = atoi(argv[6]);
200 const char *backendStr = argv[7];
201 const char *dtypeStr = argv[8];
202 char *dev_id = nullptr;
203
204 if (argc > 9) {
205 dev_id = argv[9];
206 printf("Setting backend device: \"%s\"\n", dev_id);
207 }
208
209 assert(numReps > 0);
210
211 TransposeBench b(batchSize, n, numLayers, numAsyncLaunches, numCores,
212 backendStr, dtypeStr, dev_id);
213
214 auto times = bench(&b, numReps);
215 printf("_,benchName,_,batchSize,n,numLayers,numReps,numAsyncLaunches,"
216 "numTransposeChains,backendStr,dtypeStr,runtime,gbytesPerSec\n");
217 for (auto t : times) {
218 printf(
219 "BenchResult,TransposeBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%f\n",
220 batchSize, n, numLayers, numReps, numAsyncLaunches, numCores,
221 backendStr, dtypeStr, t / numAsyncLaunches,
222 b.gbytes() * numAsyncLaunches / t);
223 }
224 double min = *(std::min_element(times.begin(), times.end()));
225 size_t midElt = times.size() / 2;
226 std::nth_element(times.begin(), times.begin() + midElt, times.end());
227 double median = times[midElt];
228 double median_runtime = median / ((double)numAsyncLaunches);
229 double min_runtime = min / ((double)numAsyncLaunches);
230 printf("_,benchName,_,batchSize,n,numLayers,numReps,numAsyncLaunches,"
231 "numTransposeChains,backendStr,dtypeStr,medianRuntime,minRuntime,"
232 "medianGbytesPerSec,maxGbytesPerSec\n");
233 printf(
234 "BenchSummary,TransposeBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%f,%f,%"
235 "f\n",
236 batchSize, n, numLayers, numReps, numAsyncLaunches, numCores, backendStr,
237 dtypeStr, median_runtime, min_runtime, b.gbytes() / median_runtime,
238 b.gbytes() / min_runtime);
239}
240