/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <array>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;

/*
 * This class implements a Concat microbenchmark. A number of parallel
 * Concat nodes are created, one per core, and these are then chained
 * together in multiple layers.
 *
 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
 * end-to-end workloads.
 */
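// Shape of the generated graph (as built in setup() below): each input
// A_i (1 x n) is broadcast to (m x n); the broadcasts are interleaved
// pairwise and concatenated along dim 1 into an (m x n*numTensors) tensor.
// Each subsequent layer slices that tensor into alternating chunks,
// re-interleaves them, and concatenates again; finally a (1 x n*numTensors)
// slice of row 0 is saved to the output placeholder.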
class ConcatBench : public Benchmark {
  dim_t m_;
  dim_t n_;
  dim_t numTensors_;
  dim_t numLayers_;
  PlaceholderBindings bindings_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  size_t asyncLaunchSize_;
  const char *backendStr_;
  ElemKind dtype_;
  size_t elementSize_;
  const char *devId_;

public:
  ConcatBench(dim_t m_, dim_t n_, dim_t numTensors_, dim_t numLayers_,
              dim_t asyncLaunchSize_, const char *backendStr_,
              const char *dtypeStr_, const char *devId_ = nullptr)
      : m_(m_), n_(n_), numTensors_(numTensors_), numLayers_(numLayers_),
        asyncLaunchSize_(asyncLaunchSize_), backendStr_(backendStr_),
        devId_(devId_) {

    // Default to Float16; an unrecognized dtypeStr_ falls through to this.
    dtype_ = ElemKind::Float16Ty;
    elementSize_ = 2;
    if (std::string(dtypeStr_) == "Float16") {
      dtype_ = ElemKind::Float16Ty;
      elementSize_ = 2;
    } else if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      elementSize_ = 4;
    }
  }

  void setup() override {

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    if (devId_ != nullptr) {
      config->parameters["DeviceID"] = devId_;
    }
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");
    // Create multiple chains of Concat nodes
    std::vector<Placeholder *> A(numTensors_);
    std::vector<NodeValue> A_broadcast(numTensors_);
    std::vector<NodeValue> A_concat(numTensors_);
    std::vector<NodeValue> slices(numTensors_);

    Placeholder *output;

    for (size_t tensor = 0; tensor < numTensors_; tensor++) {
      A[tensor] = mod->createPlaceholder(dtype_, {1, n_},
                                         "A" + std::to_string(tensor), false);
      A_broadcast[tensor] = fn->createBroadcast(
          "A_bcast" + std::to_string(tensor), A[tensor], {m_, n_}, 0);
    }
    output =
        mod->createPlaceholder(dtype_, {1, n_ * numTensors_}, "output", false);

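    // Feed the broadcasts to the first Concat with adjacent pairs swapped
    // (0<->1, 2<->3, ...); this indexing assumes numTensors_ is even.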
    for (size_t tensor = 0; tensor < numTensors_; tensor++) {
      A_concat[tensor / 2 * 2 + ((tensor % 2) ? 0 : 1)] = A_broadcast[tensor];
    }
    auto *concat = fn->createConcat("concat_0", A_concat, 1);

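    // Each remaining layer re-slices the previous Concat into alternating
    // chunks of 3n/2 and n/2 columns per pair (n_ must be even for the
    // slices to tile exactly), swaps the pairs again, and concatenates the
    // slices to form the next layer's input.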
    for (size_t layer = 1; layer < numLayers_; layer++) {
      for (size_t tensor = 0; tensor < numTensors_; tensor++) {
        dim_t start_n =
            tensor / 2 * 2 * n_ + ((tensor % 2) ? (3 * n_ / 2) : (0));
        dim_t end_n = start_n + ((tensor % 2) ? (n_ / 2) : (3 * n_ / 2));
        slices[tensor] = fn->createSlice("slice_" + std::to_string(tensor),
                                         concat, {0, start_n}, {m_, end_n});
      }
      for (size_t tensor = 0; tensor < numTensors_; tensor++) {
        A_concat[tensor / 2 * 2 + ((tensor % 2) ? 0 : 1)] = slices[tensor];
      }
      concat = fn->createConcat("concat_" + std::to_string(layer), A_concat, 1);
    }
    Node *slice =
        fn->createSlice("slice_final", concat, {0, 0}, {1, n_ * numTensors_});
    fn->createSave("save", slice, output);
    CompilationContext ctx;
    ctx.dumpFinalGraph = true;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

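  // Each call to run() enqueues asyncLaunchSize_ concurrent requests and
  // waits for all of them; main() divides the measured time by
  // asyncLaunchSize_ to report per-launch runtime.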
  void run() override {
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch a number of independent requests
    for (auto &runPromise : promises) {
      std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
      futures.push_back(runPromise.get_future());
      hostManager_->runNetwork(
          "singleNode", std::move(contextPtr),
          [&runPromise](runtime::RunIdentifierTy, Error err,
                        std::unique_ptr<ExecutionContext> /* contextPtr */) {
            EXIT_ON_ERR(std::move(err));
            runPromise.set_value();
          });
    }
    for (auto &fut : futures) {
      fut.wait();
    }
  }

  void teardown() override {}

  // Approximate memory traffic: each of the numLayers_ layers produces one
  // m_ x (n_ * numTensors_) tensor of elementSize_-byte elements.
  double gbytes() const {
    return elementSize_ * m_ * n_ * numTensors_ * numLayers_ / 1e9;
  }
};

int main(int argc, char *argv[]) {
  printf("Concat Microbenchmark\n");
  printf("Usage: ConcatBench m(Int) n(Int) numTensors(Int) "
         "numLayers(Int) numReps(Int) "
         "numAsyncLaunches(Int) backendStr(String) "
         "dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n");
  printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
         "environment variable\n");
  benchParseGlowOpts(argc, argv);
  assert(argc == 9 || argc == 10);
  size_t m = atoi(argv[1]);
  size_t n = atoi(argv[2]);
  size_t numTensors = atoi(argv[3]);
  size_t numLayers = atoi(argv[4]);
  size_t reps = atoi(argv[5]);
  size_t asyncLaunches = atoi(argv[6]);
  const char *backendStr = argv[7];
  const char *dtypeStr = argv[8];
  char *dev_id = nullptr;

  if (argc > 9) {
    dev_id = argv[9];
    printf("Setting backend device: \"%s\"\n", dev_id);
  }

  assert(reps > 0);

  ConcatBench b(m, n, numTensors, numLayers, asyncLaunches, backendStr,
                dtypeStr, dev_id);
  auto times = bench(&b, reps);
  printf("_,benchName,_,m,n,numTensors,numLayers,numReps,numAsyncLaunches,"
         "backendStr,dtypeStr,runtime,gbytesPerSecPerChain\n");
  for (auto t : times) {
    printf("BenchResult,ConcatBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,"
           "%2.6lf,%5.2lf\n",
           m, n, numTensors, numLayers, reps, asyncLaunches, backendStr,
           dtypeStr, t / asyncLaunches, b.gbytes() * asyncLaunches / t);
  }
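  // Summarize across reps: minimum via min_element, median via nth_element,
  // both normalized to per-launch runtime.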
  double min = *(std::min_element(times.begin(), times.end()));
  size_t midElt = times.size() / 2;
  std::nth_element(times.begin(), times.begin() + midElt, times.end());
  double median = times[midElt];
  double median_runtime = median / ((double)asyncLaunches);
  double min_runtime = min / ((double)asyncLaunches);
  printf("_,benchName,_,m,n,numTensors,numLayers,numReps,numAsyncLaunches,"
         "backendStr,dtypeStr,medianRuntime,minRuntime,"
         "medianGbytesPerSecPerChain,maxGbytesPerSecPerChain\n");
  printf("BenchSummary,ConcatBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,"
         "%2.6lf,%2.6lf,%5.2lf,%5.2lf\n",
         m, n, numTensors, numLayers, reps, asyncLaunches, backendStr, dtypeStr,
         median_runtime, min_runtime, b.gbytes() / median_runtime,
         b.gbytes() / min_runtime);
}