1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>
21 | |
22 | #include "Bench.h" |
23 | |
24 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
25 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
26 | |
27 | using namespace glow; |
28 | |
/*
 * This class implements a Concat microbenchmark. There are a number of
 * parallel Concat nodes which are created, one per core. Then these are
 * chained together in multiple layers.
 *
 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
 * end-to-end workloads.
 */
38 | class ConcatBench : public Benchmark { |
39 | dim_t m_; |
40 | dim_t n_; |
41 | dim_t numTensors_; |
42 | dim_t numLayers_; |
43 | PlaceholderBindings bindings_; |
44 | std::unique_ptr<runtime::HostManager> hostManager_; |
45 | size_t asyncLaunchSize_; |
46 | const char *backendStr_; |
47 | ElemKind dtype_; |
48 | size_t elementSize_; |
49 | const char *devId_; |
50 | |
51 | public: |
52 | ConcatBench(dim_t m_, dim_t n_, dim_t numTensors_, dim_t numLayers_, |
53 | dim_t asyncLaunchSize_, const char *backendStr_, |
54 | const char *dtypeStr_, const char *devId_ = nullptr) |
55 | : m_(m_), n_(n_), numTensors_(numTensors_), numLayers_(numLayers_), |
56 | asyncLaunchSize_(asyncLaunchSize_), backendStr_(backendStr_), |
57 | devId_(devId_) { |
58 | |
59 | dtype_ = ElemKind::Float16Ty; |
60 | elementSize_ = 2; |
61 | if (std::string(dtypeStr_) == "Float16" ) { |
62 | dtype_ = ElemKind::Float16Ty; |
63 | elementSize_ = 2; |
64 | } else if (std::string(dtypeStr_) == "Float32" ) { |
65 | dtype_ = ElemKind::FloatTy; |
66 | elementSize_ = 4; |
67 | } |
68 | } |
69 | |
70 | void setup() override { |
71 | |
72 | // Setup host manager |
73 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
74 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
75 | if (devId_ != nullptr) { |
76 | config->parameters["DeviceID" ] = devId_; |
77 | } |
78 | configs.push_back(std::move(config)); |
79 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
80 | |
81 | std::unique_ptr<Module> mod(new Module); |
82 | auto fn = mod->createFunction("singleNode" ); |
83 | // Create multiple chains of Concat nodes |
84 | std::vector<Placeholder *> A(numTensors_); |
85 | std::vector<NodeValue> A_broadcast(numTensors_); |
86 | std::vector<NodeValue> A_concat(numTensors_); |
87 | std::vector<NodeValue> slices(numTensors_); |
88 | |
89 | Placeholder *output; |
90 | |
91 | for (size_t tensor = 0; tensor < numTensors_; tensor++) { |
92 | A[tensor] = mod->createPlaceholder(dtype_, {1, n_}, |
93 | "A" + std::to_string(tensor), false); |
94 | A_broadcast[tensor] = fn->createBroadcast( |
95 | "A_bcast" + std::to_string(tensor), A[tensor], {m_, n_}, 0); |
96 | } |
97 | output = |
98 | mod->createPlaceholder(dtype_, {1, n_ * numTensors_}, "output" , false); |
99 | |
100 | for (size_t tensor = 0; tensor < numTensors_; tensor++) { |
101 | A_concat[tensor / 2 * 2 + ((tensor % 2) ? 0 : 1)] = A_broadcast[tensor]; |
102 | } |
103 | auto *concat = fn->createConcat("concat_0" , A_concat, 1); |
104 | |
105 | for (size_t layer = 1; layer < numLayers_; layer++) { |
106 | for (size_t tensor = 0; tensor < numTensors_; tensor++) { |
107 | dim_t start_n = |
108 | tensor / 2 * 2 * n_ + ((tensor % 2) ? (3 * n_ / 2) : (0)); |
109 | dim_t end_n = start_n + ((tensor % 2) ? (n_ / 2) : (3 * n_ / 2)); |
110 | slices[tensor] = fn->createSlice("slice_" + std::to_string(tensor), |
111 | concat, {0, start_n}, {m_, end_n}); |
112 | } |
113 | for (size_t tensor = 0; tensor < numTensors_; tensor++) { |
114 | A_concat[tensor / 2 * 2 + ((tensor % 2) ? 0 : 1)] = slices[tensor]; |
115 | } |
116 | concat = fn->createConcat("concat_" + std::to_string(layer), A_concat, 1); |
117 | } |
118 | Node *slice = |
119 | fn->createSlice("slice_final" , concat, {0, 0}, {1, n_ * numTensors_}); |
120 | fn->createSave("save" , slice, output); |
121 | CompilationContext ctx; |
122 | ctx.dumpFinalGraph = true; |
123 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
124 | } |
125 | |
126 | void run() override { |
127 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
128 | std::vector<std::future<void>> futures; |
129 | |
130 | // Launch a number of independent requests |
131 | for (auto &runPromise : promises) { |
132 | std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext); |
133 | futures.push_back(runPromise.get_future()); |
134 | hostManager_->runNetwork( |
135 | "singleNode" , std::move(contextPtr), |
136 | [&runPromise](runtime::RunIdentifierTy, Error err, |
137 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
138 | EXIT_ON_ERR(std::move(err)); |
139 | runPromise.set_value(); |
140 | }); |
141 | } |
142 | for (auto &fut : futures) { |
143 | fut.wait(); |
144 | } |
145 | } |
146 | |
147 | void teardown() override {} |
148 | |
149 | // Two inputs per layer and one output |
150 | double gbytes() const { |
151 | return elementSize_ * m_ * n_ * numTensors_ * numLayers_ / 1e9; |
152 | } |
153 | }; |
154 | |
155 | int main(int argc, char *argv[]) { |
156 | printf("Concat Microbenchmark\n" ); |
157 | printf("Usage: ConcatBench m(Int) n(Int) numTensors(Int) " |
158 | "numLayers(Int) numReps(Int) " |
159 | "numAsyncLaunches(Int) backendStr(String) " |
160 | "dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n" ); |
161 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
162 | "environment variable\n" ); |
163 | benchParseGlowOpts(argc, argv); |
164 | assert(argc == 9 || argc == 10); |
165 | size_t m = atoi(argv[1]); |
166 | size_t n = atoi(argv[2]); |
167 | size_t numTensors = atoi(argv[3]); |
168 | size_t numLayers = atoi(argv[4]); |
169 | size_t reps = atoi(argv[5]); |
170 | size_t asyncLaunches = atoi(argv[6]); |
171 | const char *backendStr = argv[7]; |
172 | const char *dtypeStr = argv[8]; |
173 | char *dev_id = nullptr; |
174 | |
175 | if (argc > 9) { |
176 | dev_id = argv[9]; |
177 | printf("Setting backend device: \"%s\"\n" , dev_id); |
178 | } |
179 | |
180 | assert(reps > 0); |
181 | |
182 | ConcatBench b(m, n, numTensors, numLayers, asyncLaunches, backendStr, |
183 | dtypeStr, dev_id); |
184 | auto times = bench(&b, reps); |
185 | printf("_,benchName,_,m,n,numTensors,numLayers,numReps,numAsyncLaunches," |
186 | "backendStr,dtypeStr,runtime,gbytesPerSecPerChain\n" ); |
187 | for (auto t : times) { |
188 | printf("BenchResult,ConcatBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s," |
189 | "%2.6lf,%5.2lf\n" , |
190 | m, n, numTensors, numLayers, reps, asyncLaunches, backendStr, |
191 | dtypeStr, t / asyncLaunches, b.gbytes() * asyncLaunches / t); |
192 | } |
193 | double min = *(std::min_element(times.begin(), times.end())); |
194 | size_t midElt = times.size() / 2; |
195 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
196 | double median = times[midElt]; |
197 | double median_runtime = median / ((double)asyncLaunches); |
198 | double min_runtime = min / ((double)asyncLaunches); |
199 | printf("_,benchName,_,m,n,numTensors,numLayers,numReps,numAsyncLaunches," |
200 | "backendStr,dtypeStr,medianRuntime,minRuntime," |
201 | "medianGbytesPerSecPerChain,maxGbytesPerSecPerChain\n" ); |
202 | printf("BenchSummary,ConcatBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s," |
203 | "%2.6lf,%2.6lf,%" |
204 | "5.2lf, %5.2lf\n" , |
205 | m, n, numTensors, numLayers, reps, asyncLaunches, backendStr, dtypeStr, |
206 | median_runtime, min_runtime, b.gbytes() / median_runtime, |
207 | b.gbytes() / min_runtime); |
208 | } |
209 | |