1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
#include <array>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>
20
21#include "Bench.h"
22
23#include "ConvUtils.h"
24#include "glow/ExecutionEngine/ExecutionEngine.h"
25#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26
27using namespace glow;
28using namespace std;
29
30vector<vector<conv_param_t<3>>> shapes_3d = {
31 // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h,
32 // stride_w},
33 // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right}
34 // Regular
35 {conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1},
36 {1, 1, 1, 1, 1, 1})},
37
38 // Groupwise
39 {conv_param_t<3>(32, 192, 192, {2, 28, 28}, 96, {3, 3, 3}, {1, 1, 1},
40 {1, 1, 1, 1, 1, 1})},
41 {conv_param_t<3>(32, 192, 192, {1, 14, 14}, 96, {3, 3, 3}, {1, 1, 1},
42 {1, 1, 1, 1, 1, 1})},
43 {conv_param_t<3>(32, 384, 384, {1, 14, 14}, 192, {3, 3, 3}, {1, 1, 1},
44 {1, 1, 1, 1, 1, 1})},
45 {conv_param_t<3>(32, 384, 384, {1, 7, 7}, 192, {3, 3, 3}, {1, 1, 1},
46 {1, 1, 1, 1, 1, 1})},
47
48 {conv_param_t<3>(32, 16, 16, {4, 56, 56}, 8, {3, 3, 3}, {1, 1, 1},
49 {1, 1, 1, 1, 1, 1})},
50 {conv_param_t<3>(32, 16, 16, {2, 28, 28}, 8, {3, 3, 3}, {1, 1, 1},
51 {1, 1, 1, 1, 1, 1})},
52 {conv_param_t<3>(32, 32, 32, {4, 56, 56}, 16, {3, 3, 3}, {1, 1, 1},
53 {1, 1, 1, 1, 1, 1})},
54 {conv_param_t<3>(32, 32, 32, {2, 28, 28}, 16, {3, 3, 3}, {1, 1, 1},
55 {1, 1, 1, 1, 1, 1})},
56 {conv_param_t<3>(32, 32, 32, {2, 28, 28}, 16, {3, 3, 3}, {1, 1, 1},
57 {1, 1, 1, 1, 1, 1})},
58 {conv_param_t<3>(32, 32, 32, {1, 14, 14}, 16, {3, 3, 3}, {1, 1, 1},
59 {1, 1, 1, 1, 1, 1})},
60 {conv_param_t<3>(32, 128, 128, {2, 28, 28}, 32, {3, 3, 3}, {1, 1, 1},
61 {1, 1, 1, 1, 1, 1})},
62 {conv_param_t<3>(32, 128, 128, {1, 14, 14}, 32, {3, 3, 3}, {1, 1, 1},
63 {1, 1, 1, 1, 1, 1})},
64 {conv_param_t<3>(32, 256, 256, {1, 14, 14}, 64, {3, 3, 3}, {1, 1, 1},
65 {1, 1, 1, 1, 1, 1})},
66 {conv_param_t<3>(32, 256, 256, {1, 7, 7}, 64, {3, 3, 3}, {1, 1, 1},
67 {1, 1, 1, 1, 1, 1})},
68
69 // Depthwise
70 {conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1},
71 {1, 1, 1, 1, 1, 1})},
72
73 // Pointwise
74 {conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1},
75 {0, 0, 0, 0})},
76 // bottleneck blocks
77 {conv_param_t<3>(1, 192, 192, {1, 14, 14}, 96, {3, 3, 3}, {1, 1, 1},
78 {1, 1, 1, 1, 1, 1}),
79 conv_param_t<3>(1, 192, 1024, {1, 14, 14}, 1, {1, 1, 1}, {1, 1, 1},
80 {0, 0, 0, 0, 0, 0}),
81 conv_param_t<3>(1, 1024, 192, {1, 14, 14}, 1, {1, 1, 1}, {1, 1, 1},
82 {0, 0, 0, 0, 0, 0})}
83
84};
85
86/*
87 * Benchmark a number of Conv3d operators with representative input shapes.
88 * There are a number of parallel Conv3d nodes which are created, one
89 * per core. Each core handles one weight matrix. Then these are chained
90 * together in multiple layers. After each layer, output tensor is passed to the
91 * next layer.
92 */
93class Int8Conv3dParallelBench : public Benchmark {
94 /// Matrices.
95 std::vector<conv_param_t<3>> input_shapes_;
96 size_t numLayers_;
97 PlaceholderBindings bindings_;
98 std::unique_ptr<runtime::HostManager> hostManager_;
99 size_t asyncLaunchSize_;
100 size_t numCores_;
101 const char *backendStr_;
102 const char *devId_;
103
104public:
105 Int8Conv3dParallelBench(vector<conv_param_t<3>> &input_shapes_,
106 size_t numLayers_, size_t asyncLaunchSize_,
107 size_t numCores_, const char *backendStr_,
108 const char *devId_)
109 : input_shapes_(input_shapes_), numLayers_(numLayers_),
110 asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
111 backendStr_(backendStr_), devId_(devId_) {}
112
113 void setup() override {
114
115 // Setup host manager
116 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
117 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
118 if (devId_ != nullptr) {
119 config->parameters["DeviceID"] = devId_;
120 }
121 configs.push_back(std::move(config));
122 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
123
124 dim_t N, IC, IT, IH, IW, OC, OT, OH, OW;
125 if (input_shapes_.size() == 1) {
126 N = input_shapes_[0].MB;
127 IC = input_shapes_[0].IC;
128 IT = input_shapes_[0].IN_DIM[0];
129 IH = input_shapes_[0].IN_DIM[1];
130 IW = input_shapes_[0].IN_DIM[1];
131 OC = input_shapes_[0].OC;
132 OT = input_shapes_[0].OUT_DIM[0];
133 OH = input_shapes_[0].OUT_DIM[1];
134 OW = input_shapes_[0].OUT_DIM[2];
135 } else {
136 N = input_shapes_[0].MB;
137 IC = input_shapes_[0].IC;
138 IT = input_shapes_[0].IN_DIM[0];
139 IH = input_shapes_[0].IN_DIM[1];
140 IW = input_shapes_[0].IN_DIM[1];
141 OC = input_shapes_[input_shapes_.size() - 1].OC;
142 OT = input_shapes_[input_shapes_.size() - 1].OUT_DIM[0];
143 OH = input_shapes_[input_shapes_.size() - 1].OUT_DIM[1];
144 OW = input_shapes_[input_shapes_.size() - 1].OUT_DIM[1];
145 }
146 std::unique_ptr<Module> mod(new Module);
147 auto fn = mod->createFunction("singleNode");
148
149 std::vector<Node *> cur(numCores_);
150 std::vector<Placeholder *> filters(numCores_ * input_shapes_.size());
151 std::vector<Placeholder *> bias(numCores_ * input_shapes_.size());
152 std::vector<Node *> conv(numCores_ * input_shapes_.size());
153 std::vector<Placeholder *> input(numCores_);
154 std::vector<Placeholder *> output(numCores_);
155
156 for (size_t core = 0; core < numCores_; core++) {
157 input[core] =
158 mod->createPlaceholder(ElemKind::Int8QTy, {N, IT, IH, IW, IC}, 1.0, 0,
159 "input_" + std::to_string(core), false);
160 output[core] =
161 mod->createPlaceholder(ElemKind::Int8QTy, {N, OT, OH, OW, OC}, 1.0, 0,
162 "output_" + std::to_string(core), false);
163 cur[core] = input[core];
164 }
165
166 for (size_t layer = 0; layer < numLayers_; layer++) {
167 for (size_t core = 0; core < numCores_; core++) {
168 size_t conv_ops = 0;
169 for (auto conv_param : input_shapes_) {
170 filters[core * input_shapes_.size() + conv_ops] =
171 mod->createPlaceholder(
172 ElemKind::Int8QTy,
173 {(dim_t)(conv_param.OC), (dim_t)(conv_param.K[0]),
174 (dim_t)(conv_param.K[1]), (dim_t)(conv_param.K[2]),
175 (dim_t)(conv_param.IC / conv_param.G)},
176 1.0, 0,
177 "filters_" + std::to_string(core) + "_" +
178 std::to_string(conv_ops),
179 false);
180 bias[core * input_shapes_.size() + conv_ops] = mod->createPlaceholder(
181 ElemKind::Int32QTy, {(dim_t)(conv_param.OC)}, 1.0, 0,
182 "bias_" + std::to_string(core) + "_" + std::to_string(conv_ops),
183 false);
184 bindings_.allocate(filters[core * input_shapes_.size() + conv_ops])
185 ->getHandle<int8_t>()
186 .clear(0);
187 bindings_.allocate(bias[core * input_shapes_.size() + conv_ops])
188 ->getHandle<int32_t>()
189 .clear(0);
190 auto outTy = mod->uniqueType(
191 ElemKind::Int8QTy,
192 {(dim_t)(conv_param.MB), (dim_t)(conv_param.OUT_DIM[0]),
193 (dim_t)(conv_param.OUT_DIM[1]), (dim_t)(conv_param.OUT_DIM[2]),
194 (dim_t)(conv_param.OC)},
195 1.0, 0);
196 conv[core * input_shapes_.size() + conv_ops] = fn->createConv3D(
197 "conv" + std::to_string(core) + "_" + std::to_string(layer) +
198 "_" + std::to_string(conv_ops),
199 cur[core], filters[core * input_shapes_.size() + conv_ops],
200 bias[core * input_shapes_.size() + conv_ops], outTy,
201 {(unsigned int)(conv_param.K[0]), (unsigned int)(conv_param.K[1]),
202 (unsigned int)(conv_param.K[2])},
203 {(unsigned int)(conv_param.stride[0]),
204 (unsigned int)(conv_param.stride[1]),
205 (unsigned int)(conv_param.stride[2])},
206 {(unsigned int)(conv_param.pad[0]),
207 (unsigned int)(conv_param.pad[1]),
208 (unsigned int)(conv_param.pad[2]),
209 (unsigned int)(conv_param.pad[3]),
210 (unsigned int)(conv_param.pad[4]),
211 (unsigned int)(conv_param.pad[5])},
212 (unsigned int)(conv_param.G));
213
214 cur[core] = conv[core * input_shapes_.size() + conv_ops];
215 conv_ops += 1;
216 }
217 }
218 }
219 for (size_t core = 0; core < numCores_; core++) {
220 fn->createSave("save" + std::to_string(core), cur[core], output[core]);
221 }
222
223 for (size_t core = 0; core < numCores_; core++) {
224 ::glow::convertPlaceholdersToConstants(fn, bindings_,
225 {
226 input[core],
227 output[core],
228 });
229 }
230
231 CompilationContext ctx;
232 EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
233 }
234
235 void run() override {
236 std::vector<std::promise<void>> promises(asyncLaunchSize_);
237 std::vector<std::future<void>> futures;
238 for (auto &runPromise : promises) {
239 std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
240 futures.push_back(runPromise.get_future());
241 hostManager_->runNetwork(
242 "singleNode", std::move(contextPtr),
243 [&runPromise](runtime::RunIdentifierTy, Error err,
244 std::unique_ptr<ExecutionContext> /* contextPtr */) {
245 EXIT_ON_ERR(std::move(err));
246 runPromise.set_value();
247 });
248 }
249 for (auto &fut : futures) {
250 fut.wait();
251 }
252 }
253
254 void teardown() override {}
255};
256
257int main(int argc, char *argv[]) {
258 size_t numLayers = atoi(argv[1]);
259 size_t reps = atoi(argv[2]);
260 size_t asyncLaunches = atoi(argv[3]);
261 size_t numCores = atoi(argv[4]);
262 const char *backendStr = argv[5];
263 char *dev_id = nullptr;
264
265 printf("Int8Conv3dParallel Microbenchmark\n");
266 printf(
267 "Usage: Int8Conv3dParallelBench numLayers(Int) "
268 "numReps(Int) "
269 "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n");
270 printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
271 "environment variable\n");
272 benchParseGlowOpts(argc, argv);
273 assert(argc == 6 || argc == 7);
274 if (argc > 6) {
275 dev_id = argv[6];
276 printf("Setting backend device: \"%s\"\n", dev_id);
277 }
278 printf("Start Int8Conv3dParallelBench\n");
279 size_t shape_idx = 0;
280 size_t total_input_shapes = shapes_3d.size();
281 for (auto shapes : shapes_3d) {
282 double gflops = 0;
283 string shape_info = "";
284 for (auto shape : shapes) {
285 gflops += 2.0 * shape.G * (shape.IC / shape.G) * shape.K[0] * shape.K[1] *
286 shape.K[2] * (shape.OC / shape.G) * shape.OUT_DIM[0] *
287 shape.OUT_DIM[1] * shape.OUT_DIM[2];
288 if (shape_info != "") {
289 shape_info += ";";
290 }
291 shape_info += shape.toString();
292 }
293 gflops *= numLayers * numCores / 1e9;
294 printf("\n=====Input shape %zu/%zu: %s\n", shape_idx, total_input_shapes,
295 shape_info.c_str());
296 Int8Conv3dParallelBench b(shapes, numLayers, asyncLaunches, numCores,
297 backendStr, dev_id);
298 auto times = bench(&b, reps);
299 for (auto t : times) {
300 printf("BenchResult,Conv3dParallelBench,SW,%4zu,%4zu,%4zu,%4zu,"
301 "%s,%"
302 "2.6lf,%5.2lf\n",
303 numLayers, reps, asyncLaunches, numCores, backendStr,
304 t / asyncLaunches, gflops * asyncLaunches / t);
305 }
306 double min = *(std::min_element(times.begin(), times.end()));
307 size_t midElt = times.size() / 2;
308 std::nth_element(times.begin(), times.begin() + midElt, times.end());
309 double median = times[midElt];
310 double median_runtime = median / ((double)asyncLaunches);
311 double min_runtime = min / ((double)asyncLaunches);
312 printf("BenchSummary,Conv3dParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%s,%"
313 "2.6lf,%2.6lf,%5.2lf,%5.2lf\n",
314 numLayers, reps, asyncLaunches, numCores, backendStr, median_runtime,
315 min_runtime, gflops / median_runtime, gflops / min_runtime);
316 shape_idx++;
317 }
318}
319