/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <array>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "ConvUtils.h"
#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;
using namespace std;

vector<vector<conv_param_t<2>>> shapes_2d = {
    // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
    // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
    // 2D convolutions
    // regular
    {conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1})},
    // groupwise
    {conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1})},
    // depthwise (G == IC == OC)
    {conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1})},
    // pointwise
    {conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    // bottleneck blocks
    {conv_param_t<>(1, 256, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 128, 256, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    {conv_param_t<>(1, 512, 256, {28, 28}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 256, 512, {28, 28}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    {conv_param_t<>(1, 1024, 512, {14, 14}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 512, 1024, {14, 14}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    {conv_param_t<>(1, 2048, 1024, {7, 7}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 1024, 2048, {7, 7}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})}

};
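
// Note on reading the shapes above: conv_param_t<2> derives OUT_DIM in its
// constructor (see ConvUtils.h) using the usual convolution arithmetic,
// roughly OUT_DIM[i] = (IN_DIM[i] + pad_before[i] + pad_after[i] -
// dilation[i] * (K[i] - 1) - 1) / stride[i] + 1. For example, the first entry
// (a 3x3, stride-1, pad-1 conv on a 1x56x56x128 NHWC input) keeps the spatial
// size at 56x56.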

/*
 * Benchmark a number of Conv2d operators with representative input shapes.
 * A parallel chain of Conv2d nodes is created per core, and each core owns
 * its own set of weights. Each chain is repeated for numLayers layers, with
 * the output tensor of one layer feeding the next.
 */
class Int8Conv2dParallelBench : public Benchmark {
  /// Convolution parameters for the chain of ops run on each core.
  std::vector<conv_param_t<2>> input_shapes_;
  size_t numLayers_;
  PlaceholderBindings bindings_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  size_t asyncLaunchSize_;
  size_t numCores_;
  const char *backendStr_;
  const char *devId_;

public:
  Int8Conv2dParallelBench(vector<conv_param_t<2>> &input_shapes_,
                          size_t numLayers_, size_t asyncLaunchSize_,
                          size_t numCores_, const char *backendStr_,
                          const char *devId_)
      : input_shapes_(input_shapes_), numLayers_(numLayers_),
        asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
        backendStr_(backendStr_), devId_(devId_) {}

  void setup() override {

    // Set up the host manager for the requested backend and optional device.
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    if (devId_ != nullptr) {
      config->parameters["DeviceID"] = devId_;
    }
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

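    // Derive the chain's overall input/output dimensions: the input shape
    // comes from the first conv in the chain and the output shape from the
    // last one (for a single-conv chain they are the same entry).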
    dim_t N, IC, IH, IW, OC, OH, OW;
    if (input_shapes_.size() == 1) {
      N = input_shapes_[0].MB;
      IC = input_shapes_[0].IC;
      IH = input_shapes_[0].IN_DIM[0];
      IW = input_shapes_[0].IN_DIM[1];
      OC = input_shapes_[0].OC;
      OH = input_shapes_[0].OUT_DIM[0];
      OW = input_shapes_[0].OUT_DIM[1];
    } else {
      N = input_shapes_[0].MB;
      IC = input_shapes_[0].IC;
      IH = input_shapes_[0].IN_DIM[0];
      IW = input_shapes_[0].IN_DIM[1];
      OC = input_shapes_[input_shapes_.size() - 1].OC;
      OH = input_shapes_[input_shapes_.size() - 1].OUT_DIM[0];
      OW = input_shapes_[input_shapes_.size() - 1].OUT_DIM[1];
    }
    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

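    // Per-core graph state: `cur` tracks the tail node of each core's chain,
    // while the filters/bias/conv vectors hold the placeholders and nodes
    // created for the current layer.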
    std::vector<Node *> cur(numCores_);
    std::vector<Placeholder *> filters(numCores_ * input_shapes_.size());
    std::vector<Placeholder *> bias(numCores_ * input_shapes_.size());
    std::vector<Node *> conv(numCores_ * input_shapes_.size());
    std::vector<Placeholder *> input(numCores_);
    std::vector<Placeholder *> output(numCores_);

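    // Create an independent quantized input/output placeholder pair for each
    // core; every core runs its own copy of the conv chain.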
    for (size_t core = 0; core < numCores_; core++) {
      input[core] =
          mod->createPlaceholder(ElemKind::Int8QTy, {N, IH, IW, IC}, 1.0, 0,
                                 "input_" + std::to_string(core), false);
      output[core] =
          mod->createPlaceholder(ElemKind::Int8QTy, {N, OH, OW, OC}, 1.0, 0,
                                 "output_" + std::to_string(core), false);
      cur[core] = input[core];
    }

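    // Build numLayers_ chained repetitions of the shape sequence on every
    // core. Each conv gets its own zero-initialized filter and bias
    // placeholders, and `cur` is advanced so the convs form a single chain.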
    for (size_t layer = 0; layer < numLayers_; layer++) {
      for (size_t core = 0; core < numCores_; core++) {
        size_t conv_ops = 0;
        for (auto conv_param : input_shapes_) {
          filters[core * input_shapes_.size() + conv_ops] =
              mod->createPlaceholder(ElemKind::Int8QTy,
                                     {(dim_t)(conv_param.OC),
                                      (dim_t)(conv_param.K[0]),
                                      (dim_t)(conv_param.K[1]),
                                      (dim_t)(conv_param.IC / conv_param.G)},
                                     1.0, 0,
                                     "filters_" + std::to_string(core) + "_" +
                                         std::to_string(conv_ops),
                                     false);
          bias[core * input_shapes_.size() + conv_ops] = mod->createPlaceholder(
              ElemKind::Int32QTy, {(dim_t)(conv_param.OC)}, 1.0, 0,
              "bias_" + std::to_string(core) + "_" + std::to_string(conv_ops),
              false);
          bindings_.allocate(filters[core * input_shapes_.size() + conv_ops])
              ->getHandle<int8_t>()
              .clear(0);
          bindings_.allocate(bias[core * input_shapes_.size() + conv_ops])
              ->getHandle<int32_t>()
              .clear(0);
          auto outTy = mod->uniqueType(
              ElemKind::Int8QTy,
              {(dim_t)(conv_param.MB), (dim_t)(conv_param.OUT_DIM[0]),
               (dim_t)(conv_param.OUT_DIM[1]), (dim_t)(conv_param.OC)},
              1.0, 0);
          conv[core * input_shapes_.size() + conv_ops] = fn->createConv(
              "conv" + std::to_string(core) + "_" + std::to_string(layer) +
                  "_" + std::to_string(conv_ops),
              cur[core], filters[core * input_shapes_.size() + conv_ops],
              bias[core * input_shapes_.size() + conv_ops], outTy,
              {(unsigned int)(conv_param.K[0]),
               (unsigned int)(conv_param.K[1])},
              {(unsigned int)(conv_param.stride[0]),
               (unsigned int)(conv_param.stride[1])},
              {(unsigned int)(conv_param.pad[0]),
               (unsigned int)(conv_param.pad[1]),
               (unsigned int)(conv_param.pad[2]),
               (unsigned int)(conv_param.pad[3])},
              (unsigned int)(conv_param.G),
              {(unsigned int)(conv_param.dilation[0]),
               (unsigned int)(conv_param.dilation[1])});
          cur[core] = conv[core * input_shapes_.size() + conv_ops];
          conv_ops += 1;
        }
      }
    }
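    // Write each core's final result to its output placeholder; the SaveNode
    // keeps the chain live in the graph.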
    for (size_t core = 0; core < numCores_; core++) {
      fn->createSave("save" + std::to_string(core), cur[core], output[core]);
    }

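    // Convert all placeholders except each core's input and output into
    // constants, i.e. treat the filters and biases as fixed weights.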
    for (size_t core = 0; core < numCores_; core++) {
      ::glow::convertPlaceholdersToConstants(fn, bindings_,
                                             {
                                                 input[core],
                                                 output[core],
                                             });
    }

    CompilationContext ctx;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

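  /// Issue asyncLaunchSize_ inference requests concurrently against the
  /// compiled network and block until every request has completed.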
  void run() override {
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;
    for (auto &runPromise : promises) {
      std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
      futures.push_back(runPromise.get_future());
      hostManager_->runNetwork(
          "singleNode", std::move(contextPtr),
          [&runPromise](runtime::RunIdentifierTy, Error err,
                        std::unique_ptr<ExecutionContext> /* contextPtr */) {
            EXIT_ON_ERR(std::move(err));
            runPromise.set_value();
          });
    }
    for (auto &fut : futures) {
      fut.wait();
    }
  }

  void teardown() override {}
};

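// Example invocation (arguments are positional; the backend name must match a
// backend compiled into this build, e.g. "CPU" or "Interpreter", and dev_id is
// optional):
//   Int8Conv2dParallelBench 2 10 1 4 CPU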
int main(int argc, char *argv[]) {
  printf("Int8Conv2dParallel Microbenchmark\n");
  printf(
      "Usage: Int8Conv2dParallelBench numLayers(Int) "
      "numReps(Int) "
      "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n");
  printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
         "environment variable\n");
  benchParseGlowOpts(argc, argv);

  // Validate the argument count before touching argv.
  assert(argc == 6 || argc == 7);
  size_t numLayers = atoi(argv[1]);
  size_t reps = atoi(argv[2]);
  size_t asyncLaunches = atoi(argv[3]);
  size_t numCores = atoi(argv[4]);
  const char *backendStr = argv[5];
  char *dev_id = nullptr;
  if (argc > 6) {
    dev_id = argv[6];
    printf("Setting backend device: \"%s\"\n", dev_id);
  }

  printf("Start Int8Conv2dParallelBench\n");
  size_t shape_idx = 0;
  size_t total_input_shapes = shapes_2d.size();
  for (auto shapes : shapes_2d) {
    double gflops = 0;
    string shape_info = "";
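    // FLOPs for one grouped conv, counting each multiply-accumulate as two
    // ops (every shape above uses MB == 1):
    //   2 * OC * OH * OW * (IC / G) * KH * KW,
    // accumulated below in its equivalent grouped form.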
    for (auto shape : shapes) {
      gflops += 2.0 * shape.G * (shape.IC / shape.G) * shape.K[0] * shape.K[1] *
                (shape.OC / shape.G) * shape.OUT_DIM[0] * shape.OUT_DIM[1];
      if (shape_info != "") {
        shape_info += ";";
      }
      shape_info += shape.toString();
    }
    gflops *= numLayers * numCores / 1e9;
    printf("\n=====Input shape %zu/%zu: %s\n", shape_idx, total_input_shapes,
           shape_info.c_str());
    Int8Conv2dParallelBench b(shapes, numLayers, asyncLaunches, numCores,
                              backendStr, dev_id);
    auto times = bench(&b, reps);
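    // One result line per repetition: average runtime per async launch and
    // the corresponding GFLOPS estimate.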
    for (auto t : times) {
      printf("BenchResult,Conv2dParallelBench,SW,%4zu,%4zu,%4zu,%4zu,"
             "%s,%"
             "2.6lf,%5.2lf\n",
             numLayers, reps, asyncLaunches, numCores, backendStr,
             t / asyncLaunches, gflops * asyncLaunches / t);
    }
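    // Summarize across repetitions: median and minimum runtime per launch,
    // plus the corresponding GFLOPS estimates.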
    double min = *(std::min_element(times.begin(), times.end()));
    size_t midElt = times.size() / 2;
    std::nth_element(times.begin(), times.begin() + midElt, times.end());
    double median = times[midElt];
    double median_runtime = median / ((double)asyncLaunches);
    double min_runtime = min / ((double)asyncLaunches);
    printf("BenchSummary,Conv2dParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%s,%"
           "2.6lf,%2.6lf,%5.2lf,%5.2lf\n",
           numLayers, reps, asyncLaunches, numCores, backendStr, median_runtime,
           min_runtime, gflops / median_runtime, gflops / min_runtime);
    shape_idx++;
  }
}