1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>
20 | |
21 | #include "Bench.h" |
22 | |
23 | #include "ConvUtils.h" |
24 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
25 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
26 | |
27 | using namespace glow; |
28 | using namespace std; |
29 | |
// Benchmark cases. Each inner vector is one chain of 2D convolutions; the
// whole chain is instantiated once per core and stacked numLayers times
// (see Int8Conv2dParallelBench below). conv_param_t<> defaults to the
// 2-spatial-dim specialization conv_param_t<2>.
vector<vector<conv_param_t<2>>> shapes_2d = {
    // MB, IC, OC, IH, IW, G, KH, KW, stride_h, stride_w,
    // pad_h_top, pad_w_left, pad_h_bottom, pad_w_right
    // 2D convolutions
    // regular (dense, group = 1)
    {conv_param_t<>(1, 128, 128, {56, 56}, 1, {3, 3}, {1, 1}, {1, 1, 1, 1})},
    // groupwise (32 groups)
    {conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1})},
    // DW (depthwise: groups == IC == OC)
    {conv_param_t<>(1, 272, 272, {47, 125}, 272, {3, 3}, {1, 1}, {1, 1, 1, 1})},
    // Pointwise (1x1 kernel, no padding)
    {conv_param_t<>(1, 128, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    // bottleneck blocks (1x1 reduce -> 3x3 groupwise -> 1x1 expand),
    // ResNeXt-style, at decreasing spatial resolutions
    {conv_param_t<>(1, 256, 128, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 128, 128, {56, 56}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 128, 256, {56, 56}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    {conv_param_t<>(1, 512, 256, {28, 28}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 256, 256, {28, 28}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 256, 512, {28, 28}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    {conv_param_t<>(1, 1024, 512, {14, 14}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 512, 512, {14, 14}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 512, 1024, {14, 14}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})},
    {conv_param_t<>(1, 2048, 1024, {7, 7}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0}),
     conv_param_t<>(1, 1024, 1024, {7, 7}, 32, {3, 3}, {1, 1}, {1, 1, 1, 1}),
     conv_param_t<>(1, 1024, 2048, {7, 7}, 1, {1, 1}, {1, 1}, {0, 0, 0, 0})}

};
57 | |
58 | /* |
59 | * Benchmark a number of Conv2d operators with representative input shapes. |
60 | * There are a number of parallel Conv2d nodes which are created, one |
61 | * per core. Each core handles one weight matrix. Then these are chained |
62 | * together in multiple layers. After each layer, output tensor is passed to the |
63 | * next layer. |
64 | */ |
65 | class Int8Conv2dParallelBench : public Benchmark { |
66 | /// Matrices. |
67 | std::vector<conv_param_t<2>> input_shapes_; |
68 | size_t numLayers_; |
69 | PlaceholderBindings bindings_; |
70 | std::unique_ptr<runtime::HostManager> hostManager_; |
71 | size_t asyncLaunchSize_; |
72 | size_t numCores_; |
73 | const char *backendStr_; |
74 | const char *devId_; |
75 | |
76 | public: |
77 | Int8Conv2dParallelBench(vector<conv_param_t<2>> &input_shapes_, |
78 | size_t numLayers_, size_t asyncLaunchSize_, |
79 | size_t numCores_, const char *backendStr_, |
80 | const char *devId_) |
81 | : input_shapes_(input_shapes_), numLayers_(numLayers_), |
82 | asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_), |
83 | backendStr_(backendStr_), devId_(devId_) {} |
84 | |
85 | void setup() override { |
86 | |
87 | // Setup host manager |
88 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
89 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
90 | if (devId_ != nullptr) { |
91 | config->parameters["DeviceID" ] = devId_; |
92 | } |
93 | configs.push_back(std::move(config)); |
94 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
95 | |
96 | dim_t N, IC, IH, IW, OC, OH, OW; |
97 | if (input_shapes_.size() == 1) { |
98 | N = input_shapes_[0].MB; |
99 | IC = input_shapes_[0].IC; |
100 | IH = input_shapes_[0].IN_DIM[0]; |
101 | IW = input_shapes_[0].IN_DIM[1]; |
102 | OC = input_shapes_[0].OC; |
103 | OH = input_shapes_[0].OUT_DIM[0]; |
104 | OW = input_shapes_[0].OUT_DIM[1]; |
105 | } else { |
106 | N = input_shapes_[0].MB; |
107 | IC = input_shapes_[0].IC; |
108 | IH = input_shapes_[0].IN_DIM[0]; |
109 | IW = input_shapes_[0].IN_DIM[1]; |
110 | OC = input_shapes_[input_shapes_.size() - 1].OC; |
111 | OH = input_shapes_[input_shapes_.size() - 1].OUT_DIM[0]; |
112 | OW = input_shapes_[input_shapes_.size() - 1].OUT_DIM[1]; |
113 | } |
114 | std::unique_ptr<Module> mod(new Module); |
115 | auto fn = mod->createFunction("singleNode" ); |
116 | |
117 | std::vector<Node *> cur(numCores_); |
118 | std::vector<Placeholder *> filters(numCores_ * input_shapes_.size()); |
119 | std::vector<Placeholder *> bias(numCores_ * input_shapes_.size()); |
120 | std::vector<Node *> conv(numCores_ * input_shapes_.size()); |
121 | std::vector<Placeholder *> input(numCores_); |
122 | std::vector<Placeholder *> output(numCores_); |
123 | |
124 | for (size_t core = 0; core < numCores_; core++) { |
125 | input[core] = |
126 | mod->createPlaceholder(ElemKind::Int8QTy, {N, IH, IW, IC}, 1.0, 0, |
127 | "input_" + std::to_string(core), false); |
128 | output[core] = |
129 | mod->createPlaceholder(ElemKind::Int8QTy, {N, OH, OW, OC}, 1.0, 0, |
130 | "output_" + std::to_string(core), false); |
131 | cur[core] = input[core]; |
132 | } |
133 | |
134 | for (size_t layer = 0; layer < numLayers_; layer++) { |
135 | for (size_t core = 0; core < numCores_; core++) { |
136 | size_t conv_ops = 0; |
137 | for (auto conv_param : input_shapes_) { |
138 | filters[core * input_shapes_.size() + conv_ops] = |
139 | mod->createPlaceholder(ElemKind::Int8QTy, |
140 | {(dim_t)(conv_param.OC), |
141 | (dim_t)(conv_param.K[0]), |
142 | (dim_t)(conv_param.K[1]), |
143 | (dim_t)(conv_param.IC / conv_param.G)}, |
144 | 1.0, 0, |
145 | "filters_" + std::to_string(core) + "_" + |
146 | std::to_string(conv_ops), |
147 | false); |
148 | bias[core * input_shapes_.size() + conv_ops] = mod->createPlaceholder( |
149 | ElemKind::Int32QTy, {(dim_t)(conv_param.OC)}, 1.0, 0, |
150 | "bias_" + std::to_string(core) + "_" + std::to_string(conv_ops), |
151 | false); |
152 | bindings_.allocate(filters[core * input_shapes_.size() + conv_ops]) |
153 | ->getHandle<int8_t>() |
154 | .clear(0); |
155 | bindings_.allocate(bias[core * input_shapes_.size() + conv_ops]) |
156 | ->getHandle<int32_t>() |
157 | .clear(0); |
158 | auto outTy = mod->uniqueType( |
159 | ElemKind::Int8QTy, |
160 | {(dim_t)(conv_param.MB), (dim_t)(conv_param.OUT_DIM[0]), |
161 | (dim_t)(conv_param.OUT_DIM[1]), (dim_t)(conv_param.OC)}, |
162 | 1.0, 0); |
163 | conv[core * input_shapes_.size() + conv_ops] = fn->createConv( |
164 | "conv" + std::to_string(core) + "_" + std::to_string(layer) + |
165 | "_" + std::to_string(conv_ops), |
166 | cur[core], filters[core * input_shapes_.size() + conv_ops], |
167 | bias[core * input_shapes_.size() + conv_ops], outTy, |
168 | {(unsigned int)(conv_param.K[0]), |
169 | (unsigned int)(conv_param.K[1])}, |
170 | {(unsigned int)(conv_param.stride[0]), |
171 | (unsigned int)(conv_param.stride[1])}, |
172 | {(unsigned int)(conv_param.pad[0]), |
173 | (unsigned int)(conv_param.pad[1]), |
174 | (unsigned int)(conv_param.pad[2]), |
175 | (unsigned int)(conv_param.pad[3])}, |
176 | (unsigned int)(conv_param.G), |
177 | {(unsigned int)(conv_param.dilation[0]), |
178 | (unsigned int)(conv_param.dilation[1])}); |
179 | cur[core] = conv[core * input_shapes_.size() + conv_ops]; |
180 | conv_ops += 1; |
181 | } |
182 | } |
183 | } |
184 | for (size_t core = 0; core < numCores_; core++) { |
185 | fn->createSave("save" + std::to_string(core), cur[core], output[core]); |
186 | } |
187 | |
188 | for (size_t core = 0; core < numCores_; core++) { |
189 | ::glow::convertPlaceholdersToConstants(fn, bindings_, |
190 | { |
191 | input[core], |
192 | output[core], |
193 | }); |
194 | } |
195 | |
196 | CompilationContext ctx; |
197 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
198 | } |
199 | |
200 | void run() override { |
201 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
202 | std::vector<std::future<void>> futures; |
203 | for (auto &runPromise : promises) { |
204 | std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext); |
205 | futures.push_back(runPromise.get_future()); |
206 | hostManager_->runNetwork( |
207 | "singleNode" , std::move(contextPtr), |
208 | [&runPromise](runtime::RunIdentifierTy, Error err, |
209 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
210 | EXIT_ON_ERR(std::move(err)); |
211 | runPromise.set_value(); |
212 | }); |
213 | } |
214 | for (auto &fut : futures) { |
215 | fut.wait(); |
216 | } |
217 | } |
218 | |
219 | void teardown() override {} |
220 | }; |
221 | |
222 | int main(int argc, char *argv[]) { |
223 | size_t numLayers = atoi(argv[1]); |
224 | size_t reps = atoi(argv[2]); |
225 | size_t asyncLaunches = atoi(argv[3]); |
226 | size_t numCores = atoi(argv[4]); |
227 | const char *backendStr = argv[5]; |
228 | char *dev_id = nullptr; |
229 | |
230 | printf("Int8Conv2dParallel Microbenchmark\n" ); |
231 | printf( |
232 | "Usage: Int8Conv2dParallelBench numLayers(Int) " |
233 | "numReps(Int) " |
234 | "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n" ); |
235 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
236 | "environment variable\n" ); |
237 | benchParseGlowOpts(argc, argv); |
238 | assert(argc == 6 || argc == 7); |
239 | if (argc > 6) { |
240 | dev_id = argv[6]; |
241 | printf("Setting backend device: \"%s\"\n" , dev_id); |
242 | } |
243 | printf("Start Int8Conv2dParallelBench\n" ); |
244 | size_t shape_idx = 0; |
245 | size_t total_input_shapes = shapes_2d.size(); |
246 | for (auto shapes : shapes_2d) { |
247 | double gflops = 0; |
248 | string shape_info = "" ; |
249 | for (auto shape : shapes) { |
250 | gflops += 2.0 * shape.G * (shape.IC / shape.G) * shape.K[0] * shape.K[1] * |
251 | (shape.OC / shape.G) * shape.OUT_DIM[0] * shape.OUT_DIM[1]; |
252 | if (shape_info != "" ) { |
253 | shape_info += ";" ; |
254 | } |
255 | shape_info += shape.toString(); |
256 | } |
257 | gflops *= numLayers * numCores / 1e9; |
258 | printf("\n=====Input shape %zu/%zu: %s\n" , shape_idx, total_input_shapes, |
259 | shape_info.c_str()); |
260 | Int8Conv2dParallelBench b(shapes, numLayers, asyncLaunches, numCores, |
261 | backendStr, dev_id); |
262 | auto times = bench(&b, reps); |
263 | for (auto t : times) { |
264 | printf("BenchResult,Conv2dParallelBench,SW,%4zu,%4zu,%4zu,%4zu," |
265 | "%s,%" |
266 | "2.6lf,%5.2lf\n" , |
267 | numLayers, reps, asyncLaunches, numCores, backendStr, |
268 | t / asyncLaunches, gflops * asyncLaunches / t); |
269 | } |
270 | double min = *(std::min_element(times.begin(), times.end())); |
271 | size_t midElt = times.size() / 2; |
272 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
273 | double median = times[midElt]; |
274 | double median_runtime = median / ((double)asyncLaunches); |
275 | double min_runtime = min / ((double)asyncLaunches); |
276 | printf("BenchSummary,Conv2dParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%s,%" |
277 | "2.6lf,%2.6lf,%5.2lf,%5.2lf\n" , |
278 | numLayers, reps, asyncLaunches, numCores, backendStr, median_runtime, |
279 | min_runtime, gflops / median_runtime, gflops / min_runtime); |
280 | shape_idx++; |
281 | } |
282 | } |
283 | |