1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cstdlib>
#include <future>
#include <random>
20 | |
21 | #include "Bench.h" |
22 | |
23 | #include "ConvUtils.h" |
24 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
25 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
26 | |
27 | using namespace glow; |
28 | using namespace std; |
29 | |
30 | vector<vector<conv_param_t<3>>> shapes_3d = { |
31 | // MB, IC, OC, {IT, IH, IW}, G, {KT, KH, KW}, {stride_t, stride_h, |
32 | // stride_w}, |
33 | // {pad_prev, pad_h_top, pad_w_left, pad_next, pad_h_bottom, pad_w_right} |
34 | // Regular |
35 | {conv_param_t<3>(1, 64, 64, {8, 14, 14}, 1, {3, 3, 3}, {1, 1, 1}, |
36 | {1, 1, 1, 1, 1, 1})}, |
37 | |
38 | // Groupwise |
39 | {conv_param_t<3>(32, 192, 192, {2, 28, 28}, 96, {3, 3, 3}, {1, 1, 1}, |
40 | {1, 1, 1, 1, 1, 1})}, |
41 | {conv_param_t<3>(32, 192, 192, {1, 14, 14}, 96, {3, 3, 3}, {1, 1, 1}, |
42 | {1, 1, 1, 1, 1, 1})}, |
43 | {conv_param_t<3>(32, 384, 384, {1, 14, 14}, 192, {3, 3, 3}, {1, 1, 1}, |
44 | {1, 1, 1, 1, 1, 1})}, |
45 | {conv_param_t<3>(32, 384, 384, {1, 7, 7}, 192, {3, 3, 3}, {1, 1, 1}, |
46 | {1, 1, 1, 1, 1, 1})}, |
47 | |
48 | {conv_param_t<3>(32, 16, 16, {4, 56, 56}, 8, {3, 3, 3}, {1, 1, 1}, |
49 | {1, 1, 1, 1, 1, 1})}, |
50 | {conv_param_t<3>(32, 16, 16, {2, 28, 28}, 8, {3, 3, 3}, {1, 1, 1}, |
51 | {1, 1, 1, 1, 1, 1})}, |
52 | {conv_param_t<3>(32, 32, 32, {4, 56, 56}, 16, {3, 3, 3}, {1, 1, 1}, |
53 | {1, 1, 1, 1, 1, 1})}, |
54 | {conv_param_t<3>(32, 32, 32, {2, 28, 28}, 16, {3, 3, 3}, {1, 1, 1}, |
55 | {1, 1, 1, 1, 1, 1})}, |
56 | {conv_param_t<3>(32, 32, 32, {2, 28, 28}, 16, {3, 3, 3}, {1, 1, 1}, |
57 | {1, 1, 1, 1, 1, 1})}, |
58 | {conv_param_t<3>(32, 32, 32, {1, 14, 14}, 16, {3, 3, 3}, {1, 1, 1}, |
59 | {1, 1, 1, 1, 1, 1})}, |
60 | {conv_param_t<3>(32, 128, 128, {2, 28, 28}, 32, {3, 3, 3}, {1, 1, 1}, |
61 | {1, 1, 1, 1, 1, 1})}, |
62 | {conv_param_t<3>(32, 128, 128, {1, 14, 14}, 32, {3, 3, 3}, {1, 1, 1}, |
63 | {1, 1, 1, 1, 1, 1})}, |
64 | {conv_param_t<3>(32, 256, 256, {1, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, |
65 | {1, 1, 1, 1, 1, 1})}, |
66 | {conv_param_t<3>(32, 256, 256, {1, 7, 7}, 64, {3, 3, 3}, {1, 1, 1}, |
67 | {1, 1, 1, 1, 1, 1})}, |
68 | |
69 | // Depthwise |
70 | {conv_param_t<3>(1, 64, 64, {8, 14, 14}, 64, {3, 3, 3}, {1, 1, 1}, |
71 | {1, 1, 1, 1, 1, 1})}, |
72 | |
73 | // Pointwise |
74 | {conv_param_t<3>(1, 128, 128, {8, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, |
75 | {0, 0, 0, 0})}, |
76 | // bottleneck blocks |
77 | {conv_param_t<3>(1, 192, 192, {1, 14, 14}, 96, {3, 3, 3}, {1, 1, 1}, |
78 | {1, 1, 1, 1, 1, 1}), |
79 | conv_param_t<3>(1, 192, 1024, {1, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, |
80 | {0, 0, 0, 0, 0, 0}), |
81 | conv_param_t<3>(1, 1024, 192, {1, 14, 14}, 1, {1, 1, 1}, {1, 1, 1}, |
82 | {0, 0, 0, 0, 0, 0})} |
83 | |
84 | }; |
85 | |
86 | /* |
87 | * Benchmark a number of Conv3d operators with representative input shapes. |
88 | * There are a number of parallel Conv3d nodes which are created, one |
89 | * per core. Each core handles one weight matrix. Then these are chained |
90 | * together in multiple layers. After each layer, output tensor is passed to the |
91 | * next layer. |
92 | */ |
93 | class Int8Conv3dParallelBench : public Benchmark { |
94 | /// Matrices. |
95 | std::vector<conv_param_t<3>> input_shapes_; |
96 | size_t numLayers_; |
97 | PlaceholderBindings bindings_; |
98 | std::unique_ptr<runtime::HostManager> hostManager_; |
99 | size_t asyncLaunchSize_; |
100 | size_t numCores_; |
101 | const char *backendStr_; |
102 | const char *devId_; |
103 | |
104 | public: |
105 | Int8Conv3dParallelBench(vector<conv_param_t<3>> &input_shapes_, |
106 | size_t numLayers_, size_t asyncLaunchSize_, |
107 | size_t numCores_, const char *backendStr_, |
108 | const char *devId_) |
109 | : input_shapes_(input_shapes_), numLayers_(numLayers_), |
110 | asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_), |
111 | backendStr_(backendStr_), devId_(devId_) {} |
112 | |
113 | void setup() override { |
114 | |
115 | // Setup host manager |
116 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
117 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
118 | if (devId_ != nullptr) { |
119 | config->parameters["DeviceID" ] = devId_; |
120 | } |
121 | configs.push_back(std::move(config)); |
122 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
123 | |
124 | dim_t N, IC, IT, IH, IW, OC, OT, OH, OW; |
125 | if (input_shapes_.size() == 1) { |
126 | N = input_shapes_[0].MB; |
127 | IC = input_shapes_[0].IC; |
128 | IT = input_shapes_[0].IN_DIM[0]; |
129 | IH = input_shapes_[0].IN_DIM[1]; |
130 | IW = input_shapes_[0].IN_DIM[1]; |
131 | OC = input_shapes_[0].OC; |
132 | OT = input_shapes_[0].OUT_DIM[0]; |
133 | OH = input_shapes_[0].OUT_DIM[1]; |
134 | OW = input_shapes_[0].OUT_DIM[2]; |
135 | } else { |
136 | N = input_shapes_[0].MB; |
137 | IC = input_shapes_[0].IC; |
138 | IT = input_shapes_[0].IN_DIM[0]; |
139 | IH = input_shapes_[0].IN_DIM[1]; |
140 | IW = input_shapes_[0].IN_DIM[1]; |
141 | OC = input_shapes_[input_shapes_.size() - 1].OC; |
142 | OT = input_shapes_[input_shapes_.size() - 1].OUT_DIM[0]; |
143 | OH = input_shapes_[input_shapes_.size() - 1].OUT_DIM[1]; |
144 | OW = input_shapes_[input_shapes_.size() - 1].OUT_DIM[1]; |
145 | } |
146 | std::unique_ptr<Module> mod(new Module); |
147 | auto fn = mod->createFunction("singleNode" ); |
148 | |
149 | std::vector<Node *> cur(numCores_); |
150 | std::vector<Placeholder *> filters(numCores_ * input_shapes_.size()); |
151 | std::vector<Placeholder *> bias(numCores_ * input_shapes_.size()); |
152 | std::vector<Node *> conv(numCores_ * input_shapes_.size()); |
153 | std::vector<Placeholder *> input(numCores_); |
154 | std::vector<Placeholder *> output(numCores_); |
155 | |
156 | for (size_t core = 0; core < numCores_; core++) { |
157 | input[core] = |
158 | mod->createPlaceholder(ElemKind::Int8QTy, {N, IT, IH, IW, IC}, 1.0, 0, |
159 | "input_" + std::to_string(core), false); |
160 | output[core] = |
161 | mod->createPlaceholder(ElemKind::Int8QTy, {N, OT, OH, OW, OC}, 1.0, 0, |
162 | "output_" + std::to_string(core), false); |
163 | cur[core] = input[core]; |
164 | } |
165 | |
166 | for (size_t layer = 0; layer < numLayers_; layer++) { |
167 | for (size_t core = 0; core < numCores_; core++) { |
168 | size_t conv_ops = 0; |
169 | for (auto conv_param : input_shapes_) { |
170 | filters[core * input_shapes_.size() + conv_ops] = |
171 | mod->createPlaceholder( |
172 | ElemKind::Int8QTy, |
173 | {(dim_t)(conv_param.OC), (dim_t)(conv_param.K[0]), |
174 | (dim_t)(conv_param.K[1]), (dim_t)(conv_param.K[2]), |
175 | (dim_t)(conv_param.IC / conv_param.G)}, |
176 | 1.0, 0, |
177 | "filters_" + std::to_string(core) + "_" + |
178 | std::to_string(conv_ops), |
179 | false); |
180 | bias[core * input_shapes_.size() + conv_ops] = mod->createPlaceholder( |
181 | ElemKind::Int32QTy, {(dim_t)(conv_param.OC)}, 1.0, 0, |
182 | "bias_" + std::to_string(core) + "_" + std::to_string(conv_ops), |
183 | false); |
184 | bindings_.allocate(filters[core * input_shapes_.size() + conv_ops]) |
185 | ->getHandle<int8_t>() |
186 | .clear(0); |
187 | bindings_.allocate(bias[core * input_shapes_.size() + conv_ops]) |
188 | ->getHandle<int32_t>() |
189 | .clear(0); |
190 | auto outTy = mod->uniqueType( |
191 | ElemKind::Int8QTy, |
192 | {(dim_t)(conv_param.MB), (dim_t)(conv_param.OUT_DIM[0]), |
193 | (dim_t)(conv_param.OUT_DIM[1]), (dim_t)(conv_param.OUT_DIM[2]), |
194 | (dim_t)(conv_param.OC)}, |
195 | 1.0, 0); |
196 | conv[core * input_shapes_.size() + conv_ops] = fn->createConv3D( |
197 | "conv" + std::to_string(core) + "_" + std::to_string(layer) + |
198 | "_" + std::to_string(conv_ops), |
199 | cur[core], filters[core * input_shapes_.size() + conv_ops], |
200 | bias[core * input_shapes_.size() + conv_ops], outTy, |
201 | {(unsigned int)(conv_param.K[0]), (unsigned int)(conv_param.K[1]), |
202 | (unsigned int)(conv_param.K[2])}, |
203 | {(unsigned int)(conv_param.stride[0]), |
204 | (unsigned int)(conv_param.stride[1]), |
205 | (unsigned int)(conv_param.stride[2])}, |
206 | {(unsigned int)(conv_param.pad[0]), |
207 | (unsigned int)(conv_param.pad[1]), |
208 | (unsigned int)(conv_param.pad[2]), |
209 | (unsigned int)(conv_param.pad[3]), |
210 | (unsigned int)(conv_param.pad[4]), |
211 | (unsigned int)(conv_param.pad[5])}, |
212 | (unsigned int)(conv_param.G)); |
213 | |
214 | cur[core] = conv[core * input_shapes_.size() + conv_ops]; |
215 | conv_ops += 1; |
216 | } |
217 | } |
218 | } |
219 | for (size_t core = 0; core < numCores_; core++) { |
220 | fn->createSave("save" + std::to_string(core), cur[core], output[core]); |
221 | } |
222 | |
223 | for (size_t core = 0; core < numCores_; core++) { |
224 | ::glow::convertPlaceholdersToConstants(fn, bindings_, |
225 | { |
226 | input[core], |
227 | output[core], |
228 | }); |
229 | } |
230 | |
231 | CompilationContext ctx; |
232 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
233 | } |
234 | |
235 | void run() override { |
236 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
237 | std::vector<std::future<void>> futures; |
238 | for (auto &runPromise : promises) { |
239 | std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext); |
240 | futures.push_back(runPromise.get_future()); |
241 | hostManager_->runNetwork( |
242 | "singleNode" , std::move(contextPtr), |
243 | [&runPromise](runtime::RunIdentifierTy, Error err, |
244 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
245 | EXIT_ON_ERR(std::move(err)); |
246 | runPromise.set_value(); |
247 | }); |
248 | } |
249 | for (auto &fut : futures) { |
250 | fut.wait(); |
251 | } |
252 | } |
253 | |
254 | void teardown() override {} |
255 | }; |
256 | |
257 | int main(int argc, char *argv[]) { |
258 | size_t numLayers = atoi(argv[1]); |
259 | size_t reps = atoi(argv[2]); |
260 | size_t asyncLaunches = atoi(argv[3]); |
261 | size_t numCores = atoi(argv[4]); |
262 | const char *backendStr = argv[5]; |
263 | char *dev_id = nullptr; |
264 | |
265 | printf("Int8Conv3dParallel Microbenchmark\n" ); |
266 | printf( |
267 | "Usage: Int8Conv3dParallelBench numLayers(Int) " |
268 | "numReps(Int) " |
269 | "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n" ); |
270 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
271 | "environment variable\n" ); |
272 | benchParseGlowOpts(argc, argv); |
273 | assert(argc == 6 || argc == 7); |
274 | if (argc > 6) { |
275 | dev_id = argv[6]; |
276 | printf("Setting backend device: \"%s\"\n" , dev_id); |
277 | } |
278 | printf("Start Int8Conv3dParallelBench\n" ); |
279 | size_t shape_idx = 0; |
280 | size_t total_input_shapes = shapes_3d.size(); |
281 | for (auto shapes : shapes_3d) { |
282 | double gflops = 0; |
283 | string shape_info = "" ; |
284 | for (auto shape : shapes) { |
285 | gflops += 2.0 * shape.G * (shape.IC / shape.G) * shape.K[0] * shape.K[1] * |
286 | shape.K[2] * (shape.OC / shape.G) * shape.OUT_DIM[0] * |
287 | shape.OUT_DIM[1] * shape.OUT_DIM[2]; |
288 | if (shape_info != "" ) { |
289 | shape_info += ";" ; |
290 | } |
291 | shape_info += shape.toString(); |
292 | } |
293 | gflops *= numLayers * numCores / 1e9; |
294 | printf("\n=====Input shape %zu/%zu: %s\n" , shape_idx, total_input_shapes, |
295 | shape_info.c_str()); |
296 | Int8Conv3dParallelBench b(shapes, numLayers, asyncLaunches, numCores, |
297 | backendStr, dev_id); |
298 | auto times = bench(&b, reps); |
299 | for (auto t : times) { |
300 | printf("BenchResult,Conv3dParallelBench,SW,%4zu,%4zu,%4zu,%4zu," |
301 | "%s,%" |
302 | "2.6lf,%5.2lf\n" , |
303 | numLayers, reps, asyncLaunches, numCores, backendStr, |
304 | t / asyncLaunches, gflops * asyncLaunches / t); |
305 | } |
306 | double min = *(std::min_element(times.begin(), times.end())); |
307 | size_t midElt = times.size() / 2; |
308 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
309 | double median = times[midElt]; |
310 | double median_runtime = median / ((double)asyncLaunches); |
311 | double min_runtime = min / ((double)asyncLaunches); |
312 | printf("BenchSummary,Conv3dParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%s,%" |
313 | "2.6lf,%2.6lf,%5.2lf,%5.2lf\n" , |
314 | numLayers, reps, asyncLaunches, numCores, backendStr, median_runtime, |
315 | min_runtime, gflops / median_runtime, gflops / min_runtime); |
316 | shape_idx++; |
317 | } |
318 | } |
319 | |