1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
18 | #include <future> |
19 | #include <random> |
20 | |
21 | #include "Bench.h" |
22 | |
23 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
24 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
25 | |
26 | using namespace glow; |
27 | |
28 | /* |
29 | * Benchmark a number of (m x n) * (n x n) matrix multiplications. |
30 | * There are a number of parallel FC nodes which are created, one per core. |
31 | * Each core handles one weight matrix. Then these are |
32 | * chained together in multiple layers. After each layer, output tensor |
33 | * is passed to the next layer. |
34 | */ |
35 | class GemmParallelBench : public Benchmark { |
  /// Host-side matrix buffers (sized in setup()).
37 | std::vector<float> a; |
38 | std::vector<float> b; |
39 | std::vector<float> c; |
40 | |
41 | /// Dimensions expressed in libjit's format. |
42 | size_t aDims[2]; |
43 | size_t cDims[2]; |
44 | size_t numLayers_; |
45 | PlaceholderBindings bindings_; |
46 | std::unique_ptr<runtime::HostManager> hostManager_; |
47 | size_t asyncLaunchSize_; |
48 | size_t numCores_; |
49 | const char *backendStr_; |
50 | const char *dtypeStr_; |
51 | |
52 | public: |
  GemmParallelBench(size_t m, size_t n, size_t numLayers,
                    size_t asyncLaunchSize, size_t numCores,
                    const char *backendStr, const char *dtypeStr)
      : aDims{m, n}, cDims{m, n}, numLayers_(numLayers),
        asyncLaunchSize_(asyncLaunchSize), numCores_(numCores),
        backendStr_(backendStr), dtypeStr_(dtypeStr) {}
59 | |
  void setup() override {
    // Set up a HostManager with a single device of the requested backend.
63 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
64 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
65 | configs.push_back(std::move(config)); |
66 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
67 | dim_t m = cDims[0]; |
68 | dim_t n = cDims[1]; |
69 | dim_t k = aDims[1]; |
70 | a.resize(m * k); |
71 | b.resize(k * n); |
72 | c.resize(m * n); |
73 | |
    // Parse the requested element type; unrecognized strings fall back to
    // Float16.
    ElemKind dtype = ElemKind::Float16Ty;
    if (std::string(dtypeStr_) == "Float16") {
      dtype = ElemKind::Float16Ty;
    } else if (std::string(dtypeStr_) == "Float32") {
      dtype = ElemKind::FloatTy;
    }
80 | |
    auto mod = glow::make_unique<Module>();
    auto fn = mod->createFunction("singleNode");
83 | |
84 | std::vector<Node *> cur(numCores_); |
85 | std::vector<Placeholder *> weights(numCores_); |
86 | std::vector<Placeholder *> bias(numCores_); |
87 | std::vector<Node *> fc(numCores_); |
88 | std::vector<Placeholder *> input(numCores_); |
89 | std::vector<Placeholder *> output(numCores_); |
90 | |
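    // Create per-core input and output placeholders. `cur` tracks the head of
    // each core's chain as layers are appended.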
91 | for (size_t core = 0; core < numCores_; core++) { |
92 | input[core] = mod->createPlaceholder( |
93 | dtype, {m, k}, "input" + std::to_string(core), false); |
94 | output[core] = mod->createPlaceholder( |
95 | dtype, {m, n}, "output" + std::to_string(core), false); |
96 | cur[core] = input[core]; |
97 | } |
98 | |
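    // Append numLayers_ FullyConnected layers to each core's chain; every
    // layer gets freshly randomized weights and bias.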
99 | for (size_t layer = 0; layer < numLayers_; layer++) { |
100 | for (size_t core = 0; core < numCores_; core++) { |
        weights[core] = mod->createPlaceholder(
            dtype, {k, n}, "weights" + std::to_string(core), false);
        bias[core] = mod->createPlaceholder(
            dtype, {n}, "bias" + std::to_string(core), false);
        // The handle's template type must match the placeholder's element
        // kind, so branch on the parsed dtype when randomizing.
        if (dtype == ElemKind::FloatTy) {
          bindings_.allocate(weights[core])->getHandle<float>().randomize(
              -128.f, 128.f, mod->getPRNG());
          bindings_.allocate(bias[core])->getHandle<float>().randomize(
              -128.f, 128.f, mod->getPRNG());
        } else {
          bindings_.allocate(weights[core])->getHandle<float16_t>().randomize(
              -128.f, 128.f, mod->getPRNG());
          bindings_.allocate(bias[core])->getHandle<float16_t>().randomize(
              -128.f, 128.f, mod->getPRNG());
        }
111 | fc[core] = fn->createFullyConnected( |
112 | "fc" + std::to_string(core) + "_" + std::to_string(layer), |
113 | cur[core], weights[core], bias[core]); |
114 | cur[core] = fc[core]; |
115 | } |
116 | } |
117 | for (size_t core = 0; core < numCores_; core++) { |
118 | fn->createSave("save" + std::to_string(core), cur[core], output[core]); |
119 | } |
120 | |
    // Convert the weight and bias placeholders (the ones backed by tensors in
    // bindings_) into constants, keeping every core's input and output as
    // placeholders.
    std::vector<Placeholder *> keep;
    for (size_t core = 0; core < numCores_; core++) {
      keep.push_back(input[core]);
      keep.push_back(output[core]);
    }
    ::glow::convertPlaceholdersToConstants(fn, bindings_, keep);
128 | CompilationContext ctx; |
129 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
130 | } |
131 | |
132 | void run() override { |
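    // Launch asyncLaunchSize_ inference requests concurrently; each request
    // owns an ExecutionContext and fulfills its promise on completion.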
133 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
134 | std::vector<std::future<void>> futures; |
135 | for (auto &runPromise : promises) { |
      auto contextPtr = glow::make_unique<ExecutionContext>();
137 | futures.push_back(runPromise.get_future()); |
138 | hostManager_->runNetwork( |
139 | "singleNode" , std::move(contextPtr), |
140 | [&runPromise](runtime::RunIdentifierTy, Error err, |
141 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
142 | EXIT_ON_ERR(std::move(err)); |
143 | runPromise.set_value(); |
144 | }); |
145 | } |
146 | for (auto &fut : futures) { |
147 | fut.wait(); |
148 | } |
149 | } |
150 | |
151 | void teardown() override {} |
152 | |
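  // Total floating-point work per request, in GFLOPs: each of the numLayers_
  // FC layers on each of the numCores_ cores performs an (m x k) * (k x n)
  // matmul, i.e. 2*m*n*k FLOPs (a multiply and an add per output element),
  // ignoring the bias add.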
153 | double gflops() const { |
154 | return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9; |
155 | } |
156 | }; |
157 | |
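// Expected positional arguments:
//   m n numLayers reps asyncLaunches numCores backendStr dtypeStr
// Hypothetical example invocation:
//   GemmParallelBench 128 1024 4 10 2 8 CPU Float16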
158 | int main(int argc, char *argv[]) { |
159 | benchParseGlowOpts(argc, argv); |
160 | assert(argc == 9); |
161 | size_t m = atoi(argv[1]); |
162 | size_t n = atoi(argv[2]); |
163 | size_t numLayers = atoi(argv[3]); |
164 | size_t reps = atoi(argv[4]); |
165 | size_t asyncLaunches = atoi(argv[5]); |
166 | size_t numCores = atoi(argv[6]); |
167 | const char *backendStr = argv[7]; |
168 | const char *dtypeStr = argv[8]; |
169 | |
170 | GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr, |
171 | dtypeStr); |
172 | auto times = bench(&b, reps); |
173 | for (auto t : times) { |
174 | printf( |
175 | "BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%" |
176 | "2.6lf,%5.2lf\n" , |
177 | m, n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr, |
178 | t / asyncLaunches, b.gflops() * asyncLaunches / t); |
179 | } |
180 | double min = *(std::min_element(times.begin(), times.end())); |
181 | size_t midElt = times.size() / 2; |
182 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
183 | double median = times[midElt]; |
184 | double median_runtime = median / ((double)asyncLaunches); |
185 | double min_runtime = min / ((double)asyncLaunches); |
186 | printf( |
187 | "BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%" |
188 | "2.6lf,%2.6lf,%5.2lf, %5.2lf\n" , |
189 | m, n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr, |
190 | median_runtime, min_runtime, b.gflops() / median_runtime, |
191 | b.gflops() / min_runtime); |
192 | } |
193 | |