1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #include <array> |
17 | #include <cstdlib> |
18 | #include <future> |
19 | #include <random> |
20 | |
21 | #include "Bench.h" |
22 | |
23 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
24 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
25 | |
26 | using namespace glow; |
27 | |
/*
 * Benchmark a number of (m x n) * (n x n) matrix multiplications.
 * A number of parallel FC nodes are created, one per core, and each core
 * handles one weight matrix. These per-core chains are then stacked into
 * multiple layers: after each layer, the output tensor is passed on as the
 * input of the next layer.
 */
35 | class Int8GemmParallelBench : public Benchmark { |
36 | /// Matrices. |
37 | std::vector<float> a; |
38 | std::vector<float> b; |
39 | std::vector<float> c; |
40 | |
41 | /// Dimensions expressed in libjit's format. |
42 | size_t aDims[2]; |
43 | size_t cDims[2]; |
44 | size_t numLayers_; |
45 | PlaceholderBindings bindings_; |
46 | std::unique_ptr<runtime::HostManager> hostManager_; |
47 | size_t asyncLaunchSize_; |
48 | size_t numCores_; |
49 | const char *backendStr_; |
50 | const char *devId_; |
51 | |
52 | public: |
53 | Int8GemmParallelBench(size_t m, size_t n, size_t numLayers_, |
54 | size_t asyncLaunchSize_, size_t numCores_, |
55 | const char *backendStr_, const char *devId_) |
56 | : aDims{m, n}, cDims{m, n}, numLayers_(numLayers_), |
57 | asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_), |
58 | backendStr_(backendStr_), devId_(devId_) {} |
59 | |
60 | void setup() override { |
61 | |
62 | // Setup host manager |
63 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
64 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
65 | if (devId_ != nullptr) { |
66 | config->parameters["DeviceID" ] = devId_; |
67 | } |
68 | configs.push_back(std::move(config)); |
69 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
70 | printf("set up host manager\n" ); |
71 | |
72 | dim_t m = cDims[0]; |
73 | dim_t n = cDims[1]; |
74 | dim_t k = aDims[1]; |
75 | a.resize(m * k); |
76 | b.resize(k * n); |
77 | c.resize(m * n); |
78 | |
79 | std::unique_ptr<Module> mod(new Module); |
80 | auto fn = mod->createFunction("singleNode" ); |
81 | printf("set up module \n" ); |
82 | |
83 | std::vector<Node *> cur(numCores_); |
84 | std::vector<Placeholder *> weights(numCores_); |
85 | std::vector<Placeholder *> bias(numCores_); |
86 | std::vector<Node *> fc(numCores_); |
87 | std::vector<Placeholder *> input(numCores_); |
88 | std::vector<Placeholder *> output(numCores_); |
89 | |
90 | printf("set up inputs and outputs" ); |
91 | for (size_t core = 0; core < numCores_; core++) { |
92 | input[core] = |
93 | mod->createPlaceholder(ElemKind::Int8QTy, {m, k}, 1.0, 0, |
94 | "input_" + std::to_string(core), false); |
95 | output[core] = |
96 | mod->createPlaceholder(ElemKind::Int8QTy, {m, n}, 1.0, 0, |
97 | "output_" + std::to_string(core), false); |
98 | cur[core] = input[core]; |
99 | } |
100 | |
101 | printf("set up weights and bias" ); |
102 | for (size_t layer = 0; layer < numLayers_; layer++) { |
103 | for (size_t core = 0; core < numCores_; core++) { |
104 | weights[core] = |
105 | mod->createPlaceholder(ElemKind::Int8QTy, {k, n}, 1.0, 0, |
106 | "weights_" + std::to_string(core), false); |
107 | bias[core] = |
108 | mod->createPlaceholder(ElemKind::Int32QTy, {n}, 1.0, 0, |
109 | "bias_" + std::to_string(core), false); |
110 | bindings_.allocate(weights[core]) |
111 | ->getHandle<int8_t>() |
112 | .randomize(0, 128, mod->getPRNG()); |
113 | bindings_.allocate(bias[core]) |
114 | ->getHandle<int32_t>() |
115 | .randomize(0, 128, mod->getPRNG()); |
116 | fc[core] = fn->createFullyConnected( |
117 | "fc" + std::to_string(core) + "_" + std::to_string(layer), |
118 | cur[core], weights[core], bias[core]); |
119 | cur[core] = fc[core]; |
120 | } |
121 | } |
122 | printf("save output" ); |
123 | for (size_t core = 0; core < numCores_; core++) { |
124 | fn->createSave("save" + std::to_string(core), cur[core], output[core]); |
125 | } |
126 | |
127 | for (size_t core = 0; core < numCores_; core++) { |
128 | ::glow::convertPlaceholdersToConstants(fn, bindings_, |
129 | { |
130 | input[core], |
131 | output[core], |
132 | }); |
133 | } |
134 | |
135 | CompilationContext ctx; |
136 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
137 | } |
138 | |
139 | void run() override { |
140 | printf("Running module" ); |
141 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
142 | std::vector<std::future<void>> futures; |
143 | for (auto &runPromise : promises) { |
144 | std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext); |
145 | futures.push_back(runPromise.get_future()); |
146 | hostManager_->runNetwork( |
147 | "singleNode" , std::move(contextPtr), |
148 | [&runPromise](runtime::RunIdentifierTy, Error err, |
149 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
150 | EXIT_ON_ERR(std::move(err)); |
151 | runPromise.set_value(); |
152 | }); |
153 | } |
154 | for (auto &fut : futures) { |
155 | fut.wait(); |
156 | } |
157 | } |
158 | |
159 | void teardown() override {} |
160 | |
161 | double gflops() const { |
162 | return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9; |
163 | } |
164 | }; |
165 | |
166 | int main(int argc, char *argv[]) { |
167 | size_t m = atoi(argv[1]); |
168 | size_t n = atoi(argv[2]); |
169 | size_t numLayers = atoi(argv[3]); |
170 | size_t reps = atoi(argv[4]); |
171 | size_t asyncLaunches = atoi(argv[5]); |
172 | size_t numCores = atoi(argv[6]); |
173 | const char *backendStr = argv[7]; |
174 | char *dev_id = nullptr; |
175 | |
176 | printf("Int8GEMMParallel Microbenchmark\n" ); |
177 | printf( |
178 | "Usage: Int8GemmParallelBench m(Int) n(Int) numLayers(Int) numReps(Int) " |
179 | "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n" ); |
180 | printf("Standard Glow command-line options may be passed via the GLOW_OPTS " |
181 | "environment variable\n" ); |
182 | benchParseGlowOpts(argc, argv); |
183 | assert(argc == 8 || argc == 9); |
184 | if (argc > 8) { |
185 | dev_id = argv[8]; |
186 | printf("Setting backend device: \"%s\"\n" , dev_id); |
187 | } |
188 | printf("Start Int8GemmParallelBench\n" ); |
189 | Int8GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr, |
190 | dev_id); |
191 | auto times = bench(&b, reps); |
192 | for (auto t : times) { |
193 | printf("BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%" |
194 | "2.6lf,%5.2lf\n" , |
195 | m, n, numLayers, reps, asyncLaunches, numCores, backendStr, |
196 | t / asyncLaunches, b.gflops() * asyncLaunches / t); |
197 | } |
198 | double min = *(std::min_element(times.begin(), times.end())); |
199 | size_t midElt = times.size() / 2; |
200 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
201 | double median = times[midElt]; |
202 | double median_runtime = median / ((double)asyncLaunches); |
203 | double min_runtime = min / ((double)asyncLaunches); |
204 | printf("BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%" |
205 | "2.6lf,%2.6lf,%5.2lf, %5.2lf\n" , |
206 | m, n, numLayers, reps, asyncLaunches, numCores, backendStr, |
207 | median_runtime, min_runtime, b.gflops() / median_runtime, |
208 | b.gflops() / min_runtime); |
209 | } |
210 | |