/**
 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>

#include "Bench.h"

#include "glow/ExecutionEngine/ExecutionEngine.h"
#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"

using namespace glow;
/*
 * Benchmark int8 (m x n) * (n x n) matrix multiplications.
 * One FullyConnected node is created per core so the cores run in parallel,
 * each with its own weight matrix. These FC nodes are then chained into
 * multiple layers: each layer's output tensor feeds the next layer's input.
 */
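// Example invocation (binary name assumed; the eighth argument, dev_id, is
// optional):
//   ./Int8GemmParallelBench 256 1024 4 10 2 6 CPU
// This runs 4 chained layers of (256 x 1024) * (1024 x 1024) int8 GEMMs on
// 6 cores, with 2 async launches per repetition and 10 repetitions, on the
// CPU backend.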
class Int8GemmParallelBench : public Benchmark {
  /// Matrices.
  std::vector<float> a;
  std::vector<float> b;
  std::vector<float> c;

  /// Dimensions expressed in libjit's format.
  size_t aDims[2];
  size_t cDims[2];
  size_t numLayers_;
  PlaceholderBindings bindings_;
  std::unique_ptr<runtime::HostManager> hostManager_;
  size_t asyncLaunchSize_;
  size_t numCores_;
  const char *backendStr_;
  const char *devId_;

public:
  Int8GemmParallelBench(size_t m, size_t n, size_t numLayers,
                        size_t asyncLaunchSize, size_t numCores,
                        const char *backendStr, const char *devId)
      : aDims{m, n}, cDims{m, n}, numLayers_(numLayers),
        asyncLaunchSize_(asyncLaunchSize), numCores_(numCores),
        backendStr_(backendStr), devId_(devId) {}
59
60 void setup() override {
61
62 // Setup host manager
63 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
64 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
65 if (devId_ != nullptr) {
66 config->parameters["DeviceID"] = devId_;
67 }
68 configs.push_back(std::move(config));
69 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
70 printf("set up host manager\n");
71
72 dim_t m = cDims[0];
73 dim_t n = cDims[1];
74 dim_t k = aDims[1];
75 a.resize(m * k);
76 b.resize(k * n);
77 c.resize(m * n);
78
79 std::unique_ptr<Module> mod(new Module);
80 auto fn = mod->createFunction("singleNode");
81 printf("set up module \n");
82
83 std::vector<Node *> cur(numCores_);
84 std::vector<Placeholder *> weights(numCores_);
85 std::vector<Placeholder *> bias(numCores_);
86 std::vector<Node *> fc(numCores_);
87 std::vector<Placeholder *> input(numCores_);
88 std::vector<Placeholder *> output(numCores_);
89
90 printf("set up inputs and outputs");
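    // Each core gets its own int8 input/output placeholder (scale 1.0,
    // offset 0); cur[core] tracks the head of that core's chain of nodes.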
    for (size_t core = 0; core < numCores_; core++) {
      input[core] =
          mod->createPlaceholder(ElemKind::Int8QTy, {m, k}, 1.0, 0,
                                 "input_" + std::to_string(core), false);
      output[core] =
          mod->createPlaceholder(ElemKind::Int8QTy, {m, n}, 1.0, 0,
                                 "output_" + std::to_string(core), false);
      cur[core] = input[core];
    }

    printf("set up weights and bias\n");
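    // Build numLayers_ chained FC layers per core: each layer gets fresh
    // randomized weights and biases, and cur[core] threads one layer's
    // output into the next layer's input.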
    for (size_t layer = 0; layer < numLayers_; layer++) {
      for (size_t core = 0; core < numCores_; core++) {
        weights[core] =
            mod->createPlaceholder(ElemKind::Int8QTy, {k, n}, 1.0, 0,
                                   "weights_" + std::to_string(core), false);
        bias[core] =
            mod->createPlaceholder(ElemKind::Int32QTy, {n}, 1.0, 0,
                                   "bias_" + std::to_string(core), false);
        bindings_.allocate(weights[core])
            ->getHandle<int8_t>()
            .randomize(0, 128, mod->getPRNG());
        bindings_.allocate(bias[core])
            ->getHandle<int32_t>()
            .randomize(0, 128, mod->getPRNG());
        fc[core] = fn->createFullyConnected(
            "fc" + std::to_string(core) + "_" + std::to_string(layer),
            cur[core], weights[core], bias[core]);
        cur[core] = fc[core];
      }
    }
    printf("save output\n");
    for (size_t core = 0; core < numCores_; core++) {
      fn->createSave("save" + std::to_string(core), cur[core], output[core]);
    }

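    // Freeze everything except each core's input/output placeholder into
    // Constants, so the compiler can treat the weights and biases as fixed
    // data.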
    for (size_t core = 0; core < numCores_; core++) {
      ::glow::convertPlaceholdersToConstants(fn, bindings_,
                                             {input[core], output[core]});
    }

    CompilationContext ctx;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

  void run() override {
    printf("Running module\n");
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;
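    // Issue all launches without waiting; each completion callback fulfills
    // its promise, and the futures below block until every run has finished.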
    for (auto &runPromise : promises) {
      std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
      futures.push_back(runPromise.get_future());
      hostManager_->runNetwork(
          "singleNode", std::move(contextPtr),
          [&runPromise](runtime::RunIdentifierTy, Error err,
                        std::unique_ptr<ExecutionContext> /* contextPtr */) {
            EXIT_ON_ERR(std::move(err));
            runPromise.set_value();
          });
    }
    for (auto &fut : futures) {
      fut.wait();
    }
  }

  void teardown() override {}

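  /// Total GFLOPs per launch: each FC is an (m x k) * (k x n) matmul, i.e.
  /// roughly 2*m*n*k FLOPs (one multiply and one accumulate per element),
  /// replicated across numLayers_ layers and numCores_ cores. Bias adds are
  /// ignored.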
  double gflops() const {
    return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9;
  }
};

int main(int argc, char *argv[]) {
  printf("Int8GEMMParallel Microbenchmark\n");
  printf(
      "Usage: Int8GemmParallelBench m(Int) n(Int) numLayers(Int) numReps(Int) "
      "numAsyncLaunches(Int) numCores(Int) backendStr(String) dev_id(Int)\n");
  printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
         "environment variable\n");
  benchParseGlowOpts(argc, argv);
  assert(argc == 8 || argc == 9);

  // Parse the positional arguments only after validating argc, so we never
  // read past the end of argv.
  size_t m = atoi(argv[1]);
  size_t n = atoi(argv[2]);
  size_t numLayers = atoi(argv[3]);
  size_t reps = atoi(argv[4]);
  size_t asyncLaunches = atoi(argv[5]);
  size_t numCores = atoi(argv[6]);
  const char *backendStr = argv[7];
  char *dev_id = nullptr;
  if (argc > 8) {
    dev_id = argv[8];
    printf("Setting backend device: \"%s\"\n", dev_id);
  }

  printf("Start Int8GemmParallelBench\n");
  Int8GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr,
                          dev_id);
  auto times = bench(&b, reps);
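  // One BenchResult line per repetition: the input parameters, followed by
  // the runtime per async launch and the achieved GFLOPS for that repetition.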
  for (auto t : times) {
    printf("BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%"
           "2.6lf,%5.2lf\n",
           m, n, numLayers, reps, asyncLaunches, numCores, backendStr,
           t / asyncLaunches, b.gflops() * asyncLaunches / t);
  }
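  // Summarize across repetitions: nth_element partially sorts times to pick
  // the median, and both median and min are normalized per async launch.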
  double min = *(std::min_element(times.begin(), times.end()));
  size_t midElt = times.size() / 2;
  std::nth_element(times.begin(), times.begin() + midElt, times.end());
  double median = times[midElt];
  double median_runtime = median / ((double)asyncLaunches);
  double min_runtime = min / ((double)asyncLaunches);
  printf("BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%"
         "2.6lf,%2.6lf,%5.2lf, %5.2lf\n",
         m, n, numLayers, reps, asyncLaunches, numCores, backendStr,
         median_runtime, min_runtime, b.gflops() / median_runtime,
         b.gflops() / min_runtime);
}