1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>
20
21#include "Bench.h"
22
23#include "glow/ExecutionEngine/ExecutionEngine.h"
24#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
25
26using namespace glow;
27
28/*
29 * Benchmark a number of (m x n) * (n x n) matrix multiplications.
30 * There are a number of parallel FC nodes which are created, one per core.
31 * Each core handles one weight matrix. Then these are
32 * chained together in multiple layers. After each layer, output tensor
33 * is passed to the next layer.
34 */
35class GemmParallelBench : public Benchmark {
36 /// Matrices.
37 std::vector<float> a;
38 std::vector<float> b;
39 std::vector<float> c;
40
41 /// Dimensions expressed in libjit's format.
42 size_t aDims[2];
43 size_t cDims[2];
44 size_t numLayers_;
45 PlaceholderBindings bindings_;
46 std::unique_ptr<runtime::HostManager> hostManager_;
47 size_t asyncLaunchSize_;
48 size_t numCores_;
49 const char *backendStr_;
50 const char *dtypeStr_;
51
52public:
53 GemmParallelBench(size_t m, size_t n, size_t numLayers_,
54 size_t asyncLaunchSize_, size_t numCores_,
55 const char *backendStr_, const char *dtypeStr_)
56 : aDims{m, n}, cDims{m, n}, numLayers_(numLayers_),
57 asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
58 backendStr_(backendStr_), dtypeStr_(dtypeStr_) {}
59
60 void setup() override {
61
62 // Setup host manager
63 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
64 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
65 configs.push_back(std::move(config));
66 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
67 dim_t m = cDims[0];
68 dim_t n = cDims[1];
69 dim_t k = aDims[1];
70 a.resize(m * k);
71 b.resize(k * n);
72 c.resize(m * n);
73
74 ElemKind dtype = ElemKind::Float16Ty;
75 if (std::string(dtypeStr_) == "Float16") {
76 dtype = ElemKind::Float16Ty;
77 } else if (std::string(dtypeStr_) == "Float32") {
78 dtype = ElemKind::FloatTy;
79 }
80
81 std::unique_ptr<Module> mod(new Module);
82 auto fn = mod->createFunction("singleNode");
83
84 std::vector<Node *> cur(numCores_);
85 std::vector<Placeholder *> weights(numCores_);
86 std::vector<Placeholder *> bias(numCores_);
87 std::vector<Node *> fc(numCores_);
88 std::vector<Placeholder *> input(numCores_);
89 std::vector<Placeholder *> output(numCores_);
90
91 for (size_t core = 0; core < numCores_; core++) {
92 input[core] = mod->createPlaceholder(
93 dtype, {m, k}, "input" + std::to_string(core), false);
94 output[core] = mod->createPlaceholder(
95 dtype, {m, n}, "output" + std::to_string(core), false);
96 cur[core] = input[core];
97 }
98
99 for (size_t layer = 0; layer < numLayers_; layer++) {
100 for (size_t core = 0; core < numCores_; core++) {
101 weights[core] = mod->createPlaceholder(
102 dtype, {k, n}, "weights" + std::to_string(core), false);
103 bias[core] = mod->createPlaceholder(
104 dtype, {n}, "bias" + std::to_string(core), false);
105 bindings_.allocate(weights[core])
106 ->getHandle<float16_t>()
107 .randomize(-128.f, 128.f, mod->getPRNG());
108 bindings_.allocate(bias[core])
109 ->getHandle<float16_t>()
110 .randomize(-128.f, 128.f, mod->getPRNG());
111 fc[core] = fn->createFullyConnected(
112 "fc" + std::to_string(core) + "_" + std::to_string(layer),
113 cur[core], weights[core], bias[core]);
114 cur[core] = fc[core];
115 }
116 }
117 for (size_t core = 0; core < numCores_; core++) {
118 fn->createSave("save" + std::to_string(core), cur[core], output[core]);
119 }
120
121 for (size_t core = 0; core < numCores_; core++) {
122 ::glow::convertPlaceholdersToConstants(fn, bindings_,
123 {
124 input[core],
125 output[core],
126 });
127 }
128 CompilationContext ctx;
129 EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
130 }
131
132 void run() override {
133 std::vector<std::promise<void>> promises(asyncLaunchSize_);
134 std::vector<std::future<void>> futures;
135 for (auto &runPromise : promises) {
136 std::unique_ptr<ExecutionContext> contextPtr(new ExecutionContext);
137 futures.push_back(runPromise.get_future());
138 hostManager_->runNetwork(
139 "singleNode", std::move(contextPtr),
140 [&runPromise](runtime::RunIdentifierTy, Error err,
141 std::unique_ptr<ExecutionContext> /* contextPtr */) {
142 EXIT_ON_ERR(std::move(err));
143 runPromise.set_value();
144 });
145 }
146 for (auto &fut : futures) {
147 fut.wait();
148 }
149 }
150
151 void teardown() override {}
152
153 double gflops() const {
154 return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9;
155 }
156};
157
158int main(int argc, char *argv[]) {
159 benchParseGlowOpts(argc, argv);
160 assert(argc == 9);
161 size_t m = atoi(argv[1]);
162 size_t n = atoi(argv[2]);
163 size_t numLayers = atoi(argv[3]);
164 size_t reps = atoi(argv[4]);
165 size_t asyncLaunches = atoi(argv[5]);
166 size_t numCores = atoi(argv[6]);
167 const char *backendStr = argv[7];
168 const char *dtypeStr = argv[8];
169
170 GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr,
171 dtypeStr);
172 auto times = bench(&b, reps);
173 for (auto t : times) {
174 printf(
175 "BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%"
176 "2.6lf,%5.2lf\n",
177 m, n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr,
178 t / asyncLaunches, b.gflops() * asyncLaunches / t);
179 }
180 double min = *(std::min_element(times.begin(), times.end()));
181 size_t midElt = times.size() / 2;
182 std::nth_element(times.begin(), times.begin() + midElt, times.end());
183 double median = times[midElt];
184 double median_runtime = median / ((double)asyncLaunches);
185 double min_runtime = min / ((double)asyncLaunches);
186 printf(
187 "BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%"
188 "2.6lf,%2.6lf,%5.2lf, %5.2lf\n",
189 m, n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr,
190 median_runtime, min_runtime, b.gflops() / median_runtime,
191 b.gflops() / min_runtime);
192}
193