1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
18 | #include <future> |
19 | #include <random> |
20 | |
21 | #include "Bench.h" |
22 | |
23 | #include "glow/ExecutionEngine/ExecutionEngine.h" |
24 | #include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h" |
25 | |
26 | using namespace glow; |
27 | |
28 | /* |
29 | * Benchmark a number of (m x n) * (n x n) matrix multiplications. |
30 | * There are a number of parallel FC nodes which are created, one per core. |
31 | * Each core handles one weight matrix. Then these are |
32 | * chained together in multiple layers. After each layer, output tensor |
33 | * is passed to the next layer. |
34 | */ |
35 | class GemmParallelBench : public Benchmark { |
  /// Host-side matrix buffers (sized in setup()).
37 | std::vector<float> a; |
38 | std::vector<float> b; |
39 | std::vector<float> c; |
40 | |
41 | /// Dimensions expressed in libjit's format. |
42 | size_t aDims[2]; |
43 | size_t cDims[2]; |
44 | size_t numLayers_; |
45 | PlaceholderBindings bindings_; |
46 | std::unique_ptr<runtime::HostManager> hostManager_; |
47 | size_t asyncLaunchSize_; |
48 | size_t numCores_; |
49 | const char *backendStr_; |
50 | const char *dtypeStr_; |
51 | |
52 | public: |
  GemmParallelBench(size_t m, size_t n, size_t numLayers,
                    size_t asyncLaunchSize, size_t numCores,
                    const char *backendStr, const char *dtypeStr)
      : aDims{m, n}, cDims{m, n}, numLayers_(numLayers),
        asyncLaunchSize_(asyncLaunchSize), numCores_(numCores),
        backendStr_(backendStr), dtypeStr_(dtypeStr) {}
59 | |
  void setup() override {
    // Set up a HostManager with a single device of the requested backend.
63 | std::vector<std::unique_ptr<runtime::DeviceConfig>> configs; |
64 | auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_); |
65 | configs.push_back(std::move(config)); |
66 | hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs)); |
67 | dim_t m = cDims[0]; |
68 | dim_t n = cDims[1]; |
69 | dim_t k = aDims[1]; |
70 | a.resize(m * k); |
71 | b.resize(k * n); |
72 | c.resize(m * n); |
73 | |
    // Parse the requested element type; unrecognized strings fall back to
    // Float16.
    ElemKind dtype = ElemKind::Float16Ty;
    if (std::string(dtypeStr_) == "Float16") {
      dtype = ElemKind::Float16Ty;
    } else if (std::string(dtypeStr_) == "Float32") {
      dtype = ElemKind::FloatTy;
    }
80 | |
    auto mod = glow::make_unique<Module>();
    auto fn = mod->createFunction("singleNode");
83 | |
84 | std::vector<Node *> cur(numCores_); |
85 | std::vector<Placeholder *> weights(numCores_); |
86 | std::vector<Placeholder *> bias(numCores_); |
87 | std::vector<Node *> fc(numCores_); |
88 | std::vector<Placeholder *> input(numCores_); |
89 | std::vector<Placeholder *> output(numCores_); |
90 | |
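    // Create per-core input and output placeholders. `cur` tracks the head of
    // each core's chain as layers are appended.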
91 | for (size_t core = 0; core < numCores_; core++) { |
92 | input[core] = mod->createPlaceholder( |
93 | dtype, {m, k}, "input" + std::to_string(core), false); |
94 | output[core] = mod->createPlaceholder( |
95 | dtype, {m, n}, "output" + std::to_string(core), false); |
96 | cur[core] = input[core]; |
97 | } |
98 | |
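    // Append numLayers_ FullyConnected layers to each core's chain; every
    // layer gets freshly randomized weights and bias.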
99 | for (size_t layer = 0; layer < numLayers_; layer++) { |
100 | for (size_t core = 0; core < numCores_; core++) { |
        weights[core] = mod->createPlaceholder(
            dtype, {k, n}, "weights" + std::to_string(core), false);
        bias[core] = mod->createPlaceholder(
            dtype, {n}, "bias" + std::to_string(core), false);
        // The handle's template type must match the placeholder's element
        // kind, so branch on the parsed dtype when randomizing.
        if (dtype == ElemKind::FloatTy) {
          bindings_.allocate(weights[core])->getHandle<float>().randomize(
              -128.f, 128.f, mod->getPRNG());
          bindings_.allocate(bias[core])->getHandle<float>().randomize(
              -128.f, 128.f, mod->getPRNG());
        } else {
          bindings_.allocate(weights[core])->getHandle<float16_t>().randomize(
              -128.f, 128.f, mod->getPRNG());
          bindings_.allocate(bias[core])->getHandle<float16_t>().randomize(
              -128.f, 128.f, mod->getPRNG());
        }
111 | fc[core] = fn->createFullyConnected( |
112 | "fc" + std::to_string(core) + "_" + std::to_string(layer), |
113 | cur[core], weights[core], bias[core]); |
114 | cur[core] = fc[core]; |
115 | } |
116 | } |
117 | for (size_t core = 0; core < numCores_; core++) { |
118 | fn->createSave("save" + std::to_string(core), cur[core], output[core]); |
119 | } |
120 | |
    // Convert the weight and bias placeholders (the ones backed by tensors in
    // bindings_) into constants, keeping every core's input and output as
    // placeholders.
    std::vector<Placeholder *> keep;
    for (size_t core = 0; core < numCores_; core++) {
      keep.push_back(input[core]);
      keep.push_back(output[core]);
    }
    ::glow::convertPlaceholdersToConstants(fn, bindings_, keep);
128 | CompilationContext ctx; |
129 | EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx)); |
130 | } |
131 | |
132 | void run() override { |
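    // Launch asyncLaunchSize_ inference requests concurrently; each request
    // owns an ExecutionContext and fulfills its promise on completion.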
133 | std::vector<std::promise<void>> promises(asyncLaunchSize_); |
134 | std::vector<std::future<void>> futures; |
135 | for (auto &runPromise : promises) { |
      auto contextPtr = glow::make_unique<ExecutionContext>();
137 | futures.push_back(runPromise.get_future()); |
138 | hostManager_->runNetwork( |
139 | "singleNode" , std::move(contextPtr), |
140 | [&runPromise](runtime::RunIdentifierTy, Error err, |
141 | std::unique_ptr<ExecutionContext> /* contextPtr */) { |
142 | EXIT_ON_ERR(std::move(err)); |
143 | runPromise.set_value(); |
144 | }); |
145 | } |
146 | for (auto &fut : futures) { |
147 | fut.wait(); |
148 | } |
149 | } |
150 | |
151 | void teardown() override {} |
152 | |
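  // Total floating-point work per request, in GFLOPs: each of the numLayers_
  // FC layers on each of the numCores_ cores performs an (m x k) * (k x n)
  // matmul, i.e. 2*m*n*k FLOPs (a multiply and an add per output element),
  // ignoring the bias add.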
153 | double gflops() const { |
154 | return 2.0 * cDims[0] * cDims[1] * aDims[1] * numLayers_ * numCores_ / 1e9; |
155 | } |
156 | }; |
157 | |
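// Expected positional arguments:
//   m n numLayers reps asyncLaunches numCores backendStr dtypeStr
// Hypothetical example invocation:
//   GemmParallelBench 128 1024 4 10 2 8 CPU Float16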
158 | int main(int argc, char *argv[]) { |
159 | benchParseGlowOpts(argc, argv); |
160 | assert(argc == 9); |
161 | size_t m = atoi(argv[1]); |
162 | size_t n = atoi(argv[2]); |
163 | size_t numLayers = atoi(argv[3]); |
164 | size_t reps = atoi(argv[4]); |
165 | size_t asyncLaunches = atoi(argv[5]); |
166 | size_t numCores = atoi(argv[6]); |
167 | const char *backendStr = argv[7]; |
168 | const char *dtypeStr = argv[8]; |
169 | |
170 | GemmParallelBench b(m, n, numLayers, asyncLaunches, numCores, backendStr, |
171 | dtypeStr); |
172 | auto times = bench(&b, reps); |
173 | for (auto t : times) { |
174 | printf( |
175 | "BenchResult,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%" |
176 | "2.6lf,%5.2lf\n" , |
177 | m, n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr, |
178 | t / asyncLaunches, b.gflops() * asyncLaunches / t); |
179 | } |
180 | double min = *(std::min_element(times.begin(), times.end())); |
181 | size_t midElt = times.size() / 2; |
182 | std::nth_element(times.begin(), times.begin() + midElt, times.end()); |
183 | double median = times[midElt]; |
184 | double median_runtime = median / ((double)asyncLaunches); |
185 | double min_runtime = min / ((double)asyncLaunches); |
186 | printf( |
187 | "BenchSummary,GemmParallelBench,SW,%4zu,%4zu,%4zu,%4zu,%4zu,%4zu,%s,%s,%" |
188 | "2.6lf,%2.6lf,%5.2lf, %5.2lf\n" , |
189 | m, n, numLayers, reps, asyncLaunches, numCores, backendStr, dtypeStr, |
190 | median_runtime, min_runtime, b.gflops() / median_runtime, |
191 | b.gflops() / min_runtime); |
192 | } |
193 | |