1/**
2 * Copyright (c) 2017-present, Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#include <algorithm>
17#include <array>
18#include <cstdlib>
19#include <future>
20#include <random>
21
22#include "Bench.h"
23
24#include "glow/ExecutionEngine/ExecutionEngine.h"
25#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26
27using namespace glow;
28
29/*
30 * This class implements a batch GEMM microbenchmark. Each layer contains a
31 * batch of (m x m) * (m x n) matrix multiplications. There are a number of
32 * layers which do successive GEMMs on the intermediate outputs (RHS) and
33 * inputs (LHS)
34 *
35 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
37 * end-to-end workloads.
38 */
class BatchGemmBench : public Benchmark {
  dim_t batchSize_; // Total GEMMs per layer, split across numCores_ chains.
  dim_t m_;         // LHS matrices are (m x m).
  dim_t n_;         // RHS matrices are (m x n).
  dim_t numLayers_; // Number of chained batch GEMMs per core.
  std::unique_ptr<runtime::HostManager> hostManager_;
  // One ExecutionContext per concurrent request; reused across run() calls.
  std::vector<std::unique_ptr<ExecutionContext>> contexts_;
  dim_t asyncLaunchSize_; // Number of requests kept in flight per run().
  dim_t numCores_;        // Number of independent batch-GEMM chains.
  const char *backendStr_;
  ElemKind dtype_;
  dim_t elementSize_; // Bytes per element of dtype_ (2 or 4).
  const char *devId_;

public:
  /// \p dtypeStr_ selects "Float16" or "Float32"; any other value silently
  /// falls back to Float16. \p devId_, if non-null, is forwarded to the
  /// device config as the "DeviceID" parameter.
  BatchGemmBench(dim_t batchSize_, dim_t m_, dim_t n_, dim_t numLayers_,
                 dim_t asyncLaunchSize_, dim_t numCores_,
                 const char *backendStr_, const char *dtypeStr_,
                 const char *devId_ = nullptr)
      : batchSize_(batchSize_), m_(m_), n_(n_), numLayers_(numLayers_),
        asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
        backendStr_(backendStr_), devId_(devId_) {

    // Default to Float16 when dtypeStr_ is not a recognized value.
    dtype_ = ElemKind::Float16Ty;
    elementSize_ = 2;
    if (std::string(dtypeStr_) == "Float16") {
      dtype_ = ElemKind::Float16Ty;
      elementSize_ = 2;
    } else if (std::string(dtypeStr_) == "Float32") {
      dtype_ = ElemKind::FloatTy;
      elementSize_ = 4;
    }
  }

  /// Builds one chain of numLayers_ batch GEMMs per core, allocates and
  /// randomizes the input tensors in every execution context, and compiles
  /// the module on the host manager under the network name "singleNode".
  void setup() override {

    // Create execution contexts here
    for (dim_t i = 0; i < asyncLaunchSize_; i++) {
      std::unique_ptr<ExecutionContext> context(new ExecutionContext);
      contexts_.push_back(std::move(context));
    }

    // Setup host manager
    std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
    auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
    if (devId_ != nullptr) {
      config->parameters["DeviceID"] = devId_;
    }
    configs.push_back(std::move(config));
    hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));

    std::unique_ptr<Module> mod(new Module);
    auto fn = mod->createFunction("singleNode");

    std::vector<Placeholder *> A(numCores_);
    std::vector<Placeholder *> B(numCores_);
    std::vector<SaveNode *> S(numCores_);

    // Calculate the batch size per core
    auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);

    // Cores whose share of the batch is zero get no placeholders/graph; the
    // same zero check below skips them consistently.
    for (dim_t core = 0; core < numCores_; core++) {
      if (batchSizePerCore[core] == 0)
        continue;
      A[core] = mod->createPlaceholder(dtype_, {batchSizePerCore[core], m_, m_},
                                       "A" + std::to_string(core), false);
      B[core] = mod->createPlaceholder(dtype_, {batchSizePerCore[core], m_, n_},
                                       "B" + std::to_string(core), false);
    }

    // for each context, add input bindings
    for (dim_t core = 0; core < numCores_; core++) {
      if (batchSizePerCore[core] == 0)
        continue;
      for (dim_t i = 0; i < asyncLaunchSize_; i++) {
        if (dtype_ == ElemKind::FloatTy) {
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(A[core])
              ->getHandle<float>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(B[core])
              ->getHandle<float>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
        } else if (dtype_ == ElemKind::Float16Ty) {
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(A[core])
              ->getHandle<float16_t>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
          contexts_[i]
              ->getPlaceholderBindings()
              ->allocate(B[core])
              ->getHandle<float16_t>()
              .randomize(0.0f, 1.0f, mod->getPRNG());
        }
      }

      // Chain the GEMMs: each layer multiplies A[core] by the previous
      // layer's output (the first layer uses B[core] as the RHS).
      Node *cur = B[core];
      for (dim_t layer = 0; layer < numLayers_; layer++) {
        auto *bmm = fn->createBatchMatMul(
            "batchmatmul" + std::to_string(layer) + "_" + std::to_string(core),
            A[core], cur);
        cur = bmm;
      }

      S[core] = fn->createSave("save" + std::to_string(core), cur);

      // for each context, add output bindings
      for (dim_t i = 0; i < asyncLaunchSize_; i++) {
        contexts_[i]->getPlaceholderBindings()->allocate(
            S[core]->getPlaceholder());
      }
    }

    CompilationContext ctx;
    ctx.dumpFinalGraph = true;
    EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
  }

  /// Launches asyncLaunchSize_ concurrent requests and blocks until all
  /// complete. Each context is moved into runNetwork and recovered from the
  /// completion callback so the contexts can be reused by the next run().
  void run() override {
    // Callbacks deposit the returned contexts here, indexed by request.
    std::vector<std::unique_ptr<ExecutionContext>> localContexts(
        asyncLaunchSize_);
    std::vector<std::promise<void>> promises(asyncLaunchSize_);
    std::vector<std::future<void>> futures;

    // Launch a number of independent requests
    int i = 0;
    for (auto &promise : promises) {
      futures.push_back(promise.get_future());
      // Capturing &promise is safe: promises is not resized while requests
      // are in flight, and we wait on every future before returning.
      hostManager_->runNetwork(
          "singleNode", std::move(contexts_[i]),
          [&localContexts, &promise,
           i](runtime::RunIdentifierTy, Error err,
              std::unique_ptr<ExecutionContext> contextPtr) {
            EXIT_ON_ERR(std::move(err));
            localContexts[i] = std::move(contextPtr);
            promise.set_value();
          });
      i++;
    }
    for (auto &fut : futures) {
      fut.wait();
    }
    // Restore ownership of the contexts for the next iteration.
    for (dim_t j = 0; j < asyncLaunchSize_; j++) {
      contexts_[j] = std::move(localContexts[j]);
    }
  }

  void teardown() override {}

  // Total GFLOPs for one request: each (m x m) * (m x n) GEMM costs
  // 2*m*m*n FLOPs, times numLayers_ layers and batchSize_ GEMMs per layer.
  double gflops() const {
    return 2.0 * m_ * m_ * n_ * numLayers_ * batchSize_ / 1e9;
  }
};
197
198int main(int argc, char *argv[]) {
199 printf("BatchGEMM Microbenchmark\n");
200 printf("Usage: BatchGemmBench batchSize(Int) m(Int) n(Int) numLayers(Int) "
201 "numReps(Int) numAsyncLaunches(Int) numBatchGEMMChains(Int) "
202 "backendStr(String) dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n");
203 printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
204 "environment variable\n");
205 benchParseGlowOpts(argc, argv);
206
207 assert(argc == 10 || argc == 11);
208 size_t batchSize = atoi(argv[1]);
209 size_t m = atoi(argv[2]);
210 size_t n = atoi(argv[3]);
211 size_t numLayers = atoi(argv[4]);
212 size_t numReps = atoi(argv[5]);
213 size_t numAsyncLaunches = atoi(argv[6]);
214 size_t numCores = atoi(argv[7]);
215 const char *backendStr = argv[8];
216 const char *dtypeStr = argv[9];
217 char *dev_id = nullptr;
218
219 if (argc > 10) {
220 dev_id = argv[10];
221 printf("Setting backend device: \"%s\"\n", dev_id);
222 }
223
224 assert(numReps > 0);
225
226 BatchGemmBench b(batchSize, m, n, numLayers, numAsyncLaunches, numCores,
227 backendStr, dtypeStr, dev_id);
228
229 auto times = bench(&b, numReps);
230 printf("_,benchName,_,batchSize,m,n,numLayers,numReps,numAsyncLaunches,"
231 "numBatchGEMMChains,backendStr,dtypeStr,runtime,gflopsPerSec\n");
232 for (auto t : times) {
233 printf("BenchResult,BatchGemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,"
234 "%f\n",
235 batchSize, m, n, numLayers, numReps, numAsyncLaunches, numCores,
236 backendStr, dtypeStr, t / numAsyncLaunches,
237 b.gflops() * numAsyncLaunches / t);
238 }
239 double min = *(std::min_element(times.begin(), times.end()));
240 size_t midElt = times.size() / 2;
241 std::nth_element(times.begin(), times.begin() + midElt, times.end());
242 double median = times[midElt];
243 double median_runtime = median / ((double)numAsyncLaunches);
244 double min_runtime = min / ((double)numAsyncLaunches);
245 printf("_,benchName,_,batchSize,m,n,numLayers,numReps,numAsyncLaunches,"
246 "numBatchGEMMChains,backendStr,dtypeStr,medianRuntime,minRuntime,"
247 "medianGflopsPerSec,maxGflopsPerSec\n");
248 printf("BenchSummary,BatchGemmBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%"
249 "f,%f,%"
250 "f\n",
251 batchSize, m, n, numLayers, numReps, numAsyncLaunches, numCores,
252 backendStr, dtypeStr, median_runtime, min_runtime,
253 b.gflops() / median_runtime, b.gflops() / min_runtime);
254}
255