1/**
2 * Copyright (c) 2017-present, Facebook, Inc.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
#include <algorithm>
#include <array>
#include <cassert>
#include <cstdio>
#include <cstdlib>
#include <future>
#include <random>
21
22#include "Bench.h"
23
24#include "glow/ExecutionEngine/ExecutionEngine.h"
25#include "glow/Optimizer/GraphOptimizer/GraphOptimizer.h"
26
27using namespace glow;
28
29/*
30 * This class implements a transpose microbenchmark. There are multiple
31 * layers of transpose, followed by an Add with the tensor from the previous
32 * layer.
33 *
34 * Microbenchmarks are generally useful for understanding performance
 * through targeted experimentation and are not representative of
36 * end-to-end workloads.
37 */
38class TransposeBench : public Benchmark {
39 dim_t batchSize_;
40 dim_t n_;
41 dim_t numLayers_;
42 std::unique_ptr<runtime::HostManager> hostManager_;
43 std::vector<std::unique_ptr<ExecutionContext>> contexts_;
44 dim_t asyncLaunchSize_;
45 dim_t numCores_;
46 const char *backendStr_;
47 ElemKind dtype_;
48 dim_t elementSize_;
49 const char *devId_;
50
51public:
52 TransposeBench(dim_t batchSize_, dim_t n_, dim_t numLayers_,
53 dim_t asyncLaunchSize_, dim_t numCores_,
54 const char *backendStr_, const char *dtypeStr_,
55 const char *devId_ = nullptr)
56 : batchSize_(batchSize_), n_(n_), numLayers_(numLayers_),
57 asyncLaunchSize_(asyncLaunchSize_), numCores_(numCores_),
58 backendStr_(backendStr_), devId_(devId_) {
59
60 dtype_ = ElemKind::Float16Ty;
61 elementSize_ = 2;
62 if (std::string(dtypeStr_) == "Float16") {
63 dtype_ = ElemKind::Float16Ty;
64 elementSize_ = 2;
65 } else if (std::string(dtypeStr_) == "Float32") {
66 dtype_ = ElemKind::FloatTy;
67 elementSize_ = 4;
68 }
69 }
70
71 void setup() override {
72
73 // Create execution contexts here
74 for (dim_t i = 0; i < asyncLaunchSize_; i++) {
75 std::unique_ptr<ExecutionContext> context(new ExecutionContext);
76 contexts_.push_back(std::move(context));
77 }
78
79 // Setup host manager
80 std::vector<std::unique_ptr<runtime::DeviceConfig>> configs;
81 auto config = glow::make_unique<runtime::DeviceConfig>(backendStr_);
82 if (devId_ != nullptr) {
83 config->parameters["DeviceID"] = devId_;
84 }
85 configs.push_back(std::move(config));
86 hostManager_ = glow::make_unique<runtime::HostManager>(std::move(configs));
87
88 std::unique_ptr<Module> mod(new Module);
89 auto fn = mod->createFunction("singleNode");
90
91 std::vector<Placeholder *> input(numCores_);
92 std::vector<SaveNode *> S(numCores_);
93 auto batchSizePerCore = getBatchSizePerCore(batchSize_, numCores_);
94
95 for (dim_t core = 0; core < numCores_; core++) {
96 if (batchSizePerCore[core] == 0)
97 continue;
98 input[core] =
99 mod->createPlaceholder(dtype_, {batchSizePerCore[core], n_, n_},
100 "A" + std::to_string(core), false);
101 }
102
103 // Create multiple chains of Transpose and Add nodes
104 for (dim_t core = 0; core < numCores_; core++) {
105 if (batchSizePerCore[core] == 0)
106 continue;
107 // for each context, add input bindings
108 for (dim_t i = 0; i < asyncLaunchSize_; i++) {
109 if (dtype_ == ElemKind::FloatTy) {
110 contexts_[i]
111 ->getPlaceholderBindings()
112 ->allocate(input[core])
113 ->getHandle<float>()
114 .randomize(0.0f, 1.0f, mod->getPRNG());
115 } else if (dtype_ == ElemKind::Float16Ty) {
116 contexts_[i]
117 ->getPlaceholderBindings()
118 ->allocate(input[core])
119 ->getHandle<float16_t>()
120 .randomize(0.0f, 1.0f, mod->getPRNG());
121 }
122 }
123
124 Node *cur = input[core];
125 for (dim_t layer = 0; layer < numLayers_; layer++) {
126 auto *xp = fn->createTranspose("transpose_" + std::to_string(layer) +
127 "_" + std::to_string(core),
128 cur, {0, 2, 1});
129 auto *ad = fn->createAdd("add_" + std::to_string(layer) + "_" +
130 std::to_string(core),
131 cur, xp);
132 cur = ad;
133 }
134
135 S[core] = fn->createSave("save", cur);
136
137 // for each context, allocate output
138 for (dim_t i = 0; i < asyncLaunchSize_; i++) {
139 contexts_[i]->getPlaceholderBindings()->allocate(
140 S[core]->getPlaceholder());
141 }
142 }
143
144 CompilationContext ctx;
145 EXIT_ON_ERR(hostManager_->addNetwork(std::move(mod), ctx));
146 }
147
148 void run() override {
149 std::vector<std::unique_ptr<ExecutionContext>> localContexts(
150 asyncLaunchSize_);
151 std::vector<std::promise<void>> promises(asyncLaunchSize_);
152 std::vector<std::future<void>> futures;
153
154 // Launch a number of independent requests
155 int i = 0;
156 for (auto &promise : promises) {
157 futures.push_back(promise.get_future());
158 hostManager_->runNetwork(
159 "singleNode", std::move(contexts_[i]),
160 [&localContexts, &promise,
161 i](runtime::RunIdentifierTy, Error err,
162 std::unique_ptr<ExecutionContext> contextPtr) {
163 EXIT_ON_ERR(std::move(err));
164 localContexts[i] = std::move(contextPtr);
165 promise.set_value();
166 });
167 i++;
168 }
169 for (auto &fut : futures) {
170 fut.wait();
171 }
172 for (dim_t j = 0; j < asyncLaunchSize_; j++) {
173 contexts_[j] = std::move(localContexts[j]);
174 }
175 }
176
177 void teardown() override {}
178
179 // Each layer reads the tensor thrice, and writes the tensor twice
180 double gbytes() const {
181 return (5.0 * numLayers_ * batchSize_ * n_ * n_ * elementSize_) / 1e9;
182 }
183};
184
185int main(int argc, char *argv[]) {
186 printf("Transpose Microbenchmark\n");
187 printf("Usage: TransposeBench batchSize(Int) n(Int) numLayers(Int) "
188 "numReps(Int) numAsyncLaunches(Int) numTransposeChains(Int) "
189 "backendStr(String) dtypeStr(\"Float16\"|\"Float32\") dev_id(Int)\n");
190 printf("Standard Glow command-line options may be passed via the GLOW_OPTS "
191 "environment variable\n");
192 benchParseGlowOpts(argc, argv);
193 assert(argc == 9 || argc == 10);
194 size_t batchSize = atoi(argv[1]);
195 size_t n = atoi(argv[2]);
196 size_t numLayers = atoi(argv[3]);
197 size_t numReps = atoi(argv[4]);
198 size_t numAsyncLaunches = atoi(argv[5]);
199 size_t numCores = atoi(argv[6]);
200 const char *backendStr = argv[7];
201 const char *dtypeStr = argv[8];
202 char *dev_id = nullptr;
203
204 if (argc > 9) {
205 dev_id = argv[9];
206 printf("Setting backend device: \"%s\"\n", dev_id);
207 }
208
209 assert(numReps > 0);
210
211 TransposeBench b(batchSize, n, numLayers, numAsyncLaunches, numCores,
212 backendStr, dtypeStr, dev_id);
213
214 auto times = bench(&b, numReps);
215 printf("_,benchName,_,batchSize,n,numLayers,numReps,numAsyncLaunches,"
216 "numTransposeChains,backendStr,dtypeStr,runtime,gbytesPerSec\n");
217 for (auto t : times) {
218 printf(
219 "BenchResult,TransposeBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%f\n",
220 batchSize, n, numLayers, numReps, numAsyncLaunches, numCores,
221 backendStr, dtypeStr, t / numAsyncLaunches,
222 b.gbytes() * numAsyncLaunches / t);
223 }
224 double min = *(std::min_element(times.begin(), times.end()));
225 size_t midElt = times.size() / 2;
226 std::nth_element(times.begin(), times.begin() + midElt, times.end());
227 double median = times[midElt];
228 double median_runtime = median / ((double)numAsyncLaunches);
229 double min_runtime = min / ((double)numAsyncLaunches);
230 printf("_,benchName,_,batchSize,n,numLayers,numReps,numAsyncLaunches,"
231 "numTransposeChains,backendStr,dtypeStr,medianRuntime,minRuntime,"
232 "medianGbytesPerSec,maxGbytesPerSec\n");
233 printf(
234 "BenchSummary,TransposeBench,SW,%zu,%zu,%zu,%zu,%zu,%zu,%s,%s,%f,%f,%f,%"
235 "f\n",
236 batchSize, n, numLayers, numReps, numAsyncLaunches, numCores, backendStr,
237 dtypeStr, median_runtime, min_runtime, b.gbytes() / median_runtime,
238 b.gbytes() / min_runtime);
239}
240