1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | #include "plan_generator.h" |
20 | |
21 | #include <tvm/runtime/container/array.h> |
22 | #include <tvm/runtime/container/map.h> |
23 | #include <tvm/runtime/object.h> |
24 | #include <tvm/runtime/registry.h> |
25 | #include <tvm/support/parallel_for.h> |
26 | |
27 | #include <algorithm> |
28 | #include <cmath> |
29 | #include <map> |
30 | #include <set> |
31 | #include <unordered_map> |
32 | #include <unordered_set> |
33 | #include <utility> |
34 | #include <vector> |
35 | |
36 | #include "block_config.h" |
37 | #include "cascader_options.h" |
38 | #include "common.h" |
39 | #include "graph.h" |
40 | #include "pareto.h" |
41 | #include "plan.h" |
42 | #include "stripe_config.h" |
43 | #include "tensor_config.h" |
44 | |
45 | namespace tvm { |
46 | namespace contrib { |
47 | namespace ethosu { |
48 | namespace cascader { |
49 | |
/*!
 * \brief Enumerate the Cartesian product of a list of value lists.
 *
 * Given N lists of values, returns every combination that picks exactly one
 * value from each list. The ordering matches the previous recursive
 * implementation: earlier lists vary fastest, the last list varies slowest.
 *
 * \param values The per-axis lists of candidate values.
 * \return All combinations; empty if \p values is empty or any list is empty.
 *
 * \note Implemented iteratively: the recursive version copied the (k-1)-长
 *       prefix of \p values at every level, giving O(k^2) vector copies.
 */
template <class T>
std::vector<std::vector<T>> EnumerateCombinations(std::vector<std::vector<T>> values) {
  if (values.empty()) {
    return values;
  }
  // Seed with single-element combinations drawn from the first list
  std::vector<std::vector<T>> combs;
  combs.reserve(values[0].size());
  for (const auto& value : values[0]) {
    combs.emplace_back(1, value);
  }
  // Extend one axis at a time. Iterating the new axis in the outer loop and
  // the existing combinations in the inner loop reproduces the ordering of
  // the original recursive formulation (later axes vary slowest).
  for (size_t axis = 1; axis < values.size(); axis++) {
    std::vector<std::vector<T>> extended;
    extended.reserve(combs.size() * values[axis].size());
    for (const auto& value : values[axis]) {
      for (const auto& comb : combs) {
        auto new_comb = comb;
        new_comb.push_back(value);
        extended.push_back(std::move(new_comb));
      }
    }
    combs = std::move(extended);
  }
  return combs;
}
73 | |
74 | float GetTransferEfficiency(const Tensor& tensor, const std::vector<int>& block_shape, |
75 | const MemoryRegion& memory) { |
76 | // The block_shape represents the shape of the data transfer required for each job. This is used |
77 | // to calculate how much of the block_shape is contiguous in memory (source memory for a read or |
78 | // destination memory for a write) and subsequently calculate how efficient each memory burst is. |
79 | const auto& shape = tensor->GetShape(); |
80 | int burst_length = block_shape[block_shape.size() - 1]; |
81 | if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) { |
82 | burst_length *= block_shape[block_shape.size() - 2]; |
83 | } |
84 | |
85 | burst_length *= tensor->GetDataType().bytes(); |
86 | return static_cast<float>(memory->burst_length) / std::min(burst_length, memory->burst_length); |
87 | } |
88 | |
89 | std::vector<bool> GetCascadableAxes(const Part& part) { |
90 | std::vector<bool> cascadable_axes(part->GetOutputTensor()->GetShape().size()); |
91 | // Check all the propagators to see if an output axis is projected into any |
92 | // of the inputs. If it is, then that axis is cascadable. |
93 | for (const auto& propagator : part->GetPropagators()) { |
94 | auto transform = propagator->GetTransform(); |
95 | for (size_t i = 0; i < transform.size(); i++) { |
96 | for (size_t j = 0; j < transform[0].size() - 1; j++) { |
97 | // An axis is projected if there's a non-zero element |
98 | // in the transform matrix |
99 | if (transform[i][j] != 0) { |
100 | cascadable_axes[j] = true; |
101 | } |
102 | } |
103 | } |
104 | } |
105 | return cascadable_axes; |
106 | } |
107 | |
/*!
 * \brief Generate the candidate output StripeConfigs with which a Part could be executed.
 *
 * \param part The Part to generate output StripeConfigs for.
 * \param stripe_factors How many striping granularities to consider per axis;
 *        values <= 0 produce no StripeConfigs.
 * \param enable_striping If false, every axis keeps its full extent (no splitting).
 * \param multi_dimensional If true, consider splitting several axes at once
 *        (Cartesian product of per-axis splits); otherwise split one axis at a time.
 * \return All the StripeConfigs to consider for the Part's output Tensor.
 */
std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
                                                      bool enable_striping,
                                                      bool multi_dimensional) {
  // If stripe_factors is <= 0, then we won't produce any StripeConfigs
  if (stripe_factors <= 0) {
    return std::vector<StripeConfig>();
  }
  // Work out the factors to divide by as inverse powers of 2.
  // The last factor is always reserved to be '0' which will correspond to
  // choosing a stripe size of 1 in the dimension. We always include this
  // as it represents the most extreme striping choice that uses the least
  // memory, so it is our choice of last resort.
  // For example, if stripe_factors = 4 then the factors are 1, 1/2, 1/4, 0.
  std::vector<float> factors;
  for (size_t i = 0; i < static_cast<size_t>(stripe_factors) - 1; i++) {
    factors.push_back(1.0f / (std::pow(2.0f, i)));
  }
  factors.push_back(0);
  // Then use the factors to derive the possible ways to split each dimension
  // into stripes. As an example, if an had extent 128 then by applying
  // the factors derived above we get the following possible splits for that axis:
  // 128, 64, 32, 1
  std::vector<std::vector<int>> splits;
  std::vector<int> output_shape = part->GetOutputTensor()->GetShape();
  size_t output_dims = output_shape.size();
  // Only bother striping along the axes which are cascadable
  auto cascadable_axes = GetCascadableAxes(part);
  for (size_t i = 0; i < output_dims; i++) {
    auto axis = output_shape[i];
    auto axis_align = part->GetStripeAlignHint()[i];
    std::set<int> axis_splits;  // Note this is a set to remove duplicate splits
    if (!cascadable_axes[i] || (!enable_striping)) {
      // Non-cascadable (or striping disabled): the only 'split' is the full extent
      axis_splits.insert(axis);
    } else {
      for (float factor : factors) {
        // Round the scaled extent up to a multiple of the alignment hint,
        // with a minimum of one aligned unit, then clamp to the axis extent
        int split =
            std::max(static_cast<int>(std::ceil(axis * factor / axis_align)), 1) * axis_align;
        split = std::min(axis, split);
        axis_splits.insert(split);
      }
    }
    splits.push_back(std::vector<int>(axis_splits.begin(), axis_splits.end()));
  }

  std::vector<std::vector<int>> stripe_shapes;
  if (multi_dimensional) {
    // Now calculate all the possible combinations of splits for each dimension
    // to give us all the possible stripe shapes. For example, if we had two axes
    // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
    // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
    stripe_shapes = EnumerateCombinations<int>(splits);
  } else {
    // Only consider splitting a single axis
    int axis = 0;
    for (const auto& split : splits) {
      for (const auto& axis_split : split) {
        std::vector<int> stripe_shape = output_shape;
        // Skip the unsplit case here; the full output_shape is added once below
        if (stripe_shape[axis] != axis_split) {
          stripe_shape[axis] = axis_split;
          stripe_shapes.push_back(stripe_shape);
        }
      }
      axis++;
    }
    stripe_shapes.push_back(output_shape);
  }
  // All candidate StripeConfigs start at a zero offset in every dimension
  auto offset = std::vector<int>(output_dims);
  std::vector<StripeConfig> stripe_configs;
  // Calculate the possible axis orderings such that each axis has the opportunity
  // to be the 'outermost' axis (which is axis that is chosen for rolling).
  // Order values are 1-based rotations: a value of 1 marks the outermost axis.
  std::vector<std::vector<int>> orders;
  for (size_t i = 0; i < output_dims; i++) {
    std::vector<int> order(output_dims);
    for (size_t j = 0; j < output_dims; j++) {
      order[j] = 1 + (j + i) % output_dims;
    }
    orders.push_back(order);
  }
  // Finally, create the StripeConfigs from the possible stripe shapes and orders
  for (const auto& stripe_shape : stripe_shapes) {
    std::vector<int> stripes;
    std::vector<float> strides;
    for (size_t i = 0; i < output_dims; i++) {
      // Number of stripes needed to cover the axis, rounding up the partial last stripe
      stripes.push_back(std::ceil(static_cast<float>(output_shape[i]) / stripe_shape[i]));
      strides.push_back(static_cast<float>(stripe_shape[i]));  // strides = stripe_shape
    }
    // If the stripe shape equals the output shape (i.e. there's no striping), then
    // the order doesn't matter, so just pick the first order and continue.
    if (stripe_shape == output_shape) {
      stripe_configs.push_back(
          StripeConfig(stripe_shape, output_shape, strides, orders[0], stripes, offset));
      continue;
    }
    for (const auto& order : orders) {
      // Some logic to avoid having an axis be the 'outermost' if the stripe is full
      // size in that axis. This would otherwise be a waste because we can't roll
      // over an axis that hasn't been split.
      bool skip = false;
      for (size_t i = 0; i < output_dims; i++) {
        if (order[i] == 1 && stripe_shape[i] == output_shape[i]) {
          skip = true;
          break;
        }
      }
      if (skip) continue;
      stripe_configs.push_back(
          StripeConfig(stripe_shape, output_shape, strides, order, stripes, offset));
    }
  }
  return stripe_configs;
}
219 | |
220 | std::vector<TensorConfig> GetPossibleInputConfigs(const StripeConfig& stripe_config, |
221 | const Tensor& tensor, |
222 | const std::vector<MemoryRegion>& home_regions, |
223 | const CascaderOptions& options) { |
224 | std::vector<TensorConfig> configs; |
225 | for (const auto& home_region : home_regions) { |
226 | // Boundary configs |
227 | if (home_region == options->cascade_region || tensor->GetSize() > options->always_copy_size) { |
228 | configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, |
229 | BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); |
230 | } |
231 | if (home_region != options->cascade_region) { |
232 | configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, |
233 | BufferMode::ROLLING, {stripe_config}, true, |
234 | options->cascade_region)); |
235 | } |
236 | } |
237 | if (!tensor->IsConstant()) { |
238 | // Interior configs |
239 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
240 | BufferMode::RECOMPUTE, {stripe_config}, false, |
241 | options->cascade_region)); |
242 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
243 | BufferMode::ROLLING, {stripe_config}, false, |
244 | options->cascade_region)); |
245 | } |
246 | return configs; |
247 | } |
248 | |
249 | // Check whether a StripeConfig can be an output boundary config |
250 | bool CanBound(const StripeConfig& stripe_config) { |
251 | // Determine whether the StripeConfig results in non-overlapping stripes |
252 | // which is the case when the stripe shape equals the strides |
253 | for (size_t i = 0; i < stripe_config->GetShape().size(); i++) { |
254 | // Check that the stripe shape and strides are equal |
255 | if (stripe_config->GetShape()[i] - stripe_config->GetStrides()[i] != 0) { |
256 | return false; |
257 | } |
258 | } |
259 | return true; |
260 | } |
261 | |
262 | std::vector<TensorConfig> GetPossibleOutputConfigs(const StripeConfig& stripe_config, |
263 | const Tensor& tensor, |
264 | const std::vector<MemoryRegion>& home_regions, |
265 | const CascaderOptions& options) { |
266 | std::vector<TensorConfig> configs; |
267 | // Only StripeConfigs with non-overlapping stripes can be output boundary configs |
268 | if (CanBound(stripe_config)) { |
269 | for (const auto& home_region : home_regions) { |
270 | // Boundary configs |
271 | configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, |
272 | BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); |
273 | } |
274 | } |
275 | // Interior configs |
276 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
277 | BufferMode::RECOMPUTE, {stripe_config}, false, |
278 | options->cascade_region)); |
279 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
280 | BufferMode::ROLLING, {stripe_config}, false, |
281 | options->cascade_region)); |
282 | return configs; |
283 | } |
284 | |
285 | int GetInteriorMemoryUsage(const std::vector<TensorConfig>& input_configs, |
286 | const TensorConfig& output_config, const MemoryRegion& interior_region) { |
287 | int memory_usage = 0; |
288 | if (output_config->GetHomeRegion() == interior_region && |
289 | output_config->GetState() == TensorConfigState::BOUNDARY) { |
290 | memory_usage += output_config->GetTensor()->GetSize(); |
291 | } |
292 | for (const auto& input_config : input_configs) { |
293 | if (input_config->GetHomeRegion() == interior_region && |
294 | input_config->GetState() == TensorConfigState::BOUNDARY) { |
295 | memory_usage += input_config->GetTensor()->GetSize(); |
296 | } else if (input_config->GetHomeRegion() == interior_region || |
297 | input_config->GetCopyRegion() == interior_region) { |
298 | memory_usage += input_config->GetBufferSize(); |
299 | } |
300 | } |
301 | return memory_usage; |
302 | } |
303 | |
/**
 * \brief Returns a hint estimating the number of cycles required for
 * the copy specified by tensor_config.
 *
 * \param tensor_config The tensor configuration to estimate.
 * \return mem2mem_cycles Total estimated cycles.
 * \return initial_mem2mem_cycles Estimated cycles for the first block.
 */
std::pair<int, int> GetCopyCyclesHint(const TensorConfig& tensor_config) {
  Tensor tensor = tensor_config->GetTensor();
  MemoryRegion home_region = tensor_config->GetHomeRegion();
  MemoryRegion copy_region = tensor_config->GetCopyRegion();
  int initial_mem2mem_cycles = 0;
  int mem2mem_cycles = 0;

  // This Tensor needs to be copied - Count stripes for this config
  for (const auto& stripe_config : tensor_config->GetStripeConfigs()) {
    // Map from a transfer block shape to the number of transfers of that shape
    std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true);
    bool first_block = true;
    for (const auto& block : input_blocks) {
      // Total bytes moved for this block shape, scaled by the compression ratio
      int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() *
                              tensor->GetCompressionRatio() * block.second;
      // NOTE(review): bytes are *multiplied* by read/write bandwidth here, whereas
      // the bandwidth-cycle estimates in GenerateSinglePlans *divide* bytes by
      // bandwidth - confirm the intended units of read_bandwidth/write_bandwidth
      // for this hint.
      int read_cycles = bytes_transferred * home_region->read_bandwidth + home_region->read_latency;
      int write_cycles = bytes_transferred * copy_region->write_bandwidth;

      if (first_block) {
        // Only the first transferred block contributes to the initial estimate
        first_block = false;
        initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
      }
      // Each transfer is bound by the slower of its read and write sides
      mem2mem_cycles += std::max(read_cycles, write_cycles);
    }
  }

  return {mem2mem_cycles, initial_mem2mem_cycles};
}
339 | |
/*!
 * \brief Generate all the Plans that cover a single Part.
 *
 * For each candidate output StripeConfig, this enumerates every combination of
 * input TensorConfigs with every output TensorConfig, estimating cycles and
 * interior memory usage for each resulting Plan.
 *
 * \param part The Part to generate Plans for.
 * \param output_stripe_configs The candidate output StripeConfigs for the Part.
 * \param home_map Map from each Tensor to the MemoryRegions it may be homed in.
 * \param options The cascader options (cascade region, size limits, etc.).
 * \return All the single-Part Plans.
 */
std::vector<Plan> GenerateSinglePlans(
    const Part& part, const std::vector<StripeConfig>& output_stripe_configs,
    const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
        home_map,
    const CascaderOptions& options) {
  std::vector<Plan> plans;
  // Every Plan produced here covers exactly this one Part
  std::vector<Part> part_group{part};
  // Create a selection of Plans per output_stripe_config
  for (const auto& output_stripe_config : output_stripe_configs) {
    // Calculate the input_stripe_configs
    auto input_stripe_configs = part->CalculateInputStripeConfigs(output_stripe_config);
    // From the input_stripe_configs, now derive all the possible input TensorConfigs
    std::vector<std::vector<TensorConfig>> all_possible_input_configs;
    size_t i = 0;
    for (const auto& stripe_config : input_stripe_configs) {
      Tensor tensor = part->GetInputTensors()[i];
      all_possible_input_configs.push_back(
          GetPossibleInputConfigs(stripe_config, tensor, home_map.at(tensor), options));
      i++;
    }
    // Now work out all the possible combinations of input TensorConfigs
    auto input_config_combinations =
        EnumerateCombinations<TensorConfig>(all_possible_input_configs);
    Tensor output_tensor = part->GetOutputTensor();
    // Then determine the possible output TensorConfigs (no combinations here because there's only
    // one output)
    auto output_configs = GetPossibleOutputConfigs(output_stripe_config, output_tensor,
                                                   home_map.at(output_tensor), options);
    // Calculate the performance information for the output_stripe_config for both the recompute and
    // rolling cases
    PerformanceInfo rolling_perf =
        part->GetPerformanceInfo(output_stripe_config, BufferMode::ROLLING);
    PerformanceInfo recompute_perf =
        part->GetPerformanceInfo(output_stripe_config, BufferMode::RECOMPUTE);
    // For all the possible input TensorConfig combinations
    for (const auto& input_configs : input_config_combinations) {
      std::vector<TensorConfig> tensor_configs;
      std::vector<TensorConfig> open_input_configs;
      // Add the input TensorConfigs to the 'tensor_configs' and
      // record which input TensorConfigs are 'open' (i.e. 'INTERIOR')
      for (const auto& input_config : input_configs) {
        tensor_configs.push_back(input_config);
        if (input_config->GetState() == TensorConfigState::INTERIOR) {
          open_input_configs.push_back(input_config);
        }
      }
      for (const auto& output_config : output_configs) {
        // Add the output TensorConfig to the tensor_configs and to
        // the open configs (if it's 'INTERIOR')
        tensor_configs.push_back(output_config);
        std::vector<TensorConfig> open_configs = open_input_configs;
        if (output_config->GetState() == TensorConfigState::INTERIOR) {
          open_configs.push_back(output_config);
        }
        // NOTE(review): these accumulators are ints, so fractional cycle
        // estimates below are truncated on each addition - confirm intended.
        int bandwidth_cycles = 0;
        int compute_cycles = 0;
        int mem2mem_cycles = 0;
        int initial_mem2mem_cycles = 0;

        // Pick the correct performance info based on the BufferMode
        PerformanceInfo perf_info;
        if (output_config->GetBufferMode() == BufferMode::RECOMPUTE) {
          perf_info = recompute_perf;
        } else {
          perf_info = rolling_perf;
        }
        // Calculate the bandwidth cycles by multiplying the bytes read/written by the
        // bandwidth of the memories
        BlockConfig block_config = perf_info->block_config;
        for (size_t i = 0; i < input_configs.size(); i++) {
          Tensor tensor = input_configs[i]->GetTensor();
          MemoryRegion copy_region = input_configs[i]->GetCopyRegion();

          // Inputs that need a mem-to-mem copy contribute extra transfer cycles
          if (input_configs[i]->DoCopy()) {
            std::pair<int, int> ret = GetCopyCyclesHint(input_configs[i]);
            mem2mem_cycles += ret.first;
            initial_mem2mem_cycles += ret.second;
          }
          // Scale read cycles by how efficiently the block shape bursts from memory
          float read_efficiency =
              GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region);
          bandwidth_cycles +=
              (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency;
        }
        MemoryRegion write_region = output_config->GetCopyRegion();
        float write_efficiency = GetTransferEfficiency(
            output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region);

        bandwidth_cycles +=
            perf_info->write_bytes / write_region->write_bandwidth * write_efficiency;
        compute_cycles = perf_info->compute_cycles;
        // Take the max of compute and bandwidth cycles as we assume compute cycles
        // can hide memory latency
        int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles);
        if (cycles > mem2mem_cycles) {
          // NPU cycles are the bottleneck - add initial mem2mem transfer cycles
          cycles += initial_mem2mem_cycles;
        }

        int memory_usage =
            GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region);
        plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group,
                             options->cascade_region, memory_usage, cycles));
      }
    }
  }
  return plans;
}
447 | |
/*!
 * \brief Generate the Pareto-optimal closed Plans for every coverable Part group in a graph.
 *
 * Traverses the graph in reverse topological order, generating single-Part
 * Plans and merging them into open Plans where Parts connect, Pareto-culling
 * as it goes.
 *
 * \param graph The CascaderGraph to generate Plans for.
 * \param home_map Map from each Tensor to the MemoryRegions it may be homed in.
 * \param options The cascader options (stripe factors, plan size limits, etc.).
 * \return Map from each covered Part group to its closed Plans.
 */
std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
    const CascaderGraph& graph,
    const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
        home_map,
    const CascaderOptions& options) {
  // Sanity-check the options that directly control the search space
  ICHECK_GT(options->stripe_factors, 0)
      << "stripe_factors = " << options->stripe_factors << ", but must be > 0";
  ICHECK_GT(options->max_plan_size, 0)
      << "max_plan_size = " << options->max_plan_size << ", but must be > 0";
  // Define a map between the graph Tensors and possible StripeConfigs that the Tensor may be
  // executed with
  std::unordered_map<Tensor, std::set<StripeConfig>, ObjectPtrHash, ObjectPtrEqual>
      stripe_configs_by_tensor;
  // Define a map between a given open TensorConfig and all the Plans which provide it
  std::unordered_map<TensorConfig, std::vector<Plan>> plans_by_config;
  // Define a map between a group of connected Parts and all the closed plans covering them
  std::unordered_map<std::vector<Part>, std::vector<Plan>> closed_plans;
  // Define a nested map which indexes open plans by both Part group and the open TensorConfigs they
  // provide. Note that we index in this way because Part group + open TensorConfigs combined
  // defines a group of Plans which can be mutually Pareto culled. If we culled by Part group alone,
  // we'd lose potentially valuable open Plans which could have gone on to be grown into Pareto
  // optimal closed plans.
  std::unordered_map<std::vector<Part>,
                     std::unordered_map<std::vector<TensorConfig>, std::vector<Plan>>>
      open_plans;
  // Traverse the graph in a reverse topological order (should be enforced by GetPartOrder)
  for (const auto& part : graph->GetPartOrder()) {
    // First generate all the possible StripeConfigs for the Part assuming that it will become the
    // output of a Plan. The number generated is a function of stripe_factors and the number of
    // cascadable dimensions in the Part.
    std::vector<StripeConfig> stripe_configs =
        GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping,
                                    options->enable_multi_dimensional_striping);
    // Check to see if the output Tensor is part of any existing open Plans
    if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) {
      // If there are other open Plans which have this Part's output Tensor as an input, then
      // additionally consider the StripeConfigs of those open TensorConfigs so that we have the
      // option to merge into those open Plans.
      const std::set<StripeConfig>& connecting_configs =
          stripe_configs_by_tensor.at(part->GetOutputTensor());
      std::copy(connecting_configs.begin(), connecting_configs.end(),
                std::back_inserter(stripe_configs));
    }
    // Generate all the single Part Plans for the previously determined StripeConfigs
    auto single_part_plans = GenerateSinglePlans(part, stripe_configs, home_map, options);
    std::vector<Plan> plans;
    for (const auto& partial_plan : single_part_plans) {
      // If the output TensorConfig of the Plan is 'INTERIOR', then it must be merged with
      // another open Plan
      if (partial_plan->GetOutputConfig()->GetState() == TensorConfigState::INTERIOR) {
        // Merging is only valid when the output Tensor has exactly one consumer
        if (plans_by_config.find(partial_plan->GetOutputConfig()) != plans_by_config.end() &&
            partial_plan->GetOutputConfig()->GetTensor()->GetConsumers().size() == 1) {
          // Search for all the open Plans which require the same TensorConfig
          const auto& join_plans = plans_by_config.at(partial_plan->GetOutputConfig());
          for (const auto& join_plan : join_plans) {
            // Only merge to form a new Plan if the resulting Plan size won't exceed the
            // max_plan_size
            if (join_plan->GetPartGroup().size() < static_cast<size_t>(options->max_plan_size)) {
              // Also reject merges whose combined memory usage can't fit the cascade region
              if (partial_plan->GetMemoryUsage() + join_plan->GetMemoryUsage() <
                  options->cascade_region->size) {
                plans.push_back(partial_plan.Merge(join_plan));
              }
            }
          }
        }
      } else {
        // If the single Part Plan had a 'BOUNDARY' output TensorConfig, then it doesn't need
        // merging and can stand on its own.
        plans.push_back(partial_plan);
      }
    }
    // For all the newly created Plans, update the various maps
    std::unordered_set<std::vector<Part>> new_part_groups;
    for (const auto& plan : plans) {
      new_part_groups.insert(plan->GetPartGroup());
      if (plan->IsClosed()) {
        closed_plans[plan->GetPartGroup()].push_back(plan);
      } else {
        open_plans[plan->GetPartGroup()][plan->GetOpenConfigs()].push_back(plan);
      }
    }
    // Now Pareto cull both the open and closed Plans to remove non-optimal Plans
    // Additionally, once culled we update another two maps, the stripe_configs_by_tensor
    // and plans_by_config maps.
    for (const auto& part_group : new_part_groups) {
      if (closed_plans.find(part_group) != closed_plans.end()) {
        closed_plans[part_group] = ParetoCullPlans(
            closed_plans.at(part_group), options->max_closed_plans, options->disable_pareto_plans);
      }
      for (const auto& it : open_plans[part_group]) {
        auto pareto_plans =
            ParetoCullPlans(it.second, options->max_open_plans, options->disable_pareto_plans);
        for (const auto& plan : pareto_plans) {
          for (const auto& open_config : plan->GetOpenConfigs()) {
            // The Plan's own output config is provided by this Plan, not required by it,
            // so only the input-side open configs are indexed for future merging
            if (open_config != plan->GetOutputConfig()) {
              for (const auto& stripe_config : open_config->GetStripeConfigs()) {
                // Only add a StripeConfig if it contains more than one stripe
                if (mul_reduce(stripe_config->GetStripes()) > 1) {
                  stripe_configs_by_tensor[open_config->GetTensor()].insert(stripe_config);
                }
              }
              plans_by_config[open_config].push_back(plan);
            }
          }
        }
      }
    }
  }
  return closed_plans;
}
558 | |
559 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs" ) |
560 | .set_body_typed([](Part part, int stripe_factors, bool enable_striping, |
561 | bool multi_dimensional) { |
562 | if (stripe_factors < 0) { |
563 | return Array<StripeConfig>(); |
564 | } |
565 | return Array<StripeConfig>( |
566 | GenerateOutputStripeConfigs(part, stripe_factors, enable_striping, multi_dimensional)); |
567 | }); |
568 | |
569 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans" ) |
570 | .set_body_typed([](Part part, Array<StripeConfig> output_stripe_configs, |
571 | Map<Tensor, Array<MemoryRegion>> home_map, CascaderOptions options) { |
572 | std::vector<StripeConfig> voutput_stripe_configs(output_stripe_configs.begin(), |
573 | output_stripe_configs.end()); |
574 | std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual> |
575 | mhome_map; |
576 | for (const auto& it : home_map) { |
577 | std::vector<MemoryRegion> home_regions; |
578 | for (const auto& i : it.second) { |
579 | home_regions.push_back(i); |
580 | } |
581 | mhome_map[it.first] = home_regions; |
582 | } |
583 | return Array<Plan>(GenerateSinglePlans(part, voutput_stripe_configs, mhome_map, options)); |
584 | }); |
585 | |
586 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans" ) |
587 | .set_body_typed([](CascaderGraph graph, Map<Tensor, Array<MemoryRegion>> home_map, |
588 | CascaderOptions options) { |
589 | std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual> |
590 | mhome_map; |
591 | for (const auto& it : home_map) { |
592 | std::vector<MemoryRegion> home_regions; |
593 | for (const auto& i : it.second) { |
594 | home_regions.push_back(i); |
595 | } |
596 | mhome_map[it.first] = home_regions; |
597 | } |
598 | auto closed_plans = GenerateGraphPlans(graph, mhome_map, options); |
599 | Map<Array<Part>, Array<Plan>> tclosed_plans; |
600 | for (auto& it : closed_plans) { |
601 | Array<Part> part_arr(it.first.begin(), it.first.end()); |
602 | Array<Plan> plan_arr(it.second); |
603 | tclosed_plans.Set(part_arr, plan_arr); |
604 | } |
605 | return tclosed_plans; |
606 | }); |
607 | |
608 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetCopyCyclesHint" ) |
609 | .set_body_typed([](TensorConfig tensor_config) { |
610 | std::pair<int, int> ret = GetCopyCyclesHint(tensor_config); |
611 | return Array<Integer>({ret.first, ret.second}); |
612 | }); |
613 | |
614 | } // namespace cascader |
615 | } // namespace ethosu |
616 | } // namespace contrib |
617 | } // namespace tvm |
618 | |