/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied. See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
#include "plan_generator.h"

#include <tvm/runtime/container/array.h>
#include <tvm/runtime/container/map.h>
#include <tvm/runtime/object.h>
#include <tvm/runtime/registry.h>
#include <tvm/support/parallel_for.h>

#include <algorithm>
#include <cmath>
#include <map>
#include <set>
#include <unordered_map>
#include <unordered_set>
#include <utility>
#include <vector>

#include "block_config.h"
#include "cascader_options.h"
#include "common.h"
#include "graph.h"
#include "pareto.h"
#include "plan.h"
#include "stripe_config.h"
#include "tensor_config.h"

namespace tvm {
namespace contrib {
namespace ethosu {
namespace cascader {
template <class T>
std::vector<std::vector<T>> EnumerateCombinations(std::vector<std::vector<T>> values) {
  if (values.size() == 0) {
    return values;
  }
  if (values.size() == 1) {
    std::vector<std::vector<T>> combs;
    for (const auto& value : values[0]) {
      combs.push_back(std::vector<T>(1, value));
    }
    return combs;
  }
  auto combs = EnumerateCombinations(std::vector<std::vector<T>>(values.begin(), values.end() - 1));
  std::vector<std::vector<T>> new_combs;
  for (const auto& value : values.back()) {
    for (const auto& comb : combs) {
      auto new_comb = std::vector<T>(comb);
      new_comb.push_back(value);
      new_combs.push_back(new_comb);
    }
  }
  return new_combs;
}

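/**
 * \brief Calculate a multiplier that penalizes transfers which under-utilize the
 * memory's burst length.
 *
 * Worked example (with hypothetical values): for an int8 tensor, a memory with a
 * burst_length of 16 bytes and a block whose innermost dimension is 8 elements
 * (and narrower than the tensor's innermost dimension), the contiguous burst is
 * 8 bytes, so the returned factor is 16 / 8 = 2.0, i.e. transfers are costed at
 * twice the ideal bandwidth. A factor of 1.0 means the bursts are fully utilized.
 *
 * \param tensor The tensor being transferred.
 * \param block_shape The shape of the data transferred per job.
 * \param memory The memory region being read from or written to.
 * \return The transfer efficiency multiplier.
 */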
float GetTransferEfficiency(const Tensor& tensor, const std::vector<int>& block_shape,
                            const MemoryRegion& memory) {
  // The block_shape represents the shape of the data transfer required for each job. This is used
  // to calculate how much of the block_shape is contiguous in memory (source memory for a read or
  // destination memory for a write) and subsequently calculate how efficient each memory burst is.
  const auto& shape = tensor->GetShape();
  int burst_length = block_shape[block_shape.size() - 1];
  if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) {
    burst_length *= block_shape[block_shape.size() - 2];
  }

  burst_length *= tensor->GetDataType().bytes();
  return static_cast<float>(memory->burst_length) / std::min(burst_length, memory->burst_length);
}

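/**
 * \brief Determine which output axes of a Part can be striped along.
 *
 * An output axis is cascadable if any propagator projects it into an input,
 * i.e. the corresponding column of a transform matrix has a non-zero element.
 *
 * \param part The Part to analyse.
 * \return One boolean per output axis, true if the axis is cascadable.
 */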
std::vector<bool> GetCascadableAxes(const Part& part) {
  std::vector<bool> cascadable_axes(part->GetOutputTensor()->GetShape().size());
  // Check all the propagators to see if an output axis is projected into any
  // of the inputs. If it is, then that axis is cascadable.
  for (const auto& propagator : part->GetPropagators()) {
    auto transform = propagator->GetTransform();
    for (size_t i = 0; i < transform.size(); i++) {
      for (size_t j = 0; j < transform[0].size() - 1; j++) {
        // An axis is projected if there's a non-zero element
        // in the transform matrix
        if (transform[i][j] != 0) {
          cascadable_axes[j] = true;
        }
      }
    }
  }
  return cascadable_axes;
}

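/**
 * \brief Generate the candidate StripeConfigs with which a Part's output can be
 * executed.
 *
 * \param part The Part to generate output StripeConfigs for.
 * \param stripe_factors The number of striping factors to try per axis.
 * \param enable_striping Whether striping is permitted at all.
 * \param multi_dimensional Whether stripes may split multiple axes at once,
 * rather than one axis at a time.
 * \return The candidate output StripeConfigs.
 */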
std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
                                                      bool enable_striping,
                                                      bool multi_dimensional) {
  // If stripe_factors is <= 0, then we won't produce any StripeConfigs
  if (stripe_factors <= 0) {
    return std::vector<StripeConfig>();
  }
  // Work out the factors to divide by as inverse powers of 2.
  // The last factor is always reserved to be '0' which will correspond to
  // choosing a stripe size of 1 in the dimension. We always include this
  // as it represents the most extreme striping choice that uses the least
  // memory, so it is our choice of last resort.
  // For example, if stripe_factors = 4 then the factors are 1, 1/2, 1/4, 0.
  std::vector<float> factors;
  for (size_t i = 0; i < static_cast<size_t>(stripe_factors) - 1; i++) {
    factors.push_back(1.0f / (std::pow(2.0f, i)));
  }
  factors.push_back(0);
  // Then use the factors to derive the possible ways to split each dimension
  // into stripes. As an example, if an axis had an extent of 128, then applying
  // the factors derived above gives the following possible splits for that axis:
  // 128, 64, 32, 1
  std::vector<std::vector<int>> splits;
  std::vector<int> output_shape = part->GetOutputTensor()->GetShape();
  size_t output_dims = output_shape.size();
  // Only bother striping along the axes which are cascadable
  auto cascadable_axes = GetCascadableAxes(part);
  for (size_t i = 0; i < output_dims; i++) {
    auto axis = output_shape[i];
    auto axis_align = part->GetStripeAlignHint()[i];
    std::set<int> axis_splits;  // Note this is a set to remove duplicate splits
    if (!cascadable_axes[i] || (!enable_striping)) {
      axis_splits.insert(axis);
    } else {
      for (float factor : factors) {
        int split =
            std::max(static_cast<int>(std::ceil(axis * factor / axis_align)), 1) * axis_align;
        split = std::min(axis, split);
        axis_splits.insert(split);
      }
    }
    splits.push_back(std::vector<int>(axis_splits.begin(), axis_splits.end()));
  }

  std::vector<std::vector<int>> stripe_shapes;
  if (multi_dimensional) {
    // Now calculate all the possible combinations of splits for each dimension
    // to give us all the possible stripe shapes. For example, if we had two axes
    // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
    // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
    stripe_shapes = EnumerateCombinations<int>(splits);
  } else {
    // Only consider splitting a single axis
    int axis = 0;
    for (const auto& split : splits) {
      for (const auto& axis_split : split) {
        std::vector<int> stripe_shape = output_shape;
        if (stripe_shape[axis] != axis_split) {
          stripe_shape[axis] = axis_split;
          stripe_shapes.push_back(stripe_shape);
        }
      }
      axis++;
    }
    stripe_shapes.push_back(output_shape);
  }
  auto offset = std::vector<int>(output_dims);
  std::vector<StripeConfig> stripe_configs;
  // Calculate the possible axis orderings such that each axis has the opportunity
  // to be the 'outermost' axis (the axis that is chosen for rolling).
  std::vector<std::vector<int>> orders;
  for (size_t i = 0; i < output_dims; i++) {
    std::vector<int> order(output_dims);
    for (size_t j = 0; j < output_dims; j++) {
      order[j] = 1 + (j + i) % output_dims;
    }
    orders.push_back(order);
  }
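  // For example, with output_dims = 3 the generated orders are (1, 2, 3),
  // (2, 3, 1) and (3, 1, 2), where '1' marks the outermost axis.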
  // Finally, create the StripeConfigs from the possible stripe shapes and orders
  for (const auto& stripe_shape : stripe_shapes) {
    std::vector<int> stripes;
    std::vector<float> strides;
    for (size_t i = 0; i < output_dims; i++) {
      stripes.push_back(std::ceil(static_cast<float>(output_shape[i]) / stripe_shape[i]));
      strides.push_back(static_cast<float>(stripe_shape[i]));  // strides = stripe_shape
    }
    // If the stripe shape equals the output shape (i.e. there's no striping), then
    // the order doesn't matter, so just pick the first order and continue.
    if (stripe_shape == output_shape) {
      stripe_configs.push_back(
          StripeConfig(stripe_shape, output_shape, strides, orders[0], stripes, offset));
      continue;
    }
    for (const auto& order : orders) {
      // Some logic to avoid having an axis be the 'outermost' if the stripe is full
      // size in that axis. This would otherwise be a waste because we can't roll
      // over an axis that hasn't been split.
      bool skip = false;
      for (size_t i = 0; i < output_dims; i++) {
        if (order[i] == 1 && stripe_shape[i] == output_shape[i]) {
          skip = true;
          break;
        }
      }
      if (skip) continue;
      stripe_configs.push_back(
          StripeConfig(stripe_shape, output_shape, strides, order, stripes, offset));
    }
  }
  return stripe_configs;
}

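/**
 * \brief Generate the possible TensorConfigs with which an input tensor can be
 * provided to a Part: boundary configs where the tensor resides whole in a home
 * region (optionally copied into the cascade region as a rolling buffer), and,
 * for non-constant tensors, interior configs where it is produced inside the
 * cascade region.
 *
 * \param stripe_config The StripeConfig with which the tensor will be consumed.
 * \param tensor The input tensor.
 * \param home_regions The memory regions in which the tensor may reside.
 * \param options The configuration options of the cascader.
 * \return The possible input TensorConfigs.
 */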
std::vector<TensorConfig> GetPossibleInputConfigs(const StripeConfig& stripe_config,
                                                  const Tensor& tensor,
                                                  const std::vector<MemoryRegion>& home_regions,
                                                  const CascaderOptions& options) {
  std::vector<TensorConfig> configs;
  for (const auto& home_region : home_regions) {
    // Boundary configs
    if (home_region == options->cascade_region || tensor->GetSize() > options->always_copy_size) {
      configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY,
                                     BufferMode::RECOMPUTE, {stripe_config}, false, home_region));
    }
    if (home_region != options->cascade_region) {
      configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY,
                                     BufferMode::ROLLING, {stripe_config}, true,
                                     options->cascade_region));
    }
  }
  if (!tensor->IsConstant()) {
    // Interior configs
    configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR,
                                   BufferMode::RECOMPUTE, {stripe_config}, false,
                                   options->cascade_region));
    configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR,
                                   BufferMode::ROLLING, {stripe_config}, false,
                                   options->cascade_region));
  }
  return configs;
}

// Check whether a StripeConfig can be an output boundary config
bool CanBound(const StripeConfig& stripe_config) {
  // Determine whether the StripeConfig results in non-overlapping stripes
  // which is the case when the stripe shape equals the strides
  for (size_t i = 0; i < stripe_config->GetShape().size(); i++) {
    // Check that the stripe shape and strides are equal
    if (stripe_config->GetShape()[i] - stripe_config->GetStrides()[i] != 0) {
      return false;
    }
  }
  return true;
}

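/**
 * \brief Generate the possible TensorConfigs with which a Part's output tensor
 * can be produced. Boundary configs are only created when the stripes don't
 * overlap (see CanBound); the two interior configs (recompute and rolling) are
 * always candidates.
 *
 * \param stripe_config The StripeConfig with which the tensor will be produced.
 * \param tensor The output tensor.
 * \param home_regions The memory regions in which the tensor may reside.
 * \param options The configuration options of the cascader.
 * \return The possible output TensorConfigs.
 */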
std::vector<TensorConfig> GetPossibleOutputConfigs(const StripeConfig& stripe_config,
                                                   const Tensor& tensor,
                                                   const std::vector<MemoryRegion>& home_regions,
                                                   const CascaderOptions& options) {
  std::vector<TensorConfig> configs;
  // Only StripeConfigs with non-overlapping stripes can be output boundary configs
  if (CanBound(stripe_config)) {
    for (const auto& home_region : home_regions) {
      // Boundary configs
      configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY,
                                     BufferMode::RECOMPUTE, {stripe_config}, false, home_region));
    }
  }
  // Interior configs
  configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR,
                                 BufferMode::RECOMPUTE, {stripe_config}, false,
                                 options->cascade_region));
  configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR,
                                 BufferMode::ROLLING, {stripe_config}, false,
                                 options->cascade_region));
  return configs;
}

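/**
 * \brief Calculate the number of bytes a candidate Plan occupies in the interior
 * memory region: the full size of boundary tensors homed there, plus the buffer
 * sizes of input configs homed in or copied into the region.
 *
 * \param input_configs The input TensorConfigs of the candidate Plan.
 * \param output_config The output TensorConfig of the candidate Plan.
 * \param interior_region The interior memory region of the cascade.
 * \return The interior memory usage in bytes.
 */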
int GetInteriorMemoryUsage(const std::vector<TensorConfig>& input_configs,
                           const TensorConfig& output_config, const MemoryRegion& interior_region) {
  int memory_usage = 0;
  if (output_config->GetHomeRegion() == interior_region &&
      output_config->GetState() == TensorConfigState::BOUNDARY) {
    memory_usage += output_config->GetTensor()->GetSize();
  }
  for (const auto& input_config : input_configs) {
    if (input_config->GetHomeRegion() == interior_region &&
        input_config->GetState() == TensorConfigState::BOUNDARY) {
      memory_usage += input_config->GetTensor()->GetSize();
    } else if (input_config->GetHomeRegion() == interior_region ||
               input_config->GetCopyRegion() == interior_region) {
      memory_usage += input_config->GetBufferSize();
    }
  }
  return memory_usage;
}

/**
 * \brief Returns a hint estimating the number of cycles required for
 * the copy specified by tensor_config.
 *
 * \param tensor_config The tensor configuration to estimate.
 * \return mem2mem_cycles Total estimated cycles.
 * \return initial_mem2mem_cycles Estimated cycles for the first block.
 */
std::pair<int, int> GetCopyCyclesHint(const TensorConfig& tensor_config) {
  Tensor tensor = tensor_config->GetTensor();
  MemoryRegion home_region = tensor_config->GetHomeRegion();
  MemoryRegion copy_region = tensor_config->GetCopyRegion();
  int initial_mem2mem_cycles = 0;
  int mem2mem_cycles = 0;

  // This Tensor needs to be copied - Count stripes for this config
  for (const auto& stripe_config : tensor_config->GetStripeConfigs()) {
    std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true);
    bool first_block = true;
    for (const auto& block : input_blocks) {
      int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() *
                              tensor->GetCompressionRatio() * block.second;
      int read_cycles = bytes_transferred * home_region->read_bandwidth + home_region->read_latency;
      int write_cycles = bytes_transferred * copy_region->write_bandwidth;

      if (first_block) {
        first_block = false;
        initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
      }
      mem2mem_cycles += std::max(read_cycles, write_cycles);
    }
  }

  return {mem2mem_cycles, initial_mem2mem_cycles};
}

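/**
 * \brief Generate all the single-Part Plans for a Part over a list of output
 * StripeConfigs. One Plan is created per combination of output StripeConfig,
 * input TensorConfigs and output TensorConfig, annotated with its estimated
 * cycle count and interior memory usage.
 *
 * \param part The Part to generate Plans for.
 * \param output_stripe_configs The output StripeConfigs to generate Plans with.
 * \param home_map The map from Tensors to the regions they may be homed in.
 * \param options The configuration options of the cascader.
 * \return The single-Part Plans.
 */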
std::vector<Plan> GenerateSinglePlans(
    const Part& part, const std::vector<StripeConfig>& output_stripe_configs,
    const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
        home_map,
    const CascaderOptions& options) {
  std::vector<Plan> plans;
  std::vector<Part> part_group{part};
  // Create a selection of Plans per output_stripe_config
  for (const auto& output_stripe_config : output_stripe_configs) {
    // Calculate the input_stripe_configs
    auto input_stripe_configs = part->CalculateInputStripeConfigs(output_stripe_config);
    // From the input_stripe_configs, now derive all the possible input TensorConfigs
    std::vector<std::vector<TensorConfig>> all_possible_input_configs;
    size_t i = 0;
    for (const auto& stripe_config : input_stripe_configs) {
      Tensor tensor = part->GetInputTensors()[i];
      all_possible_input_configs.push_back(
          GetPossibleInputConfigs(stripe_config, tensor, home_map.at(tensor), options));
      i++;
    }
    // Now work out all the possible combinations of input TensorConfigs
    auto input_config_combinations =
        EnumerateCombinations<TensorConfig>(all_possible_input_configs);
    Tensor output_tensor = part->GetOutputTensor();
    // Then determine the possible output TensorConfigs (no combinations here because there's only
    // one output)
    auto output_configs = GetPossibleOutputConfigs(output_stripe_config, output_tensor,
                                                   home_map.at(output_tensor), options);
    // Calculate the performance information for the output_stripe_config for both the recompute
    // and rolling cases
    PerformanceInfo rolling_perf =
        part->GetPerformanceInfo(output_stripe_config, BufferMode::ROLLING);
    PerformanceInfo recompute_perf =
        part->GetPerformanceInfo(output_stripe_config, BufferMode::RECOMPUTE);
    // For all the possible input TensorConfig combinations
    for (const auto& input_configs : input_config_combinations) {
      std::vector<TensorConfig> tensor_configs;
      std::vector<TensorConfig> open_input_configs;
      // Add the input TensorConfigs to the 'tensor_configs' and
      // record which input TensorConfigs are 'open' (i.e. 'INTERIOR')
      for (const auto& input_config : input_configs) {
        tensor_configs.push_back(input_config);
        if (input_config->GetState() == TensorConfigState::INTERIOR) {
          open_input_configs.push_back(input_config);
        }
      }
      for (const auto& output_config : output_configs) {
        // Add the output TensorConfig to the tensor_configs and to
        // the open configs (if it's 'INTERIOR')
        tensor_configs.push_back(output_config);
        std::vector<TensorConfig> open_configs = open_input_configs;
        if (output_config->GetState() == TensorConfigState::INTERIOR) {
          open_configs.push_back(output_config);
        }
        int bandwidth_cycles = 0;
        int compute_cycles = 0;
        int mem2mem_cycles = 0;
        int initial_mem2mem_cycles = 0;

        // Pick the correct performance info based on the BufferMode
        PerformanceInfo perf_info;
        if (output_config->GetBufferMode() == BufferMode::RECOMPUTE) {
          perf_info = recompute_perf;
        } else {
          perf_info = rolling_perf;
        }
        // Calculate the bandwidth cycles by dividing the bytes read/written by the
        // bandwidth of the memories
        BlockConfig block_config = perf_info->block_config;
        for (size_t i = 0; i < input_configs.size(); i++) {
          Tensor tensor = input_configs[i]->GetTensor();
          MemoryRegion copy_region = input_configs[i]->GetCopyRegion();

          if (input_configs[i]->DoCopy()) {
            std::pair<int, int> ret = GetCopyCyclesHint(input_configs[i]);
            mem2mem_cycles += ret.first;
            initial_mem2mem_cycles += ret.second;
          }
          float read_efficiency =
              GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region);
          bandwidth_cycles +=
              (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency;
        }
        MemoryRegion write_region = output_config->GetCopyRegion();
        float write_efficiency = GetTransferEfficiency(
            output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region);

        bandwidth_cycles +=
            perf_info->write_bytes / write_region->write_bandwidth * write_efficiency;
        compute_cycles = perf_info->compute_cycles;
        // Take the max of compute and bandwidth cycles as we assume compute cycles
        // can hide memory latency
        int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles);
        if (cycles > mem2mem_cycles) {
          // NPU cycles are the bottleneck - add initial mem2mem transfer cycles
          cycles += initial_mem2mem_cycles;
        }

        int memory_usage =
            GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region);
        plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group,
                             options->cascade_region, memory_usage, cycles));
      }
    }
  }
  return plans;
}

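/**
 * \brief Generate the Pareto-optimal closed Plans for a CascaderGraph. Parts are
 * visited in reverse topological order; the single-Part Plans for each Part are
 * either kept stand-alone (boundary output) or merged into the open Plans they
 * connect with (interior output), and both open and closed Plans are Pareto
 * culled as the search proceeds to keep it tractable.
 *
 * \param graph The CascaderGraph to generate Plans for.
 * \param home_map The map from Tensors to the regions they may be homed in.
 * \param options The configuration options of the cascader.
 * \return The closed Plans, indexed by the Part group they cover.
 */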
std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
    const CascaderGraph& graph,
    const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
        home_map,
    const CascaderOptions& options) {
  ICHECK_GT(options->stripe_factors, 0)
      << "stripe_factors = " << options->stripe_factors << ", but must be > 0";
  ICHECK_GT(options->max_plan_size, 0)
      << "max_plan_size = " << options->max_plan_size << ", but must be > 0";
  // Define a map between the graph Tensors and possible StripeConfigs that the Tensor may be
  // executed with
  std::unordered_map<Tensor, std::set<StripeConfig>, ObjectPtrHash, ObjectPtrEqual>
      stripe_configs_by_tensor;
  // Define a map between a given open TensorConfig and all the Plans which provide it
  std::unordered_map<TensorConfig, std::vector<Plan>> plans_by_config;
  // Define a map between a group of connected Parts and all the closed plans covering them
  std::unordered_map<std::vector<Part>, std::vector<Plan>> closed_plans;
  // Define a nested map which indexes open plans by both Part group and the open TensorConfigs
  // they provide. Note that we index in this way because the Part group and open TensorConfigs
  // combined define a group of Plans which can be mutually Pareto culled. If we culled by Part
  // group alone, we'd lose potentially valuable open Plans which could have gone on to be grown
  // into Pareto optimal closed plans.
  std::unordered_map<std::vector<Part>,
                     std::unordered_map<std::vector<TensorConfig>, std::vector<Plan>>>
      open_plans;
  // Traverse the graph in a reverse topological order (should be enforced by GetPartOrder)
  for (const auto& part : graph->GetPartOrder()) {
    // First generate all the possible StripeConfigs for the Part assuming that it will become the
    // output of a Plan. The number generated is a function of stripe_factors and the number of
    // cascadable dimensions in the Part.
    std::vector<StripeConfig> stripe_configs =
        GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping,
                                    options->enable_multi_dimensional_striping);
    // Check to see if the output Tensor is part of any existing open Plans
    if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) {
      // If there are other open Plans which have this Part's output Tensor as an input, then
      // additionally consider the StripeConfigs of those open TensorConfigs so that we have the
      // option to merge into those open Plans.
      const std::set<StripeConfig>& connecting_configs =
          stripe_configs_by_tensor.at(part->GetOutputTensor());
      std::copy(connecting_configs.begin(), connecting_configs.end(),
                std::back_inserter(stripe_configs));
    }
    // Generate all the single Part Plans for the previously determined StripeConfigs
    auto single_part_plans = GenerateSinglePlans(part, stripe_configs, home_map, options);
    std::vector<Plan> plans;
    for (const auto& partial_plan : single_part_plans) {
      // If the output TensorConfig of the Plan is 'INTERIOR', then it must be merged with
      // another open Plan
      if (partial_plan->GetOutputConfig()->GetState() == TensorConfigState::INTERIOR) {
        if (plans_by_config.find(partial_plan->GetOutputConfig()) != plans_by_config.end() &&
            partial_plan->GetOutputConfig()->GetTensor()->GetConsumers().size() == 1) {
          // Search for all the open Plans which require the same TensorConfig
          const auto& join_plans = plans_by_config.at(partial_plan->GetOutputConfig());
          for (const auto& join_plan : join_plans) {
            // Only merge to form a new Plan if the resulting Plan size won't exceed the
            // max_plan_size
            if (join_plan->GetPartGroup().size() < static_cast<size_t>(options->max_plan_size)) {
              if (partial_plan->GetMemoryUsage() + join_plan->GetMemoryUsage() <
                  options->cascade_region->size) {
                plans.push_back(partial_plan.Merge(join_plan));
              }
            }
          }
        }
      } else {
        // If the single Part Plan had a 'BOUNDARY' output TensorConfig, then it doesn't need
        // merging and can stand on its own.
        plans.push_back(partial_plan);
      }
    }
    // For all the newly created Plans, update the various maps
    std::unordered_set<std::vector<Part>> new_part_groups;
    for (const auto& plan : plans) {
      new_part_groups.insert(plan->GetPartGroup());
      if (plan->IsClosed()) {
        closed_plans[plan->GetPartGroup()].push_back(plan);
      } else {
        open_plans[plan->GetPartGroup()][plan->GetOpenConfigs()].push_back(plan);
      }
    }
    // Now Pareto cull both the open and closed Plans to remove non-optimal Plans.
    // Additionally, once culled we update another two maps, the stripe_configs_by_tensor
    // and plans_by_config maps.
    for (const auto& part_group : new_part_groups) {
      if (closed_plans.find(part_group) != closed_plans.end()) {
        closed_plans[part_group] = ParetoCullPlans(
            closed_plans.at(part_group), options->max_closed_plans, options->disable_pareto_plans);
      }
      for (const auto& it : open_plans[part_group]) {
        auto pareto_plans =
            ParetoCullPlans(it.second, options->max_open_plans, options->disable_pareto_plans);
        for (const auto& plan : pareto_plans) {
          for (const auto& open_config : plan->GetOpenConfigs()) {
            if (open_config != plan->GetOutputConfig()) {
              for (const auto& stripe_config : open_config->GetStripeConfigs()) {
                // Only add a StripeConfig if it contains more than one stripe
                if (mul_reduce(stripe_config->GetStripes()) > 1) {
                  stripe_configs_by_tensor[open_config->GetTensor()].insert(stripe_config);
                }
              }
              plans_by_config[open_config].push_back(plan);
            }
          }
        }
      }
    }
  }
  return closed_plans;
}

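// Register the plan generation functions with TVM's global function registry so
// that they can be called from the Python-side cascader.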
TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs")
    .set_body_typed([](Part part, int stripe_factors, bool enable_striping,
                       bool multi_dimensional) {
      if (stripe_factors < 0) {
        return Array<StripeConfig>();
      }
      return Array<StripeConfig>(
          GenerateOutputStripeConfigs(part, stripe_factors, enable_striping, multi_dimensional));
    });

TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans")
    .set_body_typed([](Part part, Array<StripeConfig> output_stripe_configs,
                       Map<Tensor, Array<MemoryRegion>> home_map, CascaderOptions options) {
      std::vector<StripeConfig> voutput_stripe_configs(output_stripe_configs.begin(),
                                                       output_stripe_configs.end());
      std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>
          mhome_map;
      for (const auto& it : home_map) {
        std::vector<MemoryRegion> home_regions;
        for (const auto& i : it.second) {
          home_regions.push_back(i);
        }
        mhome_map[it.first] = home_regions;
      }
      return Array<Plan>(GenerateSinglePlans(part, voutput_stripe_configs, mhome_map, options));
    });

TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans")
    .set_body_typed([](CascaderGraph graph, Map<Tensor, Array<MemoryRegion>> home_map,
                       CascaderOptions options) {
      std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>
          mhome_map;
      for (const auto& it : home_map) {
        std::vector<MemoryRegion> home_regions;
        for (const auto& i : it.second) {
          home_regions.push_back(i);
        }
        mhome_map[it.first] = home_regions;
      }
      auto closed_plans = GenerateGraphPlans(graph, mhome_map, options);
      Map<Array<Part>, Array<Plan>> tclosed_plans;
      for (auto& it : closed_plans) {
        Array<Part> part_arr(it.first.begin(), it.first.end());
        Array<Plan> plan_arr(it.second);
        tclosed_plans.Set(part_arr, plan_arr);
      }
      return tclosed_plans;
    });

TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetCopyCyclesHint")
    .set_body_typed([](TensorConfig tensor_config) {
      std::pair<int, int> ret = GetCopyCyclesHint(tensor_config);
      return Array<Integer>({ret.first, ret.second});
    });

}  // namespace cascader
}  // namespace ethosu
}  // namespace contrib
}  // namespace tvm