1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | #include "plan_generator.h" |
20 | |
21 | #include <tvm/runtime/container/array.h> |
22 | #include <tvm/runtime/container/map.h> |
23 | #include <tvm/runtime/object.h> |
24 | #include <tvm/runtime/registry.h> |
25 | #include <tvm/support/parallel_for.h> |
26 | |
27 | #include <algorithm> |
28 | #include <cmath> |
29 | #include <map> |
30 | #include <set> |
31 | #include <unordered_map> |
32 | #include <unordered_set> |
33 | #include <utility> |
34 | #include <vector> |
35 | |
36 | #include "block_config.h" |
37 | #include "cascader_options.h" |
38 | #include "common.h" |
39 | #include "graph.h" |
40 | #include "pareto.h" |
41 | #include "plan.h" |
42 | #include "stripe_config.h" |
43 | #include "tensor_config.h" |
44 | |
45 | namespace tvm { |
46 | namespace contrib { |
47 | namespace ethosu { |
48 | namespace cascader { |
49 | |
/*!
 * \brief Enumerate the Cartesian product of a list of value lists.
 *
 * Given N lists of values, returns every combination that picks exactly one
 * value from each list. The ordering matches the previous recursive
 * implementation: earlier lists vary fastest, the last list varies slowest.
 *
 * \param values The per-axis lists of candidate values.
 * \return All combinations; empty if \p values is empty or any list is empty.
 *
 * \note Implemented iteratively: the recursive version copied the (k-1)-长
 *       prefix of \p values at every level, giving O(k^2) vector copies.
 */
template <class T>
std::vector<std::vector<T>> EnumerateCombinations(std::vector<std::vector<T>> values) {
  if (values.empty()) {
    return values;
  }
  // Seed with single-element combinations drawn from the first list
  std::vector<std::vector<T>> combs;
  combs.reserve(values[0].size());
  for (const auto& value : values[0]) {
    combs.emplace_back(1, value);
  }
  // Extend one axis at a time. Iterating the new axis in the outer loop and
  // the existing combinations in the inner loop reproduces the ordering of
  // the original recursive formulation (later axes vary slowest).
  for (size_t axis = 1; axis < values.size(); axis++) {
    std::vector<std::vector<T>> extended;
    extended.reserve(combs.size() * values[axis].size());
    for (const auto& value : values[axis]) {
      for (const auto& comb : combs) {
        auto new_comb = comb;
        new_comb.push_back(value);
        extended.push_back(std::move(new_comb));
      }
    }
    combs = std::move(extended);
  }
  return combs;
}
73 | |
74 | float GetTransferEfficiency(const Tensor& tensor, const std::vector<int>& block_shape, |
75 | const MemoryRegion& memory) { |
76 | // The block_shape represents the shape of the data transfer required for each job. This is used |
77 | // to calculate how much of the block_shape is contiguous in memory (source memory for a read or |
78 | // destination memory for a write) and subsequently calculate how efficient each memory burst is. |
79 | const auto& shape = tensor->GetShape(); |
80 | int burst_length = block_shape[block_shape.size() - 1]; |
81 | if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) { |
82 | burst_length *= block_shape[block_shape.size() - 2]; |
83 | } |
84 | |
85 | burst_length *= tensor->GetDataType().bytes(); |
86 | return static_cast<float>(memory->burst_length) / std::min(burst_length, memory->burst_length); |
87 | } |
88 | |
89 | std::vector<bool> GetCascadableAxes(const Part& part) { |
90 | std::vector<bool> cascadable_axes(part->GetOutputTensor()->GetShape().size()); |
91 | // Check all the propagators to see if an output axis is projected into any |
92 | // of the inputs. If it is, then that axis is cascadable. |
93 | for (const auto& propagator : part->GetPropagators()) { |
94 | auto transform = propagator->GetTransform(); |
95 | for (size_t i = 0; i < transform.size(); i++) { |
96 | for (size_t j = 0; j < transform[0].size() - 1; j++) { |
97 | // An axis is projected if there's a non-zero element |
98 | // in the transform matrix |
99 | if (transform[i][j] != 0) { |
100 | cascadable_axes[j] = true; |
101 | } |
102 | } |
103 | } |
104 | } |
105 | return cascadable_axes; |
106 | } |
107 | |
/*!
 * \brief Generate the candidate output StripeConfigs with which a Part could be executed.
 *
 * \param part The Part to generate output StripeConfigs for.
 * \param stripe_factors How many striping granularities to consider per axis;
 *        values <= 0 produce no StripeConfigs.
 * \param enable_striping If false, every axis keeps its full extent (no splitting).
 * \param multi_dimensional If true, consider splitting several axes at once
 *        (Cartesian product of per-axis splits); otherwise split one axis at a time.
 * \return All the StripeConfigs to consider for the Part's output Tensor.
 */
std::vector<StripeConfig> GenerateOutputStripeConfigs(const Part& part, int stripe_factors,
                                                      bool enable_striping,
                                                      bool multi_dimensional) {
  // If stripe_factors is <= 0, then we won't produce any StripeConfigs
  if (stripe_factors <= 0) {
    return std::vector<StripeConfig>();
  }
  // Work out the factors to divide by as inverse powers of 2.
  // The last factor is always reserved to be '0' which will correspond to
  // choosing a stripe size of 1 in the dimension. We always include this
  // as it represents the most extreme striping choice that uses the least
  // memory, so it is our choice of last resort.
  // For example, if stripe_factors = 4 then the factors are 1, 1/2, 1/4, 0.
  std::vector<float> factors;
  for (size_t i = 0; i < static_cast<size_t>(stripe_factors) - 1; i++) {
    factors.push_back(1.0f / (std::pow(2.0f, i)));
  }
  factors.push_back(0);
  // Then use the factors to derive the possible ways to split each dimension
  // into stripes. As an example, if an had extent 128 then by applying
  // the factors derived above we get the following possible splits for that axis:
  // 128, 64, 32, 1
  std::vector<std::vector<int>> splits;
  std::vector<int> output_shape = part->GetOutputTensor()->GetShape();
  size_t output_dims = output_shape.size();
  // Only bother striping along the axes which are cascadable
  auto cascadable_axes = GetCascadableAxes(part);
  for (size_t i = 0; i < output_dims; i++) {
    auto axis = output_shape[i];
    auto axis_align = part->GetStripeAlignHint()[i];
    std::set<int> axis_splits;  // Note this is a set to remove duplicate splits
    if (!cascadable_axes[i] || (!enable_striping)) {
      // Non-cascadable (or striping disabled): the only 'split' is the full extent
      axis_splits.insert(axis);
    } else {
      for (float factor : factors) {
        // Round the scaled extent up to a multiple of the alignment hint,
        // with a minimum of one aligned unit, then clamp to the axis extent
        int split =
            std::max(static_cast<int>(std::ceil(axis * factor / axis_align)), 1) * axis_align;
        split = std::min(axis, split);
        axis_splits.insert(split);
      }
    }
    splits.push_back(std::vector<int>(axis_splits.begin(), axis_splits.end()));
  }

  std::vector<std::vector<int>> stripe_shapes;
  if (multi_dimensional) {
    // Now calculate all the possible combinations of splits for each dimension
    // to give us all the possible stripe shapes. For example, if we had two axes
    // both with possible splits in {128, 64, 32, 1}, the stripe shapes would be:
    // (128, 128), (128, 64), (128, 32) ... (1, 64), (1, 32), (1, 1)
    stripe_shapes = EnumerateCombinations<int>(splits);
  } else {
    // Only consider splitting a single axis
    int axis = 0;
    for (const auto& split : splits) {
      for (const auto& axis_split : split) {
        std::vector<int> stripe_shape = output_shape;
        // Skip the unsplit case here; the full output_shape is added once below
        if (stripe_shape[axis] != axis_split) {
          stripe_shape[axis] = axis_split;
          stripe_shapes.push_back(stripe_shape);
        }
      }
      axis++;
    }
    stripe_shapes.push_back(output_shape);
  }
  // All candidate StripeConfigs start at a zero offset in every dimension
  auto offset = std::vector<int>(output_dims);
  std::vector<StripeConfig> stripe_configs;
  // Calculate the possible axis orderings such that each axis has the opportunity
  // to be the 'outermost' axis (which is axis that is chosen for rolling).
  // Order values are 1-based rotations: a value of 1 marks the outermost axis.
  std::vector<std::vector<int>> orders;
  for (size_t i = 0; i < output_dims; i++) {
    std::vector<int> order(output_dims);
    for (size_t j = 0; j < output_dims; j++) {
      order[j] = 1 + (j + i) % output_dims;
    }
    orders.push_back(order);
  }
  // Finally, create the StripeConfigs from the possible stripe shapes and orders
  for (const auto& stripe_shape : stripe_shapes) {
    std::vector<int> stripes;
    std::vector<float> strides;
    for (size_t i = 0; i < output_dims; i++) {
      // Number of stripes needed to cover the axis, rounding up the partial last stripe
      stripes.push_back(std::ceil(static_cast<float>(output_shape[i]) / stripe_shape[i]));
      strides.push_back(static_cast<float>(stripe_shape[i]));  // strides = stripe_shape
    }
    // If the stripe shape equals the output shape (i.e. there's no striping), then
    // the order doesn't matter, so just pick the first order and continue.
    if (stripe_shape == output_shape) {
      stripe_configs.push_back(
          StripeConfig(stripe_shape, output_shape, strides, orders[0], stripes, offset));
      continue;
    }
    for (const auto& order : orders) {
      // Some logic to avoid having an axis be the 'outermost' if the stripe is full
      // size in that axis. This would otherwise be a waste because we can't roll
      // over an axis that hasn't been split.
      bool skip = false;
      for (size_t i = 0; i < output_dims; i++) {
        if (order[i] == 1 && stripe_shape[i] == output_shape[i]) {
          skip = true;
          break;
        }
      }
      if (skip) continue;
      stripe_configs.push_back(
          StripeConfig(stripe_shape, output_shape, strides, order, stripes, offset));
    }
  }
  return stripe_configs;
}
219 | |
220 | std::vector<TensorConfig> GetPossibleInputConfigs(const StripeConfig& stripe_config, |
221 | const Tensor& tensor, |
222 | const std::vector<MemoryRegion>& home_regions, |
223 | const CascaderOptions& options) { |
224 | std::vector<TensorConfig> configs; |
225 | for (const auto& home_region : home_regions) { |
226 | // Boundary configs |
227 | if (home_region == options->cascade_region || tensor->GetSize() > options->always_copy_size) { |
228 | configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, |
229 | BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); |
230 | } |
231 | if (home_region != options->cascade_region) { |
232 | configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, |
233 | BufferMode::ROLLING, {stripe_config}, true, |
234 | options->cascade_region)); |
235 | } |
236 | } |
237 | if (!tensor->IsConstant()) { |
238 | // Interior configs |
239 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
240 | BufferMode::RECOMPUTE, {stripe_config}, false, |
241 | options->cascade_region)); |
242 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
243 | BufferMode::ROLLING, {stripe_config}, false, |
244 | options->cascade_region)); |
245 | } |
246 | return configs; |
247 | } |
248 | |
249 | // Check whether a StripeConfig can be an output boundary config |
250 | bool CanBound(const StripeConfig& stripe_config) { |
251 | // Determine whether the StripeConfig results in non-overlapping stripes |
252 | // which is the case when the stripe shape equals the strides |
253 | for (size_t i = 0; i < stripe_config->GetShape().size(); i++) { |
254 | // Check that the stripe shape and strides are equal |
255 | if (stripe_config->GetShape()[i] - stripe_config->GetStrides()[i] != 0) { |
256 | return false; |
257 | } |
258 | } |
259 | return true; |
260 | } |
261 | |
262 | std::vector<TensorConfig> GetPossibleOutputConfigs(const StripeConfig& stripe_config, |
263 | const Tensor& tensor, |
264 | const std::vector<MemoryRegion>& home_regions, |
265 | const CascaderOptions& options) { |
266 | std::vector<TensorConfig> configs; |
267 | // Only StripeConfigs with non-overlapping stripes can be output boundary configs |
268 | if (CanBound(stripe_config)) { |
269 | for (const auto& home_region : home_regions) { |
270 | // Boundary configs |
271 | configs.push_back(TensorConfig(tensor, home_region, TensorConfigState::BOUNDARY, |
272 | BufferMode::RECOMPUTE, {stripe_config}, false, home_region)); |
273 | } |
274 | } |
275 | // Interior configs |
276 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
277 | BufferMode::RECOMPUTE, {stripe_config}, false, |
278 | options->cascade_region)); |
279 | configs.push_back(TensorConfig(tensor, options->cascade_region, TensorConfigState::INTERIOR, |
280 | BufferMode::ROLLING, {stripe_config}, false, |
281 | options->cascade_region)); |
282 | return configs; |
283 | } |
284 | |
285 | int GetInteriorMemoryUsage(const std::vector<TensorConfig>& input_configs, |
286 | const TensorConfig& output_config, const MemoryRegion& interior_region) { |
287 | int memory_usage = 0; |
288 | if (output_config->GetHomeRegion() == interior_region && |
289 | output_config->GetState() == TensorConfigState::BOUNDARY) { |
290 | memory_usage += output_config->GetTensor()->GetSize(); |
291 | } |
292 | for (const auto& input_config : input_configs) { |
293 | if (input_config->GetHomeRegion() == interior_region && |
294 | input_config->GetState() == TensorConfigState::BOUNDARY) { |
295 | memory_usage += input_config->GetTensor()->GetSize(); |
296 | } else if (input_config->GetHomeRegion() == interior_region || |
297 | input_config->GetCopyRegion() == interior_region) { |
298 | memory_usage += input_config->GetBufferSize(); |
299 | } |
300 | } |
301 | return memory_usage; |
302 | } |
303 | |
/**
 * \brief Returns a hint estimating the number of cycles required for
 * the copy specified by tensor_config.
 *
 * \param tensor_config The tensor configuration to estimate.
 * \return mem2mem_cycles Total estimated cycles.
 * \return initial_mem2mem_cycles Estimated cycles for the first block.
 */
std::pair<int, int> GetCopyCyclesHint(const TensorConfig& tensor_config) {
  Tensor tensor = tensor_config->GetTensor();
  MemoryRegion home_region = tensor_config->GetHomeRegion();
  MemoryRegion copy_region = tensor_config->GetCopyRegion();
  int initial_mem2mem_cycles = 0;
  int mem2mem_cycles = 0;

  // This Tensor needs to be copied - Count stripes for this config
  for (const auto& stripe_config : tensor_config->GetStripeConfigs()) {
    // Map from a transfer block shape to the number of transfers of that shape
    std::map<std::vector<int>, int> input_blocks = CountStripes(stripe_config, true);
    bool first_block = true;
    for (const auto& block : input_blocks) {
      // Total bytes moved for this block shape, scaled by the compression ratio
      int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() *
                              tensor->GetCompressionRatio() * block.second;
      // NOTE(review): bytes are *multiplied* by read/write bandwidth here, whereas
      // the bandwidth-cycle estimates in GenerateSinglePlans *divide* bytes by
      // bandwidth - confirm the intended units of read_bandwidth/write_bandwidth
      // for this hint.
      int read_cycles = bytes_transferred * home_region->read_bandwidth + home_region->read_latency;
      int write_cycles = bytes_transferred * copy_region->write_bandwidth;

      if (first_block) {
        // Only the first transferred block contributes to the initial estimate
        first_block = false;
        initial_mem2mem_cycles += std::max(read_cycles, write_cycles);
      }
      // Each transfer is bound by the slower of its read and write sides
      mem2mem_cycles += std::max(read_cycles, write_cycles);
    }
  }

  return {mem2mem_cycles, initial_mem2mem_cycles};
}
339 | |
/*!
 * \brief Generate all the Plans that cover a single Part.
 *
 * For each candidate output StripeConfig, this enumerates every combination of
 * input TensorConfigs with every output TensorConfig, estimating cycles and
 * interior memory usage for each resulting Plan.
 *
 * \param part The Part to generate Plans for.
 * \param output_stripe_configs The candidate output StripeConfigs for the Part.
 * \param home_map Map from each Tensor to the MemoryRegions it may be homed in.
 * \param options The cascader options (cascade region, size limits, etc.).
 * \return All the single-Part Plans.
 */
std::vector<Plan> GenerateSinglePlans(
    const Part& part, const std::vector<StripeConfig>& output_stripe_configs,
    const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
        home_map,
    const CascaderOptions& options) {
  std::vector<Plan> plans;
  // Every Plan produced here covers exactly this one Part
  std::vector<Part> part_group{part};
  // Create a selection of Plans per output_stripe_config
  for (const auto& output_stripe_config : output_stripe_configs) {
    // Calculate the input_stripe_configs
    auto input_stripe_configs = part->CalculateInputStripeConfigs(output_stripe_config);
    // From the input_stripe_configs, now derive all the possible input TensorConfigs
    std::vector<std::vector<TensorConfig>> all_possible_input_configs;
    size_t i = 0;
    for (const auto& stripe_config : input_stripe_configs) {
      Tensor tensor = part->GetInputTensors()[i];
      all_possible_input_configs.push_back(
          GetPossibleInputConfigs(stripe_config, tensor, home_map.at(tensor), options));
      i++;
    }
    // Now work out all the possible combinations of input TensorConfigs
    auto input_config_combinations =
        EnumerateCombinations<TensorConfig>(all_possible_input_configs);
    Tensor output_tensor = part->GetOutputTensor();
    // Then determine the possible output TensorConfigs (no combinations here because there's only
    // one output)
    auto output_configs = GetPossibleOutputConfigs(output_stripe_config, output_tensor,
                                                   home_map.at(output_tensor), options);
    // Calculate the performance information for the output_stripe_config for both the recompute and
    // rolling cases
    PerformanceInfo rolling_perf =
        part->GetPerformanceInfo(output_stripe_config, BufferMode::ROLLING);
    PerformanceInfo recompute_perf =
        part->GetPerformanceInfo(output_stripe_config, BufferMode::RECOMPUTE);
    // For all the possible input TensorConfig combinations
    for (const auto& input_configs : input_config_combinations) {
      std::vector<TensorConfig> tensor_configs;
      std::vector<TensorConfig> open_input_configs;
      // Add the input TensorConfigs to the 'tensor_configs' and
      // record which input TensorConfigs are 'open' (i.e. 'INTERIOR')
      for (const auto& input_config : input_configs) {
        tensor_configs.push_back(input_config);
        if (input_config->GetState() == TensorConfigState::INTERIOR) {
          open_input_configs.push_back(input_config);
        }
      }
      for (const auto& output_config : output_configs) {
        // Add the output TensorConfig to the tensor_configs and to
        // the open configs (if it's 'INTERIOR')
        tensor_configs.push_back(output_config);
        std::vector<TensorConfig> open_configs = open_input_configs;
        if (output_config->GetState() == TensorConfigState::INTERIOR) {
          open_configs.push_back(output_config);
        }
        // NOTE(review): these accumulators are ints, so fractional cycle
        // estimates below are truncated on each addition - confirm intended.
        int bandwidth_cycles = 0;
        int compute_cycles = 0;
        int mem2mem_cycles = 0;
        int initial_mem2mem_cycles = 0;

        // Pick the correct performance info based on the BufferMode
        PerformanceInfo perf_info;
        if (output_config->GetBufferMode() == BufferMode::RECOMPUTE) {
          perf_info = recompute_perf;
        } else {
          perf_info = rolling_perf;
        }
        // Calculate the bandwidth cycles by multiplying the bytes read/written by the
        // bandwidth of the memories
        BlockConfig block_config = perf_info->block_config;
        for (size_t i = 0; i < input_configs.size(); i++) {
          Tensor tensor = input_configs[i]->GetTensor();
          MemoryRegion copy_region = input_configs[i]->GetCopyRegion();

          // Inputs that need a mem-to-mem copy contribute extra transfer cycles
          if (input_configs[i]->DoCopy()) {
            std::pair<int, int> ret = GetCopyCyclesHint(input_configs[i]);
            mem2mem_cycles += ret.first;
            initial_mem2mem_cycles += ret.second;
          }
          // Scale read cycles by how efficiently the block shape bursts from memory
          float read_efficiency =
              GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region);
          bandwidth_cycles +=
              (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency;
        }
        MemoryRegion write_region = output_config->GetCopyRegion();
        float write_efficiency = GetTransferEfficiency(
            output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region);

        bandwidth_cycles +=
            perf_info->write_bytes / write_region->write_bandwidth * write_efficiency;
        compute_cycles = perf_info->compute_cycles;
        // Take the max of compute and bandwidth cycles as we assume compute cycles
        // can hide memory latency
        int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles);
        if (cycles > mem2mem_cycles) {
          // NPU cycles are the bottleneck - add initial mem2mem transfer cycles
          cycles += initial_mem2mem_cycles;
        }

        int memory_usage =
            GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region);
        plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group,
                             options->cascade_region, memory_usage, cycles));
      }
    }
  }
  return plans;
}
447 | |
/*!
 * \brief Generate the Pareto-optimal closed Plans for every coverable Part group in a graph.
 *
 * Traverses the graph in reverse topological order, generating single-Part
 * Plans and merging them into open Plans where Parts connect, Pareto-culling
 * as it goes.
 *
 * \param graph The CascaderGraph to generate Plans for.
 * \param home_map Map from each Tensor to the MemoryRegions it may be homed in.
 * \param options The cascader options (stripe factors, plan size limits, etc.).
 * \return Map from each covered Part group to its closed Plans.
 */
std::unordered_map<std::vector<Part>, std::vector<Plan>> GenerateGraphPlans(
    const CascaderGraph& graph,
    const std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual>&
        home_map,
    const CascaderOptions& options) {
  // Sanity-check the options that directly control the search space
  ICHECK_GT(options->stripe_factors, 0)
      << "stripe_factors = " << options->stripe_factors << ", but must be > 0";
  ICHECK_GT(options->max_plan_size, 0)
      << "max_plan_size = " << options->max_plan_size << ", but must be > 0";
  // Define a map between the graph Tensors and possible StripeConfigs that the Tensor may be
  // executed with
  std::unordered_map<Tensor, std::set<StripeConfig>, ObjectPtrHash, ObjectPtrEqual>
      stripe_configs_by_tensor;
  // Define a map between a given open TensorConfig and all the Plans which provide it
  std::unordered_map<TensorConfig, std::vector<Plan>> plans_by_config;
  // Define a map between a group of connected Parts and all the closed plans covering them
  std::unordered_map<std::vector<Part>, std::vector<Plan>> closed_plans;
  // Define a nested map which indexes open plans by both Part group and the open TensorConfigs they
  // provide. Note that we index in this way because Part group + open TensorConfigs combined
  // defines a group of Plans which can be mutually Pareto culled. If we culled by Part group alone,
  // we'd lose potentially valuable open Plans which could have gone on to be grown into Pareto
  // optimal closed plans.
  std::unordered_map<std::vector<Part>,
                     std::unordered_map<std::vector<TensorConfig>, std::vector<Plan>>>
      open_plans;
  // Traverse the graph in a reverse topological order (should be enforced by GetPartOrder)
  for (const auto& part : graph->GetPartOrder()) {
    // First generate all the possible StripeConfigs for the Part assuming that it will become the
    // output of a Plan. The number generated is a function of stripe_factors and the number of
    // cascadable dimensions in the Part.
    std::vector<StripeConfig> stripe_configs =
        GenerateOutputStripeConfigs(part, options->stripe_factors, options->enable_striping,
                                    options->enable_multi_dimensional_striping);
    // Check to see if the output Tensor is part of any existing open Plans
    if (stripe_configs_by_tensor.find(part->GetOutputTensor()) != stripe_configs_by_tensor.end()) {
      // If there are other open Plans which have this Part's output Tensor as an input, then
      // additionally consider the StripeConfigs of those open TensorConfigs so that we have the
      // option to merge into those open Plans.
      const std::set<StripeConfig>& connecting_configs =
          stripe_configs_by_tensor.at(part->GetOutputTensor());
      std::copy(connecting_configs.begin(), connecting_configs.end(),
                std::back_inserter(stripe_configs));
    }
    // Generate all the single Part Plans for the previously determined StripeConfigs
    auto single_part_plans = GenerateSinglePlans(part, stripe_configs, home_map, options);
    std::vector<Plan> plans;
    for (const auto& partial_plan : single_part_plans) {
      // If the output TensorConfig of the Plan is 'INTERIOR', then it must be merged with
      // another open Plan
      if (partial_plan->GetOutputConfig()->GetState() == TensorConfigState::INTERIOR) {
        // Merging is only valid when the output Tensor has exactly one consumer
        if (plans_by_config.find(partial_plan->GetOutputConfig()) != plans_by_config.end() &&
            partial_plan->GetOutputConfig()->GetTensor()->GetConsumers().size() == 1) {
          // Search for all the open Plans which require the same TensorConfig
          const auto& join_plans = plans_by_config.at(partial_plan->GetOutputConfig());
          for (const auto& join_plan : join_plans) {
            // Only merge to form a new Plan if the resulting Plan size won't exceed the
            // max_plan_size
            if (join_plan->GetPartGroup().size() < static_cast<size_t>(options->max_plan_size)) {
              // Also reject merges whose combined memory usage can't fit the cascade region
              if (partial_plan->GetMemoryUsage() + join_plan->GetMemoryUsage() <
                  options->cascade_region->size) {
                plans.push_back(partial_plan.Merge(join_plan));
              }
            }
          }
        }
      } else {
        // If the single Part Plan had a 'BOUNDARY' output TensorConfig, then it doesn't need
        // merging and can stand on its own.
        plans.push_back(partial_plan);
      }
    }
    // For all the newly created Plans, update the various maps
    std::unordered_set<std::vector<Part>> new_part_groups;
    for (const auto& plan : plans) {
      new_part_groups.insert(plan->GetPartGroup());
      if (plan->IsClosed()) {
        closed_plans[plan->GetPartGroup()].push_back(plan);
      } else {
        open_plans[plan->GetPartGroup()][plan->GetOpenConfigs()].push_back(plan);
      }
    }
    // Now Pareto cull both the open and closed Plans to remove non-optimal Plans
    // Additionally, once culled we update another two maps, the stripe_configs_by_tensor
    // and plans_by_config maps.
    for (const auto& part_group : new_part_groups) {
      if (closed_plans.find(part_group) != closed_plans.end()) {
        closed_plans[part_group] = ParetoCullPlans(
            closed_plans.at(part_group), options->max_closed_plans, options->disable_pareto_plans);
      }
      for (const auto& it : open_plans[part_group]) {
        auto pareto_plans =
            ParetoCullPlans(it.second, options->max_open_plans, options->disable_pareto_plans);
        for (const auto& plan : pareto_plans) {
          for (const auto& open_config : plan->GetOpenConfigs()) {
            // The Plan's own output config is provided by this Plan, not required by it,
            // so only the input-side open configs are indexed for future merging
            if (open_config != plan->GetOutputConfig()) {
              for (const auto& stripe_config : open_config->GetStripeConfigs()) {
                // Only add a StripeConfig if it contains more than one stripe
                if (mul_reduce(stripe_config->GetStripes()) > 1) {
                  stripe_configs_by_tensor[open_config->GetTensor()].insert(stripe_config);
                }
              }
              plans_by_config[open_config].push_back(plan);
            }
          }
        }
      }
    }
  }
  return closed_plans;
}
558 | |
559 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateOutputStripeConfigs" ) |
560 | .set_body_typed([](Part part, int stripe_factors, bool enable_striping, |
561 | bool multi_dimensional) { |
562 | if (stripe_factors < 0) { |
563 | return Array<StripeConfig>(); |
564 | } |
565 | return Array<StripeConfig>( |
566 | GenerateOutputStripeConfigs(part, stripe_factors, enable_striping, multi_dimensional)); |
567 | }); |
568 | |
569 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateSinglePlans" ) |
570 | .set_body_typed([](Part part, Array<StripeConfig> output_stripe_configs, |
571 | Map<Tensor, Array<MemoryRegion>> home_map, CascaderOptions options) { |
572 | std::vector<StripeConfig> voutput_stripe_configs(output_stripe_configs.begin(), |
573 | output_stripe_configs.end()); |
574 | std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual> |
575 | mhome_map; |
576 | for (const auto& it : home_map) { |
577 | std::vector<MemoryRegion> home_regions; |
578 | for (const auto& i : it.second) { |
579 | home_regions.push_back(i); |
580 | } |
581 | mhome_map[it.first] = home_regions; |
582 | } |
583 | return Array<Plan>(GenerateSinglePlans(part, voutput_stripe_configs, mhome_map, options)); |
584 | }); |
585 | |
586 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GenerateGraphPlans" ) |
587 | .set_body_typed([](CascaderGraph graph, Map<Tensor, Array<MemoryRegion>> home_map, |
588 | CascaderOptions options) { |
589 | std::unordered_map<Tensor, std::vector<MemoryRegion>, ObjectPtrHash, ObjectPtrEqual> |
590 | mhome_map; |
591 | for (const auto& it : home_map) { |
592 | std::vector<MemoryRegion> home_regions; |
593 | for (const auto& i : it.second) { |
594 | home_regions.push_back(i); |
595 | } |
596 | mhome_map[it.first] = home_regions; |
597 | } |
598 | auto closed_plans = GenerateGraphPlans(graph, mhome_map, options); |
599 | Map<Array<Part>, Array<Plan>> tclosed_plans; |
600 | for (auto& it : closed_plans) { |
601 | Array<Part> part_arr(it.first.begin(), it.first.end()); |
602 | Array<Plan> plan_arr(it.second); |
603 | tclosed_plans.Set(part_arr, plan_arr); |
604 | } |
605 | return tclosed_plans; |
606 | }); |
607 | |
608 | TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.GetCopyCyclesHint" ) |
609 | .set_body_typed([](TensorConfig tensor_config) { |
610 | std::pair<int, int> ret = GetCopyCyclesHint(tensor_config); |
611 | return Array<Integer>({ret.first, ret.second}); |
612 | }); |
613 | |
614 | } // namespace cascader |
615 | } // namespace ethosu |
616 | } // namespace contrib |
617 | } // namespace tvm |
618 | |