1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | #include <tvm/tir/transform.h> |
20 | |
21 | #include <cmath> |
22 | #include <memory> |
23 | #include <numeric> |
24 | #include <string> |
25 | #include <unordered_map> |
26 | #include <unordered_set> |
27 | #include <vector> |
28 | |
29 | #include "../utils.h" |
30 | |
31 | namespace tvm { |
32 | namespace tir { |
33 | |
using support::NDIntSet;

/*! \brief Type for a multi-dimensional access index: one PrimExpr per buffer dimension */
using MultiIndex = std::vector<PrimExpr>;
/*! \brief Vector of int64_t, used for shapes, strides and extents */
using IntVec = std::vector<int64_t>;
/*! \brief Vector of non-owning pointers to for loops */
using ForVec = std::vector<const ForNode*>;

/*!
 * \brief An unordered_map for (for, buffer) => V
 * \tparam V The value type
 */
template <class V>
using ForBufferMap = std::unordered_map<const ForNode*, std::unordered_map<const BufferNode*, V>>;
49 | |
/*! \brief Signed log: given x, compute log2(|x| + 1) */
inline double slog(double x) {
  double magnitude = (x >= 0) ? x : -x;
  return std::log2(magnitude + 1);
}
52 | |
53 | namespace utils { |
54 | |
55 | /*! |
56 | * \brief Get the shape of the buffer |
57 | * \param buffer The buffer |
58 | * \param analyzer The analyzer |
59 | * \return The shape of the buffer |
60 | */ |
61 | std::vector<int64_t> GetBufferShape(const Buffer& buffer, arith::Analyzer* analyzer) { |
62 | int ndim = buffer->shape.size(); |
63 | std::vector<int64_t> result; |
64 | result.reserve(ndim); |
65 | for (const PrimExpr& i : buffer->shape) { |
66 | if (const IntImmNode* int_imm = i.as<IntImmNode>()) { |
67 | result.push_back(int_imm->value); |
68 | continue; |
69 | } |
70 | arith::ConstIntBound bound = analyzer->const_int_bound(i); |
71 | if (0 <= bound->max_value && bound->max_value < arith::ConstIntBound::kPosInf) { |
72 | result.push_back(bound->max_value); |
73 | } else { |
74 | result.push_back(1); |
75 | } |
76 | } |
77 | return result; |
78 | } |
79 | |
80 | /*! |
81 | * \brief Given a loop, return its `pragma_auto_unroll_max_step` annotation if it exists |
82 | * \param loop The loop to be checked |
83 | * \return The value of `pragma_auto_unroll_max_step` if it exists, or -1 if it does not exist |
84 | */ |
85 | int64_t GetPragmaAutoUnroll(const ForNode* loop) { |
86 | if (Optional<IntImm> auto_unroll = GetAnn<IntImm>(loop, tir::attr::pragma_auto_unroll_max_step)) { |
87 | return auto_unroll.value()->value; |
88 | } |
89 | return -1; |
90 | } |
91 | |
92 | /*! |
93 | * \brief Given a list of loops, return the extent of the first loop if the list is not empty, |
94 | * and the first loop has constant extent. Otherwise returns the default value given |
95 | * \param loops The list of loops to be checked |
96 | * \param default_value The default value to be returned if the list is empty or the first loop |
97 | * does not have constant extent |
98 | * \return The extent of the first loop if the list is not empty, or the first loop has constant |
99 | * extent. Otherwise returns the default value |
100 | */ |
101 | int64_t FirstLoopExtent(const ForVec& loops, int64_t default_value) { |
102 | if (!loops.empty()) { |
103 | if (const int64_t* extent = GetLoopIntExtent(loops[0])) { |
104 | return *extent; |
105 | } |
106 | } |
107 | return default_value; |
108 | } |
109 | |
110 | /*! |
111 | * \brief Relax each of the multi-indexing pattern according to the domains bound in the analyzer, |
112 | * and then union them into a single region |
113 | * \param multi_index_pattern A list of multi-index pattern to be relaxed |
114 | * \param numel The size of the single region after union |
115 | * \param analyzer The analyzer that contains the domain information |
116 | * \return The relaxed and unioned region |
117 | */ |
118 | IntVec RelaxAndUnion(const std::vector<MultiIndex>& multi_indices, int64_t* numel, |
119 | arith::Analyzer* analyzer) { |
120 | *numel = 1; |
121 | if (multi_indices.empty()) { |
122 | return {}; |
123 | } |
124 | int n_indices = multi_indices.size(); |
125 | int ndim = multi_indices[0].size(); |
126 | IntVec access_shape(ndim, 0); |
127 | for (int i = 0; i < ndim; ++i) { |
128 | int64_t minimum = arith::ConstIntBound::kPosInf; |
129 | int64_t maximum = arith::ConstIntBound::kNegInf; |
130 | for (int j = 0; j < n_indices; ++j) { |
131 | arith::ConstIntBound bound = analyzer->const_int_bound(multi_indices[j][i]); |
132 | minimum = std::min(minimum, bound->min_value); |
133 | maximum = std::max(maximum, bound->max_value); |
134 | } |
135 | *numel *= maximum - minimum + 1; |
136 | access_shape[i] = maximum - minimum + 1; |
137 | } |
138 | return access_shape; |
139 | } |
140 | |
141 | /*! |
142 | * \brief Given a list of multi-index pattern, return the minimal stride of a variable on it |
143 | * \param multi_indices The list of multi-index pattern |
144 | * \param buffer_stride The stride of the buffer |
145 | * \param var The variable to be checked |
146 | * \return The minimal stride of the variable on the multi-index pattern |
147 | */ |
148 | int64_t GetVarStride(const std::vector<MultiIndex>& multi_indices, const IntVec& buffer_stride, |
149 | const Var& var) { |
150 | class : private ExprVisitor { |
151 | public: |
152 | static int64_t (const PrimExpr& expr, const Var& var) { |
153 | CoefficientExtractor (var); |
154 | extractor.VisitExpr(expr); |
155 | return (extractor.visited_var && !extractor.visited_mul && !extractor.visited_add) |
156 | ? 1 |
157 | : (extractor.visited_var ? extractor.stride : 0); |
158 | } |
159 | |
160 | private: |
161 | explicit CoefficientExtractor(const Var& var) |
162 | : var(var), stride(0), visited_var(false), visited_add(false), visited_mul(false) {} |
163 | |
164 | void VisitExpr_(const MulNode* node) override { |
165 | ExprVisitor::VisitExpr_(node); |
166 | if (visited_var && !visited_add) { |
167 | if (const auto* a = node->a.as<IntImmNode>()) { |
168 | visited_mul = true; |
169 | stride = a->value; |
170 | } else if (const auto* b = node->b.as<IntImmNode>()) { |
171 | visited_mul = true; |
172 | stride = b->value; |
173 | } |
174 | } |
175 | } |
176 | |
177 | void VisitExpr_(const AddNode* node) override { |
178 | ExprVisitor::VisitExpr_(node); |
179 | if (visited_var && !visited_mul) { |
180 | visited_add = true; |
181 | stride = 1; |
182 | } |
183 | } |
184 | |
185 | void VisitExpr_(const VarNode* node) override { |
186 | if (node == var.get()) { |
187 | visited_var = true; |
188 | stride = 2; |
189 | } |
190 | } |
191 | |
192 | const Var& var; |
193 | int64_t stride; |
194 | bool visited_var; |
195 | bool visited_add; |
196 | bool visited_mul; |
197 | }; |
198 | |
199 | constexpr int64_t kNotFound = std::numeric_limits<int64_t>::max(); |
200 | int ndim = buffer_stride.size(); |
201 | // Calculate the min stride possible |
202 | int64_t result = kNotFound; |
203 | for (const MultiIndex& multi_index : multi_indices) { |
204 | ICHECK_EQ(multi_index.size(), buffer_stride.size()); |
205 | // Find the rightest dimension that contains the given variable |
206 | for (int i = ndim - 1; i >= 0; --i) { |
207 | int64_t coef = CoefficientExtractor::Extract(multi_index[i], var); |
208 | if (coef != 0) { |
209 | result = std::min(result, std::abs(coef) * buffer_stride[i]); |
210 | break; |
211 | } |
212 | } |
213 | } |
214 | return (result == kNotFound) ? 0 : result; |
215 | } |
216 | |
217 | /*! |
218 | * \brief Converts a 2-dimensional STL vector to a TVM NDArray |
219 | * \param src The source 2-dimensional STL vector |
220 | * \return The converted TVM NDArray |
221 | */ |
222 | runtime::NDArray AsNDArray(const std::vector<std::vector<double>>& src) { |
223 | ICHECK(!src.empty()); |
224 | int n = src.size(); |
225 | int m = src[0].size(); |
226 | runtime::NDArray tgt = runtime::NDArray::Empty( |
227 | /*shape=*/{n, m}, |
228 | /*dtype=*/DLDataType{kDLFloat, 64, 1}, |
229 | /*ctx=*/DLDevice{kDLCPU, 0}); |
230 | double* data = static_cast<double*>(tgt->data); |
231 | for (const std::vector<double>& row : src) { |
232 | for (double v : row) { |
233 | *data++ = v; |
234 | } |
235 | } |
236 | return tgt; |
237 | } |
238 | |
239 | } // namespace utils |
240 | |
241 | namespace transform { |
242 | |
243 | /*! |
244 | * \brief Create a pass that simplifies the IR for feature extraction |
245 | * \return The pass created |
246 | */ |
247 | Pass () { |
248 | class Simplifier : private StmtExprMutator { |
249 | public: |
250 | static Stmt Run(Stmt stmt) { return Simplifier()(std::move(stmt)); } |
251 | |
252 | private: |
253 | static bool HasBufferLoad(const PrimExpr& expr) { |
254 | bool found = false; |
255 | PostOrderVisit(expr, [&found](const ObjectRef& node) { |
256 | if (node->IsInstance<BufferLoadNode>()) { |
257 | found = true; |
258 | } |
259 | }); |
260 | return found; |
261 | } |
262 | |
263 | PrimExpr VisitExpr_(const SelectNode* node) final { |
264 | if (HasBufferLoad(node->true_value) || HasBufferLoad(node->false_value) || |
265 | HasBufferLoad(node->condition)) { |
266 | return GetRef<Select>(node); |
267 | } |
268 | return make_const(node->dtype, 1.0); |
269 | } |
270 | |
271 | PrimExpr VisitExpr_(const VarNode* var) final { |
272 | if (unit_vars_.count(GetRef<Var>(var))) { |
273 | return make_const(var->dtype, 0.0); |
274 | } |
275 | return GetRef<Var>(var); |
276 | } |
277 | |
278 | Stmt VisitStmt_(const ForNode* loop) final { |
279 | if (is_zero(loop->min) && is_one(loop->extent) && loop->kind == ForKind::kSerial && |
280 | loop->annotations.empty()) { |
281 | unit_vars_.insert(loop->loop_var); |
282 | return VisitStmt(loop->body); |
283 | } else { |
284 | return StmtExprMutator::VisitStmt_(loop); |
285 | } |
286 | } |
287 | |
288 | std::unordered_set<Var, ObjectPtrHash, ObjectPtrEqual> unit_vars_; |
289 | }; |
290 | auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { |
291 | PrimFuncNode* n = f.CopyOnWrite(); |
292 | n->body = Simplifier::Run(std::move(n->body)); |
293 | return f; |
294 | }; |
295 | return CreatePrimFuncPass(pass_func, 0, "tir.SimplifyForFeatureExtraction" , {}); |
296 | } |
297 | |
298 | /*! |
299 | * \brief Create a list of passes that preprocesses the IR for feature extraction |
300 | * \return The list of passes created |
301 | */ |
302 | Sequential PassListForPerStoreFeature() { |
303 | return Sequential({ |
304 | tir::transform::RemoveWeightLayoutRewriteBlock(/*skip_ndarray_rewrite*/ true), |
305 | tir::transform::SimplifyForFeatureExtraction(), |
306 | tir::transform::LowerCrossThreadReduction(), |
307 | tir::transform::LowerInitBlock(), |
308 | tir::transform::PlanAndUpdateBufferAllocationLocation(), |
309 | tir::transform::ConvertBlocksToOpaque(), |
310 | tir::transform::UnifyThreadBinding(), |
311 | tir::transform::CompactBufferAllocation(), |
312 | tir::transform::LowerMatchBuffer(), |
313 | tir::transform::Simplify(), |
314 | }); |
315 | } |
316 | |
317 | } // namespace transform |
318 | |
319 | /*! \brief A data structure managing loop nests */ |
320 | struct LoopNest { |
321 | int64_t prod = 1; // The product of the extents of all the loops |
322 | ForVec loops; // All the loops |
323 | IntVec auto_unroll; // The loops with auto unroll pragma |
324 | ForVec parallel; // The loops whose ForKind are kParallel |
325 | ForVec vectorize; // The loops whose ForKind are kVectorized |
326 | ForVec unroll; // The loops whose ForKind are kUnrolled |
327 | ForVec blockIdx_x; // The loops whose ForKind are kThreadBinding to blockIdx.x |
328 | ForVec blockIdx_y; // The loops whose ForKind are kThreadBinding to blockIdx.y |
329 | ForVec blockIdx_z; // The loops whose ForKind are kThreadBinding to blockIdx.z |
330 | ForVec threadIdx_x; // The loops whose ForKind are kThreadBinding to threadIdx.x |
331 | ForVec threadIdx_y; // The loops whose ForKind are kThreadBinding to threadIdx.y |
332 | ForVec threadIdx_z; // The loops whose ForKind are kThreadBinding to threadIdx.z |
333 | ForVec vthread; // The loops whose ForKind are kThreadBinding to vthread.* |
334 | |
335 | /*! |
336 | * \brief Push a new loop into the loop nest |
337 | * \param loop The loop to be pushed |
338 | * \param auto_unroll_attr The auto unroll attribute of the loop |
339 | * \return A list of for loops that the loop is bound to |
340 | */ |
341 | ForVec* Push(const ForNode* loop, int64_t* auto_unroll_attr) { |
342 | if (const int64_t* extent = GetLoopIntExtent(loop)) { |
343 | this->prod *= *extent; |
344 | } |
345 | this->loops.push_back(loop); |
346 | if ((*auto_unroll_attr = utils::GetPragmaAutoUnroll(loop)) > 0) { |
347 | this->auto_unroll.push_back(*auto_unroll_attr); |
348 | } |
349 | ForVec* ref_loops = nullptr; |
350 | if (loop->kind == ForKind::kParallel) { |
351 | ref_loops = ∥ |
352 | } else if (loop->kind == ForKind::kVectorized) { |
353 | ref_loops = &vectorize; |
354 | } else if (loop->kind == ForKind::kUnrolled) { |
355 | ref_loops = &unroll; |
356 | } else if (loop->kind == ForKind::kThreadBinding) { |
357 | std::string thread_tag = loop->thread_binding.value()->thread_tag; |
358 | if (thread_tag == "blockIdx.x" ) { |
359 | ref_loops = &blockIdx_x; |
360 | } else if (thread_tag == "blockIdx.y" ) { |
361 | ref_loops = &blockIdx_y; |
362 | } else if (thread_tag == "blockIdx.z" ) { |
363 | ref_loops = &blockIdx_z; |
364 | } else if (thread_tag == "threadIdx.x" ) { |
365 | ref_loops = &threadIdx_x; |
366 | } else if (thread_tag == "threadIdx.y" ) { |
367 | ref_loops = &threadIdx_y; |
368 | } else if (thread_tag == "threadIdx.z" ) { |
369 | ref_loops = &threadIdx_z; |
370 | } else if (support::StartsWith(thread_tag, "vthread" )) { |
371 | ref_loops = &vthread; |
372 | } else { |
373 | LOG(FATAL) << "ValueError: Unable to recognize thread tag: " << thread_tag; |
374 | } |
375 | } |
376 | if (ref_loops != nullptr) { |
377 | ref_loops->push_back(loop); |
378 | } |
379 | return ref_loops; |
380 | } |
381 | |
382 | /*! |
383 | * \brief Pop the last loop from the loop nest |
384 | * \param loop The loop to be popped |
385 | * \param ref_loops The list of for loops that the loop is bound to |
386 | * \param auto_unroll_attr The auto unroll attribute of the loop |
387 | */ |
388 | void Pop(const ForNode* loop, ForVec* ref_loops, int auto_unroll_attr) { |
389 | if (ref_loops) { |
390 | ref_loops->pop_back(); |
391 | } |
392 | if (auto_unroll_attr > 0) { |
393 | this->auto_unroll.pop_back(); |
394 | } |
395 | if (const int64_t* extent = GetLoopIntExtent(loop)) { |
396 | this->prod /= *extent; |
397 | } |
398 | this->loops.pop_back(); |
399 | } |
400 | }; |
401 | |
402 | /****** Group 1: Computation related features ******/ |
403 | |
404 | namespace group1 { |
405 | |
/*! \brief Group 1 features: computation-related features, extracted per BufferStore */
struct Feature {
  /*! \brief Arithmetic features: op counts, each weighted by the enclosing loop iteration count */
  struct ArithOps {
    // Float-point arithmetic features
    int64_t float_mad = 0;         // The number of float MAD (Multiply–add) ops
    int64_t float_add_sub = 0;     // The number of float add and sub ops
    int64_t float_mul = 0;         // The number of float multiply ops
    int64_t float_div_mod = 0;     // The number of float div and mod ops
    int64_t float_cmp = 0;         // The number of float comparison ops
    int64_t float_math_func = 0;   // The number of float math func calls
    int64_t float_other_func = 0;  // The number of other float func calls
    // Integer arithmetic features
    int64_t int_mad = 0;         // The number of integer MAD (Multiply–add) ops
    int64_t int_add_sub = 0;     // The number of integer add and sub ops
    int64_t int_mul = 0;         // The number of integer multiply ops
    int64_t int_div_mod = 0;     // The number of integer div and mod ops
    int64_t int_cmp = 0;         // The number of integer comparison ops
    int64_t int_math_func = 0;   // The number of integer math func calls
    int64_t int_other_func = 0;  // The number of other integer func calls
    // Other arithmetic features
    int64_t bool_op = 0;    // The number of bool ops
    int64_t select_op = 0;  // The number of select ops

    // Number of entries Export() appends; keep in sync with the array below.
    static constexpr int64_t kCount = 16;

    ArithOps() = default;
    // Counts the ops in the store's value expression (defined later in this file).
    ArithOps(const BufferStoreNode* store, int64_t prod_loop_extent);

    // Appends kCount slog-scaled counters to `v` in a fixed order; this order is
    // part of the feature-vector contract and must not change.
    void Export(std::vector<double>* v) const {
      double vs[] = {
          slog(float_mad), slog(float_add_sub), slog(float_mul), slog(float_div_mod),
          slog(float_cmp), slog(float_math_func), slog(float_other_func),  //
          slog(int_mad), slog(int_add_sub), slog(int_mul), slog(int_div_mod),
          slog(int_cmp), slog(int_math_func), slog(int_other_func),  //
          slog(bool_op), slog(select_op),
      };
      v->insert(v->end(), std::begin(vs), std::end(vs));
    }
  };

  /*! \brief Loop binding features for one kind of loop annotation */
  struct ForKindFeature {
    enum class Pos : int {
      kPosNone = 0,           // Does not have this kind of annotation
      kPosInnerSpatial = 1,   // The annotated iterator is the innermost spatial iterator
      kPosMiddleSpatial = 2,  // The annotated iterator is a middle spatial iterator
      kPosOuterSpatial = 3,   // The annotated iterator is the outermost spatial iterator
      kPosInnerReduce = 4,    // The annotated iterator is the innermost reduce iterator
      kPosMiddleReduce = 5,   // The annotated iterator is a middle reduce iterator
      kPosOuterReduce = 6,    // The annotated iterator is the outermost reduce iterator
      kPosMixed = 7,          // The annotated iterator is a mixed space and reduce iterator
    };
    int64_t num = 0;           // The number of iterators with the annotation
    int64_t prod = 0;          // The product of the lengths of iterators with the annotation
    int64_t len = 0;           // The length of the innermost iterator with the annotation
    Pos pos = Pos::kPosMixed;  // The position of the iterators with the annotation

    // Number of entries Export() appends: 3 scalars + an 8-way one-hot of `pos`.
    static constexpr int64_t kCount = 11;

    explicit ForKindFeature(const ForVec& loops);

    // Appends kCount values to `v`: slog(num/prod/len) then the one-hot position.
    void Export(std::vector<double>* v) const {
      double vs[] = {
          slog(num),
          slog(prod),
          slog(len),
          static_cast<double>(static_cast<int>(pos) == 0),
          static_cast<double>(static_cast<int>(pos) == 1),
          static_cast<double>(static_cast<int>(pos) == 2),
          static_cast<double>(static_cast<int>(pos) == 3),
          static_cast<double>(static_cast<int>(pos) == 4),
          static_cast<double>(static_cast<int>(pos) == 5),
          static_cast<double>(static_cast<int>(pos) == 6),
          static_cast<double>(static_cast<int>(pos) == 7),
      };
      v->insert(v->end(), std::begin(vs), std::end(vs));
    }
  };

  ArithOps arith_ops;          // Arithmetic features
  ForKindFeature vectorize;    // Loop binding features: kVectorize
  ForKindFeature unroll;       // Loop binding features: kUnroll
  ForKindFeature parallel;     // Loop binding features: kParallel
  bool is_gpu = false;         // If the program is running on GPU
  int64_t blockIdx_x_len = 1;  // The length of blockIdx.x
  int64_t blockIdx_y_len = 1;  // The length of blockIdx.y
  int64_t blockIdx_z_len = 1;  // The length of blockIdx.z
  int64_t threadIdx_x_len = 1;  // The length of threadIdx.x
  int64_t threadIdx_y_len = 1;  // The length of threadIdx.y
  int64_t threadIdx_z_len = 1;  // The length of threadIdx.z
  int64_t vthread_len = 1;      // The length of virtual thread

  // Total feature width: 16 + 3*11 + (is_gpu flag + 7 thread extents) = 57.
  static constexpr int64_t kCount = ArithOps::kCount + ForKindFeature::kCount * 3 + 8;

  // Note: the `is_gpu` parameter shadows the member; the member is only set when it is true.
  explicit Feature(const BufferStoreNode* store, const LoopNest& loop_nest, bool is_gpu)
      : arith_ops(store, loop_nest.prod),
        vectorize(loop_nest.vectorize),
        unroll(loop_nest.unroll),
        parallel(loop_nest.parallel) {
    if (is_gpu) {
      this->is_gpu = true;
      // Thread extents default to 1 when the loop list is empty or non-constant.
      this->blockIdx_x_len = utils::FirstLoopExtent(loop_nest.blockIdx_x, 1);
      this->blockIdx_y_len = utils::FirstLoopExtent(loop_nest.blockIdx_y, 1);
      this->blockIdx_z_len = utils::FirstLoopExtent(loop_nest.blockIdx_z, 1);
      this->threadIdx_x_len = utils::FirstLoopExtent(loop_nest.threadIdx_x, 1);
      this->threadIdx_y_len = utils::FirstLoopExtent(loop_nest.threadIdx_y, 1);
      this->threadIdx_z_len = utils::FirstLoopExtent(loop_nest.threadIdx_z, 1);
      this->vthread_len = utils::FirstLoopExtent(loop_nest.vthread, 1);
    }
  }

  // Appends all kCount group-1 features to `v`, sub-feature by sub-feature.
  void Export(std::vector<double>* v) const {
    this->arith_ops.Export(v);
    this->vectorize.Export(v);
    this->unroll.Export(v);
    this->parallel.Export(v);
    double vs[] = {
        static_cast<double>(is_gpu),  //
        slog(blockIdx_x_len), slog(blockIdx_y_len), slog(blockIdx_z_len),
        slog(threadIdx_x_len), slog(threadIdx_y_len), slog(threadIdx_z_len),
        slog(vthread_len),
    };
    v->insert(v->end(), std::begin(vs), std::end(vs));
  }
};
532 | |
Feature::ArithOps::ArithOps(const BufferStoreNode* store, int64_t prod_loop_extent) {
  // Visitor that counts arithmetic ops in an expression tree. Each visited op
  // contributes `prod_loop_extent_` (the iteration count of the surrounding
  // loop nest) to its counter, so counts reflect total dynamic op executions.
  class ArithOpCounter : public ExprVisitor {
   public:
// Count into `Counter` regardless of the op's dtype.
#define TVM_FEATURE_SIMPLE(Type, Counter)          \
  void VisitExpr_(const Type* op) final {          \
    result_.Counter += this->prod_loop_extent_;    \
    ExprVisitor::VisitExpr_(op);                   \
  }
// Count into the float or int counter depending on the op's dtype.
#define TVM_FEATURE_BINARY(Type, FloatCounter, IntCounter) \
  void VisitExpr_(const Type* op) final {                  \
    if (op->dtype.is_float()) {                            \
      result_.FloatCounter += this->prod_loop_extent_;     \
    } else {                                               \
      result_.IntCounter += this->prod_loop_extent_;       \
    }                                                      \
    ExprVisitor::VisitExpr_(op);                           \
  }
    TVM_FEATURE_SIMPLE(AndNode, bool_op);
    TVM_FEATURE_SIMPLE(OrNode, bool_op);
    TVM_FEATURE_SIMPLE(NotNode, bool_op);
    TVM_FEATURE_SIMPLE(SelectNode, select_op);
    TVM_FEATURE_BINARY(AddNode, float_add_sub, int_add_sub);
    TVM_FEATURE_BINARY(SubNode, float_add_sub, int_add_sub);
    TVM_FEATURE_BINARY(MulNode, float_mul, int_mul);
    TVM_FEATURE_BINARY(DivNode, float_div_mod, int_div_mod);
    TVM_FEATURE_BINARY(ModNode, float_div_mod, int_div_mod);
    TVM_FEATURE_BINARY(FloorDivNode, float_div_mod, int_div_mod);
    TVM_FEATURE_BINARY(FloorModNode, float_div_mod, int_div_mod);
    TVM_FEATURE_BINARY(MaxNode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(MinNode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(EQNode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(NENode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(LTNode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(LENode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(GTNode, float_cmp, int_cmp);
    TVM_FEATURE_BINARY(GENode, float_cmp, int_cmp);
#undef TVM_FEATURE_BINARY
#undef TVM_FEATURE_SIMPLE

    // Calls are classified by purity: pure calls count as math functions,
    // the rest as "other" functions; each split again by dtype.
    void VisitExpr_(const CallNode* op) final {
      static auto op_call_effect_ = Op::GetAttrMap<TCallEffectKind>("TCallEffectKind");
      TCallEffectKind effect_kind = op_call_effect_[Downcast<Op>(op->op)];
      bool is_pure =
          effect_kind == CallEffectKind::kPure || effect_kind == CallEffectKind::kExprAnnotation;
      if (is_pure) {
        if (op->dtype.is_float()) {
          result_.float_math_func += prod_loop_extent_;
        } else {
          result_.int_math_func += prod_loop_extent_;
        }
      } else {
        if (op->dtype.is_float()) {
          result_.float_other_func += prod_loop_extent_;
        } else {
          result_.int_other_func += prod_loop_extent_;
        }
      }
      ExprVisitor::VisitExpr_(op);
    }

    int64_t prod_loop_extent_;  // Must be set by the caller before visiting
    ArithOps result_;           // Accumulated op counts
  };
  // Only the stored value expression is scanned; the store indices are not counted.
  ArithOpCounter counter;
  counter.prod_loop_extent_ = prod_loop_extent;
  counter(store->value);
  *this = counter.result_;
}
601 | |
602 | Feature::ForKindFeature::ForKindFeature(const ForVec& loops) { |
603 | if (loops.empty()) { |
604 | this->num = 0; |
605 | this->prod = 0; |
606 | this->len = 0; |
607 | this->pos = ForKindFeature::Pos::kPosNone; |
608 | } else { |
609 | const int64_t* last_loop_extent = GetLoopIntExtent(loops.back()); |
610 | this->num = loops.size(); |
611 | this->len = last_loop_extent ? *last_loop_extent : 1; |
612 | this->pos = ForKindFeature::Pos::kPosMixed; |
613 | int64_t& prod = this->prod = 1; |
614 | for (const ForNode* loop : loops) { |
615 | if (const int64_t* extent = GetLoopIntExtent(loop)) { |
616 | prod *= *extent; |
617 | } |
618 | } |
619 | } |
620 | } |
621 | |
622 | } // namespace group1 |
623 | |
624 | namespace group2 { |
625 | |
626 | /*! \brief Group 2 features */ |
627 | struct Feature { |
628 | enum class AccessType : int { |
629 | /*! The buffer is read but not written */ |
630 | kRead = 0, |
631 | /*! The buffer is written but not read */ |
632 | kWrite = 1, |
633 | /*! The buffer is both read and written */ |
634 | kReadWrite = 2, |
635 | /*! Unknown type */ |
636 | kUnknownRW = 3, |
637 | }; |
638 | enum class ReuseType : int { |
639 | /*! Buffer reuse because accessed on each iteration of a loop */ |
640 | kLoopMultipleRead = 0, |
641 | /*! Buffer reuse because it is serially accessed */ |
642 | kSerialMultipleReadWrite = 1, |
643 | /*! No buffer reuse */ |
644 | kNoReuse = 2, |
645 | }; |
646 | |
647 | struct SubFeature { |
648 | /*! \brief The buffer this feature is for */ |
649 | const BufferNode* buffer = nullptr; |
650 | /*! \brief The access type of the buffer */ |
651 | AccessType access_type = AccessType::kUnknownRW; |
652 | /*! \brief A list of multi-dimensonal indices used to access the buffer */ |
653 | std::vector<MultiIndex> multi_indices = {}; |
654 | // Access information |
655 | /*! \brief loop_accessed_numel[i][...] means the number of elements accessed by loops[i] */ |
656 | std::vector<std::unordered_map<const BufferNode*, int64_t>> loop_accessed_numel = {}; |
657 | /*! \brief The shape of the data access */ |
658 | IntVec access_shape; |
659 | /*! \brief The bytes that are continuously accessed */ |
660 | int64_t num_continuous_bytes = 1; |
661 | // Stride information |
662 | /*! \brief The min stride of the access */ |
663 | int64_t min_stride = 0; |
664 | /*! \brief The innermost stride */ |
665 | int64_t innermost_stride = 0; |
666 | /*! \brief The product of the non-strided loops */ |
667 | int64_t prod_non_strided_loop_extent = 0; |
668 | // Reuse information |
669 | /*! The type of data reuse */ |
670 | ReuseType reuse_type = ReuseType::kNoReuse; |
671 | /*! The reuse distance in terms of number of iterations */ |
672 | double reuse_dis_iter = 0.0; |
673 | /*! The reuse distance in terms of bytes */ |
674 | double reuse_dis_bytes = 0.0; |
675 | /*! The reuse count */ |
676 | int64_t reuse_ct = 0; |
677 | // Features |
678 | /*! The touched memory in bytes */ |
679 | double bytes; |
680 | /*! The touched unique memory in bytes */ |
681 | double unique_bytes; |
682 | /*! The number of touched cache lines */ |
683 | double lines; |
684 | /*! The number touched unique cache lines */ |
685 | double unique_lines; |
686 | /*! bytes / reuse_ct */ |
687 | double bytes_d_reuse_ct; |
688 | /*! unique_bytes / reuse_ct */ |
689 | double unique_bytes_d_reuse_ct; |
690 | /*! lines / reuse_ct */ |
691 | double lines_d_reuse_ct; |
692 | /*! unique_lines / reuse_ct */ |
693 | double unique_lines_d_reuse_ct; |
694 | /*! The stride in access */ |
695 | double stride; |
696 | |
697 | static constexpr int64_t kCount = 18; |
698 | |
699 | void Export(std::vector<double>* v) const { |
700 | double vs[] = { |
701 | static_cast<double>(static_cast<int>(access_type) == 0), |
702 | static_cast<double>(static_cast<int>(access_type) == 1), |
703 | static_cast<double>(static_cast<int>(access_type) == 2), |
704 | // FeatureSet::BufferAccess::AccessType::kUnknownRW is ignored |
705 | slog(bytes), |
706 | slog(unique_bytes), |
707 | slog(lines), |
708 | slog(unique_lines), |
709 | static_cast<double>(static_cast<int>(reuse_type) == 0), |
710 | static_cast<double>(static_cast<int>(reuse_type) == 1), |
711 | static_cast<double>(static_cast<int>(reuse_type) == 2), |
712 | slog(reuse_dis_iter), |
713 | slog(reuse_dis_bytes), |
714 | slog(reuse_ct), |
715 | slog(bytes_d_reuse_ct), |
716 | slog(unique_bytes_d_reuse_ct), |
717 | slog(lines_d_reuse_ct), |
718 | slog(unique_lines_d_reuse_ct), |
719 | slog(stride), |
720 | }; |
721 | v->insert(v->end(), std::begin(vs), std::end(vs)); |
722 | } |
723 | |
724 | static void Pad(std::vector<double>* v) { v->insert(v->end(), 18, 0.0); } |
725 | |
726 | void SetStride(const LoopNest& loop_nest, arith::Analyzer* analyzer); |
727 | |
728 | void SetReuse(const LoopNest& loop_nest, // |
729 | int64_t top_loop_touch_bytes, // |
730 | const ForBufferMap<IntVec>& buffer_touched_under_loop); |
731 | |
732 | void SetFeature(const LoopNest& loop_nest, int64_t cache_line_bytes); |
733 | |
734 | explicit SubFeature(const BufferNode* buffer, AccessType access_type, |
735 | std::vector<MultiIndex> multi_indices, int n_loops) |
736 | : buffer(buffer), |
737 | access_type(access_type), |
738 | multi_indices(multi_indices), |
739 | loop_accessed_numel(n_loops) {} |
740 | }; |
741 | |
742 | void Export(std::vector<double>* v, int buffers_per_store) const { |
743 | int n = sub_features.size(); |
744 | for (int i = 0; i < buffers_per_store; ++i) { |
745 | if (i < n) { |
746 | sub_features[i].Export(v); |
747 | } else { |
748 | SubFeature::Pad(v); |
749 | } |
750 | } |
751 | } |
752 | |
753 | explicit Feature(const BufferStoreNode* store, const LoopNest& loop_nest, |
754 | int64_t cache_line_bytes, IntVec* for_touched_bytes, |
755 | ForBufferMap<IntVec>* buffer_touched_under_loop, arith::Analyzer* analyzer); |
756 | |
757 | void Init(const BufferStoreNode* store, int n_loops); |
758 | |
759 | void SetRegion(const LoopNest& loop_nest, // |
760 | IntVec* for_touched_bytes, // |
761 | ForBufferMap<IntVec>* buffer_touched_under_loop, // |
762 | arith::Analyzer* analyzer); |
763 | |
764 | std::vector<SubFeature> sub_features; |
765 | }; |
766 | |
767 | void Feature::Init(const BufferStoreNode* store, int n_loops) { |
768 | struct Info { |
769 | AccessType access_type = AccessType::kUnknownRW; |
770 | std::vector<MultiIndex> multi_indices; |
771 | }; |
772 | std::unordered_map<const BufferNode*, Info> buffer_info; |
773 | { |
774 | Info& info = buffer_info[store->buffer.get()]; |
775 | info.access_type = AccessType::kWrite; |
776 | info.multi_indices.push_back({store->indices.begin(), store->indices.end()}); |
777 | } |
778 | PostOrderVisit(store->value, [&buffer_info](const ObjectRef& obj) -> void { |
779 | if (const BufferLoadNode* load = obj.as<BufferLoadNode>()) { |
780 | const BufferNode* buffer = load->buffer.get(); |
781 | Info& info = buffer_info[buffer]; |
782 | switch (info.access_type) { |
783 | case AccessType::kRead: |
784 | break; |
785 | case AccessType::kWrite: |
786 | info.access_type = AccessType::kReadWrite; |
787 | break; |
788 | case AccessType::kReadWrite: |
789 | break; |
790 | case AccessType::kUnknownRW: |
791 | default: |
792 | info.access_type = AccessType::kRead; |
793 | break; |
794 | } |
795 | if (info.access_type != AccessType::kReadWrite) { |
796 | info.multi_indices.push_back({load->indices.begin(), load->indices.end()}); |
797 | } |
798 | } |
799 | }); |
800 | this->sub_features.reserve(buffer_info.size()); |
801 | for (const auto& kv : buffer_info) { |
802 | this->sub_features.emplace_back(kv.first, kv.second.access_type, |
803 | std::move(kv.second.multi_indices), n_loops); |
804 | } |
805 | } |
806 | |
/*!
 * \brief Compute, for each loop level, the region each buffer touches.
 *
 * Loop variables are first pinned to their loop minimum; then, from the
 * innermost loop outwards, each loop variable is re-bound to its full range so
 * that the relaxed access region grows level by level. The binding order
 * matters: at step `i`, loops inside `i` are bound to ranges and loops outside
 * `i` are still bound to constants.
 *
 * \param loop_nest The loop nest the store resides in.
 * \param for_touched_bytes Output: total bytes touched under each loop level.
 * \param buffer_touched_under_loop Output: per (loop, buffer), the numbers of
 *  elements touched (one entry appended per sub-feature pass).
 * \param analyzer The arithmetic analyzer holding the variable bindings.
 */
void Feature::SetRegion(const LoopNest& loop_nest, IntVec* for_touched_bytes,
                        ForBufferMap<IntVec>* buffer_touched_under_loop,
                        arith::Analyzer* analyzer) {
  int n_loops = loop_nest.loops.size();
  const std::vector<const ForNode*>& loops = loop_nest.loops;
  // Step 1. Initialize and bind all the loop variables to a constant
  *for_touched_bytes = IntVec(n_loops, 0);
  for (int i = 0; i < n_loops; ++i) {
    const ForNode* loop = loops[i];
    analyzer->Bind(loop->loop_var, loop->min, /*allow_override=*/true);
  }
  // Step 2. Corner case: no loops
  if (n_loops == 0) {
    // In this case, the `access_shape` is not calculated
    for (SubFeature& feature : sub_features) {
      // A store with no loops touches a single element per dimension
      feature.access_shape = IntVec(feature.buffer->shape.size(), 1);
    }
    return;
  }
  // Step 3. Gradually bind the loops from inner to outer,
  // calculate the area the loops touch on each buffer
  for (int i = n_loops - 1; i >= 0; --i) {
    const ForNode* loop = loops[i];
    analyzer->Bind(loop->loop_var, Range::FromMinExtent(loop->min, loop->extent),
                   /*allow_override=*/true);
    int64_t& touched_bytes = (*for_touched_bytes)[i] = 0;
    for (SubFeature& feature : sub_features) {
      const BufferNode* buffer = feature.buffer;
      // Note: `feature.access_shape` for `i == 0` is the only one preserved,
      // while others are discarded
      int64_t numel;
      feature.access_shape = utils::RelaxAndUnion(feature.multi_indices, &numel, analyzer);
      numel = std::max<int64_t>(0, numel);
      feature.loop_accessed_numel[i][buffer] = numel;
      touched_bytes += numel * buffer->dtype.bytes();
      (*buffer_touched_under_loop)[loop][buffer].push_back(numel);
    }
  }
}
846 | |
/*!
 * \brief Compute the stride-related quantities of this sub-feature:
 *  `num_continuous_bytes`, `min_stride`, `innermost_stride` and
 *  `prod_non_strided_loop_extent`.
 * \param loop_nest The loop nest the store resides in.
 * \param analyzer The arithmetic analyzer used to evaluate the buffer shape.
 */
void Feature::SubFeature::SetStride(const LoopNest& loop_nest, arith::Analyzer* analyzer) {
  int n_loops = loop_nest.loops.size();
  const std::vector<const ForNode*>& loops = loop_nest.loops;
  // For each buffer, we find the loop stride on it
  const BufferNode* buffer = this->buffer;
  int ndim = this->buffer->shape.size();
  IntVec buffer_shape = utils::GetBufferShape(GetRef<Buffer>(buffer), analyzer);
  // Calculate the buffer's stride from its shape
  // Row-major layout: the innermost dimension has stride 1
  IntVec buffer_stride(ndim);
  if (ndim >= 1) {
    buffer_stride[ndim - 1] = 1;
    for (int i = ndim - 2; i >= 0; --i) {
      buffer_stride[i] = buffer_stride[i + 1] * buffer_shape[i + 1];
    }
  }
  // Calculate `num_continuous_bytes`
  {
    int64_t& num_continuous_bytes = this->num_continuous_bytes = 1;
    const IntVec& access_shape = this->access_shape;
    ICHECK_EQ(access_shape.size(), buffer_shape.size());
    // Scan from the innermost dimension; the first dimension whose access
    // covers the full extent determines the contiguous span in bytes
    for (int i = ndim - 1; i >= 0; --i) {
      if (access_shape[i] == buffer_shape[i]) {
        num_continuous_bytes = buffer_shape[i] * buffer->dtype.bytes();
        break;
      }
    }
  }
  // Enumerate loops from inner to outer
  // `i` is declared outside the loop on purpose: its final value (the first
  // loop, from the inside, with a non-zero stride) is used below
  int i = 0;
  // Calculate this->min_stride
  int64_t& stride = this->min_stride = 0;
  for (i = n_loops - 1; i >= 0; --i) {
    stride = utils::GetVarStride(this->multi_indices, buffer_stride, loops[i]->loop_var);
    if (stride != 0) {
      break;
    }
  }
  // Calculate this->innermost_stride
  // Non-zero only if the innermost loop itself is the strided one
  this->innermost_stride = (i == n_loops - 1) ? stride : 0;
  // Calculate this->prod
  // Product of the extents of the (inner) loops that do not affect the stride
  int64_t& prod = this->prod_non_strided_loop_extent = 1;
  for (int j = n_loops - 1; j > i; --j) {
    if (const int64_t* extent = GetLoopIntExtent(loops[j])) {
      prod *= *extent;
    }
  }
}
894 | |
/*!
 * \brief Detect data reuse for this buffer and fill `reuse_type`,
 *  `reuse_dis_iter`, `reuse_dis_bytes` and `reuse_ct`.
 * \param loop_nest The loop nest the store resides in.
 * \param top_loop_touch_bytes Bytes touched by one iteration of the innermost loop.
 * \param buffer_touched_under_loop Per (loop, buffer) element-touch counts
 *  computed by SetRegion.
 */
void Feature::SubFeature::SetReuse(const LoopNest& loop_nest, int64_t top_loop_touch_bytes,
                                   const ForBufferMap<IntVec>& buffer_touched_under_loop) {
  const BufferNode* buffer = this->buffer;
  // Step 3.1. Collect all `Var`s that appears in the buffer region
  std::unordered_set<const VarNode*> region_vars;
  for (const MultiIndex& multi_index : this->multi_indices) {
    for (const PrimExpr& index : multi_index) {
      PostOrderVisit(index, [&region_vars](const ObjectRef& obj) -> void {
        if (const auto* var = obj.as<VarNode>()) {
          region_vars.insert(var);
        }
      });
    }
  }
  // Default case: no reuse
  ReuseType& reuse_type = this->reuse_type = ReuseType::kNoReuse;
  double& reuse_dis_iter = this->reuse_dis_iter = 0;
  double& reuse_dis_bytes = this->reuse_dis_bytes = 0;
  int64_t& reuse_ct = this->reuse_ct = 0;

  // Step 3.2. Enumerate loops from inner to outer, find the first loop with reuse
  int n_loops = loop_nest.loops.size();
  const std::vector<const ForNode*>& loops = loop_nest.loops;
  for (int i = n_loops - 1; i >= 0; --i) {
    const ForNode* loop = loops[i];
    // Case 1. Find an invariant loop, i.e. reuse with kLoopMultipleRead
    // The region does not depend on this loop var, so every iteration re-reads
    // the same region
    if (!region_vars.count(loop->loop_var.get())) {
      reuse_type = ReuseType::kLoopMultipleRead;
      if (const int64_t* extent = GetLoopIntExtent(loop)) {
        reuse_ct = *extent;
      } else {
        reuse_ct = 1;
      }
      // Reuse distance in iterations: product of the extents of the loops
      // inside the invariant loop
      reuse_dis_iter = 1;
      for (int j = n_loops - 1; j > i; --j) {
        if (const int64_t* extent = GetLoopIntExtent(loops[j])) {
          reuse_dis_iter *= *extent;
        }
      }
      reuse_dis_bytes = 0.0;
      if (i == n_loops - 1) {
        reuse_dis_bytes = top_loop_touch_bytes;
      } else {
        // Bytes touched by all buffers under the loop immediately inside `loop`
        for (const auto& iter : buffer_touched_under_loop.at(loops[i + 1])) {
          // Intentionally shadows the outer `buffer`: this sums over ALL buffers
          const BufferNode* buffer = iter.first;
          const IntVec& numels = iter.second;
          int64_t numel = std::accumulate(numels.begin(), numels.end(), int64_t(0));
          reuse_dis_bytes += numel * buffer->dtype.bytes();
        }
      }
      break;
    }
    // Case 2. Find serial reuse, i.e. reuse with kSerialMultipleReadWrite
    // Multiple accesses to this buffer recorded under the same loop
    const IntVec& touched = buffer_touched_under_loop.at(loop).at(buffer);
    if (touched.size() >= 2) {
      int64_t extent = 1;
      if (const int64_t* ext = GetLoopIntExtent(loop)) {
        extent = *ext;
      }
      reuse_type = ReuseType::kSerialMultipleReadWrite;
      reuse_ct = touched.size() - 1;
      reuse_dis_iter = *std::min_element(touched.begin(), touched.end());
      reuse_dis_bytes = 0.0;
      for (const auto& iter : buffer_touched_under_loop.at(loop)) {
        // Intentionally shadows the outer `buffer`: this sums over ALL buffers
        const BufferNode* buffer = iter.first;
        const IntVec& numels = iter.second;
        int64_t numel = std::accumulate(numels.begin(), numels.end(), int64_t(0));
        reuse_dis_bytes += numel * buffer->dtype.bytes();
      }
      // Normalize by the loop extent to get per-iteration distances
      reuse_dis_iter /= extent;
      reuse_dis_bytes /= extent;
      break;
    }
  }
}
970 | |
/*!
 * \brief Derive the exported per-buffer features from the quantities computed
 *  by SetRegion / SetStride / SetReuse.
 * \param loop_nest The loop nest the store resides in.
 * \param cache_line_bytes The number of bytes per cache line.
 */
void Feature::SubFeature::SetFeature(const LoopNest& loop_nest, int64_t cache_line_bytes) {
  int64_t dtype_bytes = this->buffer->dtype.bytes();
  this->stride = this->innermost_stride;
  // Total bytes accessed: one element access per iteration of the whole nest
  this->bytes = dtype_bytes * loop_nest.prod;
  if (loop_nest.loops.empty()) {
    // No loops: a single access
    this->unique_bytes = 1;
    this->lines = 1;
    this->unique_lines = 1;
  } else {
    // Unique bytes: elements touched at the outermost loop level
    this->unique_bytes =
        static_cast<double>(this->loop_accessed_numel.front().at(buffer)) * dtype_bytes;
    // Cache lines fetched: strided iterations times the fraction of a cache
    // line each access advances (capped at a full line), at least 1
    this->lines = static_cast<double>(loop_nest.prod) / this->prod_non_strided_loop_extent *
                  std::min(1.0, 1.0 * this->min_stride * dtype_bytes / cache_line_bytes);
    this->lines = std::max(1.0, this->lines);
    this->unique_lines = static_cast<double>(this->unique_bytes) /
                         std::min(cache_line_bytes, this->num_continuous_bytes);
    this->unique_lines = std::max(1.0, this->unique_lines);
  }
  // reuse_ct == 0 means no reuse; 0.5 keeps the divisions finite while
  // penalizing the no-reuse case
  double proxy_reuse_ct = this->reuse_ct > 0 ? this->reuse_ct : 0.5;
  this->bytes_d_reuse_ct = this->bytes / proxy_reuse_ct;
  this->unique_bytes_d_reuse_ct = this->unique_bytes / proxy_reuse_ct;
  this->lines_d_reuse_ct = this->lines / proxy_reuse_ct;
  this->unique_lines_d_reuse_ct = this->unique_lines / proxy_reuse_ct;
}
995 | |
996 | Feature::Feature(const BufferStoreNode* store, const LoopNest& loop_nest, int64_t cache_line_bytes, |
997 | IntVec* for_touched_bytes, ForBufferMap<IntVec>* buffer_touched_under_loop, |
998 | arith::Analyzer* analyzer) { |
999 | int n_loops = loop_nest.loops.size(); |
1000 | // Step 0. Initialize data structures |
1001 | this->Init(store, n_loops); |
1002 | // Step 1. Calculate region-related feature |
1003 | this->SetRegion(loop_nest, for_touched_bytes, buffer_touched_under_loop, analyzer); |
1004 | // Step 2. Calculate stride-related feature |
1005 | for (auto& feature : sub_features) { |
1006 | feature.SetStride(loop_nest, analyzer); |
1007 | } |
1008 | // Step 3. Calculate reuse-related feature |
1009 | int64_t top_loop_touch_bytes = 0.0; |
1010 | if (n_loops > 0) { |
1011 | for (const SubFeature& feature : sub_features) { |
1012 | int64_t bytes = feature.buffer->dtype.bytes(); |
1013 | int64_t n_buffer = feature.loop_accessed_numel[0].size(); |
1014 | top_loop_touch_bytes += bytes * n_buffer; |
1015 | } |
1016 | } |
1017 | for (auto& feature : sub_features) { |
1018 | feature.SetReuse(loop_nest, top_loop_touch_bytes, *buffer_touched_under_loop); |
1019 | } |
1020 | // Step 4. Calculate rest of the features |
1021 | for (auto& feature : sub_features) { |
1022 | feature.SetFeature(loop_nest, cache_line_bytes); |
1023 | } |
1024 | // Step 5. Sort the features |
1025 | std::sort(sub_features.begin(), sub_features.end(), [](const SubFeature& a, const SubFeature& b) { |
1026 | if (a.lines != b.lines) { |
1027 | return a.lines > b.lines; |
1028 | } |
1029 | if (a.bytes != b.bytes) { |
1030 | return a.bytes > b.bytes; |
1031 | } |
1032 | return a.buffer->name < b.buffer->name; |
1033 | }); |
1034 | } |
1035 | |
1036 | } // namespace group2 |
1037 | |
1038 | namespace group3 { |
1039 | |
/*! \brief Group 3 feature */
struct Feature {
  /*!
   * \brief See the wiki page [1] for details
   *
   * Arithmetic intensity is FLOPs/unique bytes of memory touched. A value is computed
   * for each set of loop nests starting with just the innermost loop and
   * reaching to include all loops. There are a variable number of loops, so
   * n_samples are taken from the curve of arithmetic intensity vs flops. This
   * biases the values towards larger loops.
   *
   * Note that the denominator is unique bytes of memory touched. Repeated
   * access to the same byte of memory counts as only a single byte touched.
   *
   * Values are scaled by log2(x + 1).
   *
   * [1] https://en.wikipedia.org/wiki/Roofline_model
   */
  std::vector<double> arith_intensity_curve;

  /*! \brief Append the sampled curve to the feature vector `v`. */
  void Export(std::vector<double>* v) const {
    v->insert(v->end(), arith_intensity_curve.begin(), arith_intensity_curve.end());
  }

  /*!
   * \brief Sample `n_samples` points of the arithmetic-intensity curve.
   * \param n_samples The number of samples to take.
   * \param loop_nest The loop nest of the store being featurized.
   * \param for_touched_bytes Bytes touched under each loop (outermost first).
   * \param arith_ops The floating-point op counters from the group-1 feature.
   */
  explicit Feature(int n_samples, const LoopNest& loop_nest, const IntVec& for_touched_bytes,
                   const group1::Feature::ArithOps& arith_ops)
      : arith_intensity_curve(n_samples, 0.0) {
    const std::vector<const ForNode*>& loops = loop_nest.loops;
    ICHECK_EQ(loops.size(), for_touched_bytes.size());
    int n_loops = loops.size();
    // Calculate `memory_bytes`
    // Reversed so that index 0 is the innermost loop, matching the
    // inner-to-outer order in which `compute_ops` is filled below
    std::vector<double> memory_bytes;
    memory_bytes.resize(n_loops);
    for (int i = 0; i < n_loops; ++i) {
      memory_bytes[n_loops - 1 - i] = for_touched_bytes[i];
    }
    // Calculate `compute_ops` and `cur_compute_ops`
    std::vector<double> compute_ops;
    // FLOPs of a single innermost iteration: total ops divided by the product
    // of all loop extents ...
    double total_compute_ops = arith_ops.float_mad + arith_ops.float_add_sub + arith_ops.float_mul +
                               arith_ops.float_div_mod + arith_ops.float_cmp +
                               arith_ops.float_math_func + arith_ops.float_other_func;
    total_compute_ops /= loop_nest.prod;
    // ... then re-multiplied extent by extent from inner to outer, recording
    // the cumulative FLOPs at each loop level
    for (int i = n_loops - 1; i >= 0; --i) {
      if (const int64_t* extent = GetLoopIntExtent(loops[i])) {
        total_compute_ops *= *extent;
      }
      compute_ops.push_back(total_compute_ops);
    }
    // Fill the feature set
    if (total_compute_ops <= 0 || compute_ops.empty()) {
      for (int i = 0; i < n_samples; ++i) {
        arith_intensity_curve[i] = 0.0;
      }
      return;
    }
    total_compute_ops = compute_ops.back();
    int p = 0;
    for (int i = 0; i < n_samples; ++i) {
      double& result = arith_intensity_curve[i];
      double cur_compute_ops = static_cast<double>(i + 1) / n_samples * total_compute_ops;
      // Find the first `p` that `compute[p] >= total * (i + 1) / N`
      // (`p` is monotone across samples, so the scan resumes where it stopped)
      for (; p < n_loops; ++p) {
        if (compute_ops[p] >= cur_compute_ops - 1e-4) {
          break;
        }
      }
      CHECK_LT(p, n_loops);
      if (p == 0) {
        result = slog(compute_ops[p] / memory_bytes[p]);
      } else {
        // Linearly interpolate the intensity between loop levels p-1 and p
        double base = compute_ops[p - 1] / memory_bytes[p - 1];
        double slope =
            (compute_ops[p] / memory_bytes[p] - compute_ops[p - 1] / memory_bytes[p - 1]) /
            (compute_ops[p] - compute_ops[p - 1]);
        result = slog(base + slope * (cur_compute_ops - compute_ops[p - 1]));
      }
    }
  }
};
1119 | |
1120 | } // namespace group3 |
1121 | |
1122 | namespace group4 { |
1123 | |
1124 | /*! \brief Group 4 feature */ |
1125 | struct Feature { |
1126 | int64_t alloc_size = 0; // The size of allocated buffer in bytes |
1127 | int64_t alloc_prod = 0; // alloc_outer_prod * alloc_inner_prod |
1128 | int64_t alloc_outer_prod = 1; // The product of lengths of loops outside the scope of the alloc |
1129 | |
1130 | static constexpr int64_t kCount = 4; |
1131 | |
1132 | void Export(std::vector<double>* v, int64_t outer_prod) const { |
1133 | double vs[] = { |
1134 | slog(alloc_size), |
1135 | slog(alloc_prod), |
1136 | slog(alloc_outer_prod), |
1137 | slog(static_cast<double>(outer_prod) / alloc_outer_prod), |
1138 | }; |
1139 | v->insert(v->end(), std::begin(vs), std::end(vs)); |
1140 | } |
1141 | |
1142 | Feature() = default; |
1143 | |
1144 | explicit Feature(const LoopNest& loop_nest, const Buffer& buffer, arith::Analyzer* analyzer) { |
1145 | std::vector<int64_t> shape = utils::GetBufferShape(buffer, analyzer); |
1146 | int64_t numel = 1; |
1147 | for (int64_t x : shape) { |
1148 | numel *= x; |
1149 | } |
1150 | alloc_size = numel * buffer->dtype.bytes(); |
1151 | alloc_prod = numel * loop_nest.prod; |
1152 | alloc_outer_prod = loop_nest.prod; |
1153 | } |
1154 | }; |
1155 | |
1156 | } // namespace group4 |
1157 | |
1158 | namespace group5 { |
1159 | |
1160 | /*! \brief Group 5 feature */ |
1161 | struct Feature { |
1162 | int64_t outer_prod; // The product of lengths of outer loops |
1163 | int num_loops; // The number of outer loops |
1164 | int auto_unroll_max_step; // The value of pragma "auto_unroll_max_step" |
1165 | |
1166 | static constexpr int64_t kCount = 3; |
1167 | |
1168 | void Export(std::vector<double>* v) const { |
1169 | double vs[] = { |
1170 | slog(outer_prod), |
1171 | slog(num_loops), |
1172 | slog(auto_unroll_max_step), |
1173 | }; |
1174 | v->insert(v->end(), std::begin(vs), std::end(vs)); |
1175 | } |
1176 | |
1177 | explicit Feature(const LoopNest& loop_nest) { |
1178 | this->outer_prod = loop_nest.prod; |
1179 | this->num_loops = loop_nest.loops.size(); |
1180 | this->auto_unroll_max_step = loop_nest.auto_unroll.empty() ? 0 : loop_nest.auto_unroll.back(); |
1181 | } |
1182 | }; |
1183 | |
1184 | } // namespace group5 |
1185 | |
1186 | namespace group6 { |
1187 | |
1188 | /*! \brief The auxiliary feature extractor for workloads */ |
1189 | class : private StmtVisitor { |
1190 | public: |
1191 | static std::vector<double> (const IRModule& mod) { |
1192 | WorkloadEmbeddingExtractor self; |
1193 | for (const auto& kv : mod->functions) { |
1194 | if (const PrimFuncNode* func = kv.second.as<PrimFuncNode>()) { |
1195 | self(func->body); |
1196 | } |
1197 | } |
1198 | return self.embedding; |
1199 | } |
1200 | |
1201 | private: |
1202 | void (const BlockNode* block) final { |
1203 | StmtVisitor::VisitStmt_(block); |
1204 | std::string name = block->name_hint; |
1205 | std::for_each(name.begin(), name.end(), [](char& c) { c = ::tolower(c); }); |
1206 | if (name.find("softmax" ) != std::string::npos) { |
1207 | embedding[0] = 1.0; |
1208 | } else if ((name.find("max" ) != std::string::npos) || (name.find("min" ) != std::string::npos)) { |
1209 | embedding[1] = 1.0; |
1210 | } else if (name.find("add" ) != std::string::npos) { |
1211 | embedding[2] = 1.0; |
1212 | } else if (name.find("batch_matmul" ) != std::string::npos) { |
1213 | embedding[3] = 1.0; |
1214 | } else if (name.find("matmul" ) != std::string::npos) { |
1215 | embedding[4] = 1.0; |
1216 | } else if (name.find("depthwiseconv2d" ) != std::string::npos) { |
1217 | embedding[5] = 1.0; |
1218 | } else if (name.find("conv2d_winograd" ) != std::string::npos) { |
1219 | embedding[6] = 1.0; |
1220 | } else if (name.find("conv2d" ) != std::string::npos) { |
1221 | embedding[7] = 1.0; |
1222 | } |
1223 | } |
1224 | |
1225 | std::vector<double> = std::vector<double>(8, 0.0); |
1226 | }; |
1227 | |
1228 | /*! \brief Group 6 feature */ |
1229 | struct Feature { |
1230 | explicit Feature(const IRModule& mod) { |
1231 | this->feature = WorkloadEmbeddingExtractor::Extract(mod); |
1232 | } |
1233 | |
1234 | void Export(std::vector<double>* v) const { |
1235 | v->insert(v->end(), std::begin(feature), std::end(feature)); |
1236 | } |
1237 | |
1238 | std::vector<double> feature; // The workload embedding |
1239 | static constexpr int64_t kCount = 8; |
1240 | }; |
1241 | |
1242 | } // namespace group6 |
1243 | |
/*! \brief The feature extracted */
struct Feature {
  // The buffer being stored to; nullptr until a BufferStore to it is visited
  const BufferNode* buffer = nullptr;
  // Order of first appearance, used for a deterministic final ordering
  int buffer_order = -1;
  // Per-group features; filled in lazily as the collector visits statements
  std::unique_ptr<group1::Feature> group1 = nullptr;
  std::unique_ptr<group2::Feature> group2 = nullptr;
  std::unique_ptr<group3::Feature> group3 = nullptr;
  std::unique_ptr<group4::Feature> group4 = nullptr;
  std::unique_ptr<group5::Feature> group5 = nullptr;
  // NOTE(review): shared_ptr (unlike the other groups) — presumably because the
  // workload embedding is shared across features; confirm against callers
  std::shared_ptr<group6::Feature> group6 = nullptr;

  /*! \brief Features sort by the order their buffers first appeared. */
  bool operator<(const Feature& other) const { return buffer_order < other.buffer_order; }
};
1257 | |
/*! \brief The main feature extractor */
class PerStoreFeatureCollector : private StmtVisitor {
 public:
  /*!
   * \brief Collect the per-store features of every PrimFunc in a module.
   * \param is_gpu Whether the target is a GPU.
   * \param cache_line_bytes The number of bytes per cache line.
   * \param arith_intensity_curve_num_samples Number of samples for the group-3 curve.
   * \param mod The module to featurize.
   * \return The features, one per stored-to buffer, sorted by first appearance.
   */
  static std::vector<Feature> Collect(bool is_gpu, int64_t cache_line_bytes,
                                      int64_t arith_intensity_curve_num_samples,
                                      const IRModule& mod) {
    PerStoreFeatureCollector collector(is_gpu, cache_line_bytes, arith_intensity_curve_num_samples);
    for (const auto& kv : mod->functions) {
      if (const PrimFuncNode* func = kv.second.as<PrimFuncNode>()) {
        collector(func->body);
        // Buffers in the function's buffer_map are allocations too
        for (const auto& it : func->buffer_map) {
          collector.HandleBufferAlloc(it.second);
        }
      }
    }
    std::vector<Feature> result;
    result.reserve(collector.buffer_features_.size());
    for (auto& it : collector.buffer_features_) {
      Feature& feature = it.second;
      // Entries with buffer == nullptr were only allocated, never stored to;
      // they carry no per-store feature and are dropped
      if (feature.buffer != nullptr) {
        ICHECK(feature.group1);
        ICHECK(feature.group2);
        ICHECK(feature.group3);
        ICHECK(feature.group5);
        // A stored-to buffer whose allocation was not seen gets a default group4
        if (feature.group4 == nullptr) {
          feature.group4 = std::make_unique<group4::Feature>();
        }
        result.push_back(std::move(feature));
      }
    }
    std::sort(result.begin(), result.end());
    return result;
  }

 private:
  // Maintain the loop-nest context while recursing through nested loops
  void VisitStmt_(const ForNode* loop) final {
    int64_t auto_unroll;
    ForVec* for_vec = loop_nest_.Push(loop, &auto_unroll);
    StmtVisitor::VisitStmt_(loop);
    loop_nest_.Pop(loop, for_vec, auto_unroll);
  }

  void VisitStmt_(const BufferStoreNode* store) final {
    // Stores of constants carry no compute; skip them
    if (store->value->IsInstance<IntImmNode>() || store->value->IsInstance<FloatImmNode>()) {
      return;
    }
    const BufferNode* buffer = store->buffer.get();
    Feature& feature = buffer_features_[buffer];
    if (feature.buffer == nullptr) {
      feature.buffer = buffer;
      feature.buffer_order = buffer_features_.size();
    }
    // Note: a later store to the same buffer overwrites groups 1/2/3/5
    feature.group1 = std::make_unique<group1::Feature>(store, loop_nest_, is_gpu_);
    feature.group2 =
        std::make_unique<group2::Feature>(store, loop_nest_, cache_line_bytes_, &for_touched_bytes_,
                                          &buffer_touched_under_loop_, &analyzer_);
    feature.group3 =
        std::make_unique<group3::Feature>(arith_intensity_curve_num_samples_, loop_nest_,
                                          for_touched_bytes_, feature.group1->arith_ops);
    feature.group5 = std::make_unique<group5::Feature>(loop_nest_);
  }

  void VisitStmt_(const BlockNode* block) final {
    StmtVisitor::VisitStmt_(block);
    for (const Buffer& buffer : block->alloc_buffers) {
      HandleBufferAlloc(buffer);
    }
  }

  // Record the allocation (group-4) feature of a buffer at the current nest
  void HandleBufferAlloc(const Buffer& buffer) {
    Feature& feature = buffer_features_[buffer.get()];
    feature.group4 = std::make_unique<group4::Feature>(loop_nest_, buffer, &analyzer_);
  }

  explicit PerStoreFeatureCollector(bool is_gpu, int64_t cache_line_bytes,
                                    int64_t arith_intensity_curve_num_samples)
      : is_gpu_(is_gpu),
        cache_line_bytes_(cache_line_bytes),
        arith_intensity_curve_num_samples_(arith_intensity_curve_num_samples) {}

  bool is_gpu_;                                   // Whether the target is a GPU
  int64_t cache_line_bytes_;                      // Bytes per cache line
  int64_t arith_intensity_curve_num_samples_;     // Samples for the group-3 curve
  arith::Analyzer analyzer_;                      // Shared arithmetic analyzer
  LoopNest loop_nest_ = {};                       // Current loop-nest context
  IntVec for_touched_bytes_ = {};                 // Scratch: bytes touched per loop
  ForBufferMap<IntVec> buffer_touched_under_loop_ = {};  // Scratch: per (loop, buffer) touches
  std::unordered_map<const BufferNode*, Feature> buffer_features_ = {};  // Result accumulator
};
1347 | |
1348 | } // namespace tir |
1349 | } // namespace tvm |
1350 | |
1351 | namespace tvm { |
1352 | namespace meta_schedule { |
1353 | |
1354 | class PerStoreFeatureNode : public FeatureExtractorNode { |
1355 | public: |
1356 | int buffers_per_store; |
1357 | int arith_intensity_curve_num_samples; |
1358 | int cache_line_bytes; |
1359 | bool ; |
1360 | int feature_vector_length; |
1361 | |
1362 | void VisitAttrs(tvm::AttrVisitor* v) { |
1363 | v->Visit("buffers_per_store" , &buffers_per_store); |
1364 | v->Visit("arith_intensity_curve_num_samples" , &arith_intensity_curve_num_samples); |
1365 | v->Visit("cache_line_bytes" , &cache_line_bytes); |
1366 | v->Visit("feature_vector_length" , &feature_vector_length); |
1367 | } |
1368 | |
1369 | void (IRModule mod, bool is_gpu, std::vector<std::vector<double>>* results) { |
1370 | static transform::Sequential passes = tir::transform::PassListForPerStoreFeature(); |
1371 | mod = passes(std::move(mod)); |
1372 | std::vector<tir::Feature> features = tir::PerStoreFeatureCollector::Collect( |
1373 | is_gpu, this->cache_line_bytes, this->arith_intensity_curve_num_samples, mod); |
1374 | int n_features = features.size(); |
1375 | results->resize(n_features); |
1376 | for (int i = 0; i < n_features; ++i) { |
1377 | const tir::Feature& feature = features[i]; |
1378 | std::vector<double>& result = (*results)[i]; |
1379 | result.reserve(feature_vector_length); |
1380 | feature.group1->Export(&result); |
1381 | feature.group2->Export(&result, this->buffers_per_store); |
1382 | feature.group3->Export(&result); |
1383 | feature.group4->Export(&result, feature.group5->outer_prod); |
1384 | feature.group5->Export(&result); |
1385 | } |
1386 | } |
1387 | |
1388 | Array<runtime::NDArray> ExtractFrom(const TuneContext& tune_context, |
1389 | const Array<MeasureCandidate>& candidates) { |
1390 | bool is_gpu = tune_context->target.value()->kind->name == "cuda" ; |
1391 | std::vector<runtime::NDArray> results; |
1392 | results.resize(candidates.size()); |
1393 | std::unique_ptr<tir::group6::Feature> feature_group6 = nullptr; |
1394 | if (extract_workload) { |
1395 | feature_group6 = std::make_unique<tir::group6::Feature>(tune_context->mod.value()); |
1396 | } |
1397 | auto f = [this, is_gpu, &feature_group6, &candidates, &results](int, int task_id) -> void { |
1398 | const auto& candidate = candidates[task_id]; |
1399 | std::vector<std::vector<double>> features; |
1400 | ExtractSingle(DeepCopyIRModule(candidate->sch->mod()), is_gpu, &features); |
1401 | if (extract_workload) { |
1402 | for (auto& feature : features) { |
1403 | feature_group6->Export(&feature); |
1404 | } |
1405 | } |
1406 | results[task_id] = tir::utils::AsNDArray(features); |
1407 | }; |
1408 | support::parallel_for_dynamic(0, candidates.size(), tune_context->num_threads, f); |
1409 | return results; |
1410 | } |
1411 | |
1412 | static constexpr const char* _type_key = "meta_schedule.PerStoreFeature" ; |
1413 | TVM_DECLARE_FINAL_OBJECT_INFO(PerStoreFeatureNode, FeatureExtractorNode); |
1414 | }; |
1415 | |
1416 | FeatureExtractor FeatureExtractor::(int buffers_per_store, |
1417 | int arith_intensity_curve_num_samples, |
1418 | int cache_line_bytes, bool ) { |
1419 | ObjectPtr<PerStoreFeatureNode> n = make_object<PerStoreFeatureNode>(); |
1420 | n->buffers_per_store = buffers_per_store; |
1421 | n->arith_intensity_curve_num_samples = arith_intensity_curve_num_samples; |
1422 | n->cache_line_bytes = cache_line_bytes; |
1423 | n->extract_workload = extract_workload; |
1424 | n->feature_vector_length = tir::group1::Feature::kCount + // |
1425 | tir::group2::Feature::SubFeature::kCount * buffers_per_store + // |
1426 | arith_intensity_curve_num_samples + // |
1427 | tir::group4::Feature::kCount + // |
1428 | tir::group5::Feature::kCount; |
1429 | if (extract_workload) { |
1430 | n->feature_vector_length += tir::group6::Feature::kCount; |
1431 | } |
1432 | return FeatureExtractor(n); |
1433 | } |
1434 | |
TVM_REGISTER_NODE_TYPE(PerStoreFeatureNode);
// Expose the constructor to the FFI under the meta_schedule namespace
TVM_REGISTER_GLOBAL("meta_schedule.FeatureExtractorPerStoreFeature")
    .set_body_typed(FeatureExtractor::PerStoreFeature);
1438 | |
1439 | } // namespace meta_schedule |
1440 | } // namespace tvm |
1441 | |