1 | /* |
2 | * Licensed to the Apache Software Foundation (ASF) under one |
3 | * or more contributor license agreements. See the NOTICE file |
4 | * distributed with this work for additional information |
5 | * regarding copyright ownership. The ASF licenses this file |
6 | * to you under the Apache License, Version 2.0 (the |
7 | * "License"); you may not use this file except in compliance |
8 | * with the License. You may obtain a copy of the License at |
9 | * |
10 | * http://www.apache.org/licenses/LICENSE-2.0 |
11 | * |
12 | * Unless required by applicable law or agreed to in writing, |
13 | * software distributed under the License is distributed on an |
14 | * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
15 | * KIND, either express or implied. See the License for the |
16 | * specific language governing permissions and limitations |
17 | * under the License. |
18 | */ |
19 | |
20 | /*! |
21 | * \file tvm/tir/builtin.h |
22 | * \brief TIR builtin intrinsics. |
23 | * |
24 | * TIR builtin intrinsics are stored as tvm:Op. |
25 | * They are processed in the same way as we process Ops. |
26 | * |
27 | * It is not necessary to create a function for every Op, |
28 | * as we can obtain them through Op::Get. |
29 | * |
30 | * This file contains the most commonly used intrinsics or |
31 | * those that have special semantics and need compiler support. |
32 | */ |
33 | #ifndef TVM_TIR_BUILTIN_H_ |
34 | #define TVM_TIR_BUILTIN_H_ |
35 | |
36 | #include <tvm/ir/op.h> |
37 | #include <tvm/tir/expr.h> |
38 | |
39 | namespace tvm { |
40 | namespace tir { |
41 | |
42 | /*! \brief Collection of builtin intrinsics as ops */ |
namespace builtin {
/*!
 * \brief Return value.
 */
TVM_DLL const Op& ret();
/*!
 * \brief Reinterpret the value using the target type.
 */
TVM_DLL const Op& reinterpret();

/*!
 * \brief Marks a condition is likely going to happen.
 */
TVM_DLL const Op& likely();

/*!
 * \brief Bitwise and operator.
 */
TVM_DLL const Op& bitwise_and();

/*!
 * \brief Bitwise or operator.
 */
TVM_DLL const Op& bitwise_or();

/*!
 * \brief Bitwise xor operator.
 */
TVM_DLL const Op& bitwise_xor();

/*!
 * \brief Bitwise not operator.
 */
TVM_DLL const Op& bitwise_not();

/*!
 * \brief Left shift
 */
TVM_DLL const Op& shift_left();

/*!
 * \brief Right shift
 */
TVM_DLL const Op& shift_right();

/*!
 * \brief See pseudo code
 *
 * Construct a big uint that may not be representable by int64
 *
 * Expr large_uint_imm(uint32_t v0, uint32_t v1) {
 *   return (v1 << 32) | v0;
 * }
 */
TVM_DLL const Op& large_uint_imm();

/*!
 * \brief Execute a multiplication between two Q-numbers x and y
 * followed by a right shift s
 * The default rounding rule is to the nearest value, rounding half up
 * (i.e., round(x.1) = x and round (x.5) = x+1)
 */
TVM_DLL const Op& q_multiply_shift();

/*!
 * \brief Returns the address of an element in the buffer (see pseudocode below).
 *
 * The number of indices should match the dimensionality of the buffer
 * being accessed.  If this operation occurs after buffer flattening,
 * the number of indices must be supported by the target (i.e. N>1
 * only on targets that support non-flat memory buffers).
 *
 * Handle address_of(BufferLoad *op) {
 *   return &op->buffer_var[op->indices[0], op->indices[1], ..., op->indices[N-1]];
 * }
 */
TVM_DLL const Op& address_of();

/*!
 * \brief Same as select, used for unsafe memory access.
 *
 * Type tvm_if_then_else(cond, a, b) {
 *   return cond ? a : b;
 * }
 */
TVM_DLL const Op& if_then_else();

/*!
 * \brief See pseudo code
 *
 * bool isnullptr(void* handle) {
 *   return handle == nullptr
 * }
 */
TVM_DLL const Op& isnullptr();

/*!
 * \brief Check if value is nan
 */
TVM_DLL const Op& isnan();

/*!
 * \brief Popcount
 */
TVM_DLL const Op& popcount();

/*!
 * \brief Fused multiply add
 *
 * Type fma(a, b, c) {
 *   return a * b + c;
 * }
 */
TVM_DLL const Op& fma();

/*!
 * \brief Call an extern C function with given name
 *        and signature from the types of args in the runtime environment.
 *
 * Type call_extern(name, args...) {
 *    return dlsym(name)(args...);
 * }
 *
 * \note This intrinsic does not provide any type checking,
 *       and is mainly used for backward compatibility reasons.
 *       Always consider using pre-registered and typed tvm::Op first.
 */
TVM_DLL const Op& call_extern();

/*!
 * \brief Call a pure extern C function with given name
 *        and signature from the types of args in the runtime environment.
 *
 * Type call_pure_extern(name, args...) {
 *    return dlsym(name)(args...);
 * }
 *
 * \note This intrinsic does not provide any type checking,
 *       and is mainly used for backward compatibility reasons.
 *       Always consider using pre-registered and typed tvm::Op first.
 */
TVM_DLL const Op& call_pure_extern();

/*!
 * \brief Call an LLVM intrinsic with a given intrinsic id
 *        and signature from the types of args in the runtime environment.
 *
 * Type call_llvm_intrin(intrin_id, args...) {
 *    return dlsym(name)(args...);
 * }
 *
 * \note This op does not provide any type checking.
 */
TVM_DLL const Op& call_llvm_intrin();

/*!
 * \brief Call an LLVM pure intrinsic with a given intrinsic id
 *        and signature from the types of args in the runtime environment.
 *
 * Type call_llvm_pure_intrin(intrin_id, args...) {
 *    return dlsym(name)(args...);
 * }
 *
 * \note This op does not provide any type checking.
 */
TVM_DLL const Op& call_llvm_pure_intrin();

/*!
 * \brief Call an SPIRV pure GLSL450 intrinsic.
 *
 * Type call_spirv_pure_glsl450(intrin_id, args...) {
 *    return dlsym(name)(args...);
 * }
 *
 * \note This op does not provide any type checking.
 */
TVM_DLL const Op& call_spirv_pure_glsl450();

// TODO(tvm-team) revisit the builtins below
// some of them can simply become ops with special codegen attr.
/*!
 * \brief Prefetch a cacheline
 */
TVM_DLL const Op& prefetch();

/*!
 * \brief Get head access address with memory access pattern info.
 *
 *  This operator also marks range of the memory access
 *  The offset and extent are in unit of the DType(including vectorization factor).
 *  rw_mask is a bit_mask setting whether the access is a read(1) or write(2).
 *  The access is assumed to happen in the current expression.
 *
 *  PtrType tvm_access_ptr(Expr dtype, DType* data,
 *                         int offset, int extent,
 *                         int rw_mask) {
 *    // DType == dtype.type();
 *    return &data[offset];
 *  }
 */
TVM_DLL const Op& tvm_access_ptr();

/*!
 * \brief Create a function local static handle that initializes to nullptr.
 *  It can be used to cache function local static resources.
 */
TVM_DLL const Op& tvm_static_handle();

/*!
 * \brief Return a unique context id, used for hint of workspace separation.
 *  Different context id guarantees not having overlapping workspace.
 */
TVM_DLL const Op& tvm_context_id();

/*!
 * \brief tvm_tuple is not an actual function and cannot codegen.
 *  It is used to represent tuple structure in value field of AttrStmt,
 *  for the sake of giving hint to optimization.
 *
 *  Handle tvm_tuple(value0, value1, ..., value_n);
 */
TVM_DLL const Op& tvm_tuple();

/*!
 * \brief See pseudo code
 *
 *  Type tvm_struct_get(StructType* arr, int index, int field_id) {
 *     return arr[index]->field;
 *  }
 * \sa TVMStructFieldKind
 */
TVM_DLL const Op& tvm_struct_get();

/*!
 * \brief See pseudo code
 *
 *  Handle tvm_struct_set(StructType* arr, int index, int field_id, value) {
 *     arr[index]->field = value;
 *  }
 * \sa TVMStructFieldKind
 */
TVM_DLL const Op& tvm_struct_set();

/*!
 * \brief See pseudo code
 * Type lookup_param(String param_name) {
 *     return __tvm_param__param_name;
 * }
 */
TVM_DLL const Op& lookup_param();

/*!
 * \brief See pseudo code
 *
 *  void tvm_throw_last_error() {
 *    throw TVMGetLastError();
 *  }
 */
TVM_DLL const Op& tvm_throw_last_error();

/*!
 * \brief See pseudo code
 *
 *  dtype in {shape, array, arg_value, arg_tcode}
 *
 *  Handle tvm_stack_alloca(string dtype, int num) {
 *     return new on stack dtype[num];
 *  }
 */
TVM_DLL const Op& tvm_stack_alloca();

/*!
 * \brief Allocate a shape tuple on stack, return the handle.
 *
 *  Handle tvm_stack_make_shape(list args) {
 *     ret = alloca stack int64_t[len(args)];
 *     for i in range(len(args)):
 *        ret[i] = args[i]
 *     return &ret[0];
 *  }
 */
TVM_DLL const Op& tvm_stack_make_shape();

/*!
 * \brief Allocate a NDArray(DLTensor) on stack, return the handle.
 *
 *  Type tvm_stack_make_array(Expr data,
 *                            Expr shape,
 *                            Expr strides,
 *                            Expr ndim,
 *                            Expr dtype,
 *                            Expr elem_offset) {
 *     ret = alloca stack DLTensor();
 *     ret->data = data;
 *     ret->shape = shape;
 *     ret->strides = strides != 0 ? strides : nullptr;
 *     ret->ndim = ndim;
 *     ret->dtype = dtype.type();
 *     ret->byte_offset = elem_offset * sizeof(dtype);
 *     return ret;
 *  }
 */
TVM_DLL const Op& tvm_stack_make_array();

/*!
 * \brief See pseudo code
 *
 *  return_type tvm_call_packed(name, TVMValue* args) {
 *     TVMValue ret_value;
 *     int ret_code;
 *     ModuleNode* env = GetCurrentEnv();
 *     const PackedFunc* f = env->GetFuncFromEnv(name);
 *     (*f)(args, type_code_of(args), len(args), &ret_value, &ret_code);
 *     // return type can be int, float, handle.
 *     return cast(return_type, ret_value.v_return_type);
 *  }
 */
TVM_DLL const Op& tvm_call_packed();

/*!
 * \brief See pseudo code
 *
 *  return_type tvm_call_cpacked(fname, TVMValue* args) {
 *     int ret_code;
 *     TVMValue ret_value;
 *     (*fname)(args, type_code_of(args), len(args), &ret_value, &ret_code);
 *     return cast(return_type, ret_value.v_return_type);
 *  }
 */
TVM_DLL const Op& tvm_call_cpacked();

/*!
 * \brief See pseudo code
 *
 *  return_type tvm_call_trace_packed(name, TVMValue* args) {
 *     ModuleNode* env = GetCurrentEnv();
 *     const PackedFunc* f = env->GetFuncFromEnv(name);
 *     (*f)(args, type_code_of(args), len(args));
 *     // return type can be int, float, handle.
 *     return cast(return_type, ret_value.v_return_type);
 *  }
 */
TVM_DLL const Op& tvm_call_trace_packed();

/*!
 * \brief Checks the return value of another call is correct or returns a given value.
 *
 * \note This is meant to serve a specific case for AOT code generator whilst this
 *       cannot be fully represented in TIR.
 *
 * Type tvm_check_return(expected, return_unexpected, nested_call) {
 *     if (nested_call() != expected) {
 *         return return_unexpected;
 *     }
 * }
 */
TVM_DLL const Op& tvm_check_return();

/*!
 * \brief See pseudo code
 *  Mark the content as thread local context, can get optimized
 *  by only call the call once at thread start.
 *
 *  Do not allow nesting(getting a thread context from another).
 *
 *  Handle tvm_thread_context(Expr call) {
 *     return call;
 *  }
 */
TVM_DLL const Op& tvm_thread_context();

/*!
 * \brief Lowered version of call packed, the space of value and
 *  type codes are explicitly allocated.
 *
 *  return_type tvm_call_packed_lowered(name,
 *                                      TVMValue* value_stack,
 *                                      int* tcode_stack,
 *                                      int begin,
 *                                      int end) {
 *     ModuleNode* env = GetCurrentEnv();
 *     const PackedFunc* f = env->GetFuncFromEnv(name);
 *     f->CallPacked(TVMArgs(value_stack[begin:end],
 *                           tcode_stack[begin:end]),
 *                   TVMRetValue(value_stack + end, tcode_stack + end));
 *     // return type can be int, float, handle.
 *     return cast(return_type, load_return_from(tcode_stack + end))
 *  }
 */
TVM_DLL const Op& tvm_call_packed_lowered();

/*!
 * \brief Lowered version of call c-packed, the space of value and
 *  type codes are explicitly allocated.
 *
 *  int tvm_call_cpacked_lowered(fname,
 *                               TVMValue* value_stack,
 *                               int* tcode_stack,
 *                               int begin,
 *                               int end) {
 *     fname(TVMArgs(value_stack[begin:end], tcode_stack[begin:end]),
 *           TVMRetValue(value_stack + end, tcode_stack + end));
 *  }
 */
TVM_DLL const Op& tvm_call_cpacked_lowered();

/*!
 * \brief Lowered version of trace intrinsic, the space of value and
 *  type codes are explicitly allocated. The return value is the
 *  (end - 1) value on the stack.
 *
 *  return_type tvm_call_trace_packed_lowered(name,
 *                                            TVMValue* value_stack,
 *                                            int* tcode_stack,
 *                                            int begin,
 *                                            int end) {
 *     ModuleNode* env = GetCurrentEnv();
 *     const PackedFunc* f = env->GetFuncFromEnv(name);
 *     f->CallPacked(TVMArgs(value_stack[begin:end],
 *                           tcode_stack[begin:end]),
 *                   TVMRetValue(value_stack + end, tcode_stack + end));
 *     // return type can be int, float, handle.
 *     return cast(return_type, load_return_from(tcode_stack + end))
 *  }
 */
TVM_DLL const Op& tvm_call_trace_packed_lowered();

/*!
 * \brief See pseudo code
 *
 *  int tvm_storage_sync(std::string storage_scope) {
 *     __sync(storage_scope);
 *     return 0;
 *  }
 */
TVM_DLL const Op& tvm_storage_sync();

/*!
 * \brief See pseudo code
 *
 *  Type tvm_warp_shuffle(mask, Type value, warp_id, width, warp_size) {
 *    return (value passed in by warp indicated by this_warp_id);
 *  }
 *
 *  Type tvm_warp_shuffle_up(mask, Type value, offset, width, warp_size) {
 *    return (value passed in by warp indicated by this_warp_id - offset);
 *  }
 *
 *  Type tvm_warp_shuffle_down(mask, Type value, offset, width, warp_size) {
 *    return (value passed in by warp indicated by this_warp_id + offset);
 *  }
 *
 *  unsigned tvm_warp_activemask() {
 *    return (32-bit mask of currently active threads in the calling warp);
 *  }
 *
 *  Parameter warp_id indicates the source thread ID in a warp.
 *
 *  Parameter offset indicates the relative distance to this_warp_id.
 *
 *  Parameter width indicates the number of threads involved in one
 *  shuffle. See CUDA document for __shfl_sync, __shfl_up_sync,
 *  __shfl_down_sync and __activemask.
 *
 *  Parameter warp_size is the size of a warp, which helps a backend
 *  to determine whether the width parameter is legal.
 *
 */
TVM_DLL const Op& tvm_warp_shuffle();
TVM_DLL const Op& tvm_warp_shuffle_up();
TVM_DLL const Op& tvm_warp_shuffle_down();
TVM_DLL const Op& tvm_warp_activemask();

/*!
 * \brief Initialize the global barrier.
 *  Call this at beginning of kernels that need global barrier.
 */
TVM_DLL const Op& tvm_global_barrier_kinit();

/*!
 * \brief See pseudo code
 *
 *  void tvm_thread_allreduce(UIntImm size, Expr source0, ..., Expr cond,
 *                            Var reduce_temp0, .., Var thread_idx1, ...) {
 *     // constraint by the other thread_idx remain the same.
 *     // reduce_temp is used to save intermediate result.
 *     reduce_temp0, ... = reduce(combiner, source0, ..., cond
 *       over [thread_idx1, thread_idx2] passed by any caller)
 *  }
 */
TVM_DLL const Op& tvm_thread_allreduce();

// TODO(tvm-team) TensorCore specific intrinsics should be directly registered under
//                cuda. namespace and used through op.
/*!
 * \brief tvm intrinsic for tensor core load operators.
 *
 *  void tvm_load_matrix_sync(Var fragment, UIntImm m, UIntImm n, UIntImm k,
 *                            Expr index, Expr buffer_ptr, Expr stride,
 *                            StringImm layout) {
 *    // m, n, k are the shape of wmma fragment.
 *    // Determine fragment layout(column-major or row major) by layout.
 *    // fragments must be in 'wmma.matrix_a' or 'wmma.matrix_b' scope.
 *    nvcuda::wmma::load_matrix_sync(fragment[index], buffer_ptr, stride);
 *  }
 */
TVM_DLL const Op& tvm_load_matrix_sync();

/*!
 * \brief tvm intrinsic for tensor core mma_sync operators.
 *
 *  void tvm_mma_sync(Var fragment_d, Expr index_d,
 *                    Var fragment_a, Expr index_a,
 *                    Var fragment_b, Expr index_b,
 *                    Var fragment_c, Expr index_c) {
 *    nvcuda::wmma::mma_sync(fragment_d[index_d], fragment_a[index_a],
 *                           fragment_b[index_b], fragment_c[index_c]);
 *  }
 */
TVM_DLL const Op& tvm_mma_sync();

/*!
 * \brief tvm intrinsic for tensor core bmma_sync operators.
 *
 *  void tvm_bmma_sync(Var fragment_d, Expr index_d,
 *                     Var fragment_a, Expr index_a,
 *                     Var fragment_b, Expr index_b,
 *                     Var fragment_c, Expr index_c) {
 *    nvcuda::wmma::bmma_sync(fragment_d[index_d], fragment_a[index_a],
 *                            fragment_b[index_b], fragment_c[index_c]);
 *  }
 */
TVM_DLL const Op& tvm_bmma_sync();

/*!
 * \brief tvm intrinsic for tensor core fill_fragment operators.
 *
 *  void tvm_fill_fragment(Var fragment, UIntImm m, UIntImm n, UIntImm k,
 *                         Expr index, Expr value) {
 *    // m, n, k are the shape of wmma fragment
 *    // fragments must be in 'wmma.accumulator' scope.
 *    nvcuda::wmma::fill_fragment(fragment[index], value);
 *  }
 */
TVM_DLL const Op& tvm_fill_fragment();

/*!
 * \brief tvm intrinsic for tensor core store operators.
 *
 *  void tvm_store_matrix_sync(Var fragment, UIntImm m, UIntImm n, UIntImm k,
 *                             Expr index, Expr buffer_ptr, Expr stride,
 *                             StringImm layout) {
 *    // m, n, k are the shape of wmma fragment
 *    // fragments must be in 'wmma.accumulator' scope.
 *    nvcuda::wmma::store_matrix_sync(fragment[index], buffer_ptr, stride, layout);
 *  }
 */
TVM_DLL const Op& tvm_store_matrix_sync();

/*!
 * \brief tvm intrinsic for ptx tensor core mma instructions.
 *
 *  void ptx_mma(StringImm shape, StringImm A_layout, StringImm B_layout,
 *               StringImm A_dtype, StringImm B_dtype, StringImm C_dtype,
 *               Var multiplicand_a, Expr a_index,
 *               Var multiplicand_b, Expr b_index,
 *               Var accumulator, Expr c_index, bool saturate);
 */
TVM_DLL const Op& ptx_mma();

/*!
 * \brief tvm intrinsic for sparse tensor core ptx instructions.
 *
 *  void ptx_mma_sp(StringImm shape, StringImm A_layout, StringImm B_layout,
 *                  StringImm A_dtype, StringImm B_dtype, StringImm C_dtype,
 *                  Var multiplicand_a, Expr a_index,
 *                  Var multiplicand_b, Expr b_index,
 *                  Var accumulator, Expr c_index,
 *                  Var metadata, Expr meta_index,
 *                  Var sparse_selector, bool saturate);
 */
TVM_DLL const Op& ptx_mma_sp();

/*!
 * \brief tvm intrinsic for ptx load matrix from shared memory.
 *
 *  void ptx_ldmatrix(Bool trans, IntImm num, StringImm type,
 *                    Var local_ptr, Expr local_offset,
 *                    Var smem_ptr, Expr smem_offset);
 */
TVM_DLL const Op& ptx_ldmatrix();

/*!
 * \brief tvm intrinsics for ptx async copy from global to shared memory
 *
 *  void ptx_cp_async(Var shared_ptr, Expr shared_offset, Var global_ptr, Expr global_offset, size_t
 * bytes);
 *
 */
TVM_DLL const Op& ptx_cp_async();

/*!
 * \brief tvm intrinsics for ptx async copy commit and wait.
 *
 *  void ptx_commit_group();
 *  void ptx_wait_group(int num);
 *
 */
TVM_DLL const Op& ptx_commit_group();
TVM_DLL const Op& ptx_wait_group();

/*!
 * \brief tvm intrinsic for storing the result of PTX MMA into a destination pointer.
 *        For example, if each thread in a warp of size 32 has 4 elements from the result of
 *        m16xn8xk16 MMA in its registers, this intrinsic can be used to store the result in a
 *        16x8 region in shared or global memory.
 *
 *        There is no real PTX instruction that does that, but we want to hide details of
 *        complex index manipulation behind this intrinsic to simplify TIR lowering passes (e.g.
 *        LowerWarpMemory).
 *
 * void mma_store(IntImm m, IntImm n, Var dst_ptr, Var src_ptr, Expr src_offset, Var dst_stride);
 */
TVM_DLL const Op& mma_store();

/*!
 * \brief tvm intrinsic for zero-initializing an MMA accumulation register.
 *        For example, if each thread in a warp of size 32 has 8 elements from the A matrix in
 *        m16xn8xk16 MMA in its registers, this intrinsic can be used to zero-initialize its
 *        4 accumulation registers.
 *
 *        There is no real PTX instruction that does that, but we introduce this intrinsic for the
 *        same reason as mma_store above.
 *
 * void mma_fill(IntImm local_size, Var local_ptr, Expr offset);
 */
TVM_DLL const Op& mma_fill();

// TODO(tvm-team) replace the usage of the vector operations by Shuffle.
/*!
 * \brief Get the high level half of the vector
 */
TVM_DLL const Op& vectorhigh();

/*!
 * \brief Get the low-level half of the vector
 */
TVM_DLL const Op& vectorlow();

/*!
 * \brief Concat two vectors.
 */
TVM_DLL const Op& vectorcombine();

/*!
 * \brief atomic add instruction, corresponding e.g. to atomicAdd in CUDA
 */
TVM_DLL const Op& atomic_add();
/*!
 * \brief Create an Nd memory allocation with storage scope
 */
TVM_DLL const Op& nd_mem_alloc_with_scope();

/*!
 * \brief Store to texture 2d memory
 */
TVM_DLL const Op& texture2d_store();

/*!
 * \brief Load from texture 2d memory
 */
TVM_DLL const Op& texture2d_load();

/*!
 * \brief Initiate a non-blocking DMA copy from source to destination
 */
TVM_DLL const Op& dma_copy();

/*!
 * \brief Wait until the number of DMAs in flight is less than or equal to some maximum
 */
TVM_DLL const Op& dma_wait();

/*!
 * \brief Provide a true statement that can be used for simplifications
 *
 * Compile-time representation of known constraints about function
 * inputs.  This assumption is removed when lowering, and does not
 * occur in codegen.
 */
TVM_DLL const Op& assume();

/*!
 * \brief Returns an initialized but arbitrary value
 *
 * Compile-time representation of memory locations whose values may be
 * altered as a result of optimizations.
 */
TVM_DLL const Op& undef();

/*!
 * \brief Profiling intrinsic
 */
TVM_DLL const Op& start_profile_intrinsic();

/*!
 * \brief Profiling intrinsic
 */
TVM_DLL const Op& end_profile_intrinsic();

/*! \brief The kind of structure field info used in intrinsic */
enum TVMStructFieldKind : int {
  // array head address
  kArrAddr,
  kArrData,
  kArrShape,
  kArrStrides,
  kArrNDim,
  kArrTypeCode,
  kArrTypeBits,
  kArrTypeLanes,
  kArrByteOffset,
  kArrDeviceId,
  kArrDeviceType,
  // sentinel: upper bound of the array field kinds above
  kArrKindBound_,
  // TVMValue field
  kTVMValueContent,
  // sentinel: upper bound of the TVMValue field kinds
  kTVMValueKindBound_
};
}  // namespace builtin
773 | } // namespace tir |
774 | } // namespace tvm |
775 | #endif // TVM_TIR_BUILTIN_H_ |
776 | |