#include "triton/codegen/target.h"

#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/Value.h"

#include <array>
#include <iostream>
#include <stdexcept>
10 | |
11 | using namespace llvm; |
12 | |
13 | namespace triton{ |
14 | namespace codegen{ |
15 | |
16 | // base |
17 | |
18 | |
19 | nvidia_cu_target* target::as_nvidia() { |
20 | return dynamic_cast<nvidia_cu_target*>(this); |
21 | } |
22 | |
23 | bool target::is_gpu() const { |
24 | return is_gpu_; |
25 | } |
26 | |
27 | // AMD |
28 | void amd_cl_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) { |
29 | fn->setCallingConv(CallingConv::AMDGPU_KERNEL); |
30 | } |
31 | |
32 | Instruction* amd_cl_target::add_barrier(Module *module, IRBuilder<>& builder) { |
33 | Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_barrier); |
34 | return builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {}); |
35 | } |
36 | |
37 | Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { |
38 | Value* group_id = get_block_id(module, builder, ax); |
39 | Value* result = builder.CreateMul(builder.getInt32(stride), group_id); |
40 | return result; |
41 | } |
42 | |
43 | Instruction* amd_cl_target::add_memfence(Module *module, IRBuilder<>& builder) { |
44 | throw std::runtime_error("not implemented" ); |
45 | } |
46 | |
47 | |
48 | Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) { |
49 | static std::array<Intrinsic::ID, 3> ids = { |
50 | Intrinsic::amdgcn_workgroup_id_x, |
51 | Intrinsic::amdgcn_workgroup_id_y, |
52 | Intrinsic::amdgcn_workgroup_id_z |
53 | }; |
54 | Value* group_id = builder.CreateIntrinsic(ids[ax], {}, {}); |
55 | return group_id; |
56 | } |
57 | |
58 | Value* amd_cl_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) { |
59 | throw std::runtime_error("not implemented on AMD" ); |
60 | } |
61 | |
62 | Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { |
63 | static std::array<Intrinsic::ID, 3> ids = { |
64 | Intrinsic::amdgcn_workitem_id_x, |
65 | Intrinsic::amdgcn_workitem_id_y, |
66 | Intrinsic::amdgcn_workitem_id_z |
67 | }; |
68 | Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); |
69 | return builder.CreateCall(get_local_id, {}); |
70 | } |
71 | |
72 | // NVIDIA |
73 | |
74 | void nvidia_cu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn){ |
75 | // set metadata |
76 | Metadata *md_args[] = { |
77 | ValueAsMetadata::get(fn), |
78 | MDString::get(ctx, "kernel" ), |
79 | ValueAsMetadata::get(builder.getInt32(1)) |
80 | }; |
81 | module->getOrInsertNamedMetadata("nvvm.annotations" )->addOperand(MDNode::get(ctx, md_args)); |
82 | } |
83 | |
84 | Instruction* nvidia_cu_target::add_barrier(Module *module, IRBuilder<>& builder) { |
85 | Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0); |
86 | return builder.CreateCall(barrier, {}); |
87 | } |
88 | |
89 | Instruction* nvidia_cu_target::add_memfence(Module *module, IRBuilder<>& builder) { |
90 | Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_membar_gl); |
91 | return builder.CreateCall(barrier, {}); |
92 | } |
93 | |
94 | |
95 | Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { |
96 | Value* group_id = get_block_id(module, builder, ax); |
97 | Value* result = builder.CreateMul(builder.getInt32(stride), group_id); |
98 | return result; |
99 | } |
100 | |
101 | Value* nvidia_cu_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) { |
102 | static std::array<Intrinsic::ID, 3> cta_ids = { |
103 | Intrinsic::nvvm_read_ptx_sreg_ctaid_x, |
104 | Intrinsic::nvvm_read_ptx_sreg_ctaid_y, |
105 | Intrinsic::nvvm_read_ptx_sreg_ctaid_z |
106 | }; |
107 | Value* cta_id = builder.CreateIntrinsic(cta_ids[ax], {}, {}); |
108 | return cta_id; |
109 | } |
110 | |
111 | Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { |
112 | static std::array<Intrinsic::ID, 3> ids = { |
113 | Intrinsic::nvvm_read_ptx_sreg_tid_x, |
114 | Intrinsic::nvvm_read_ptx_sreg_tid_y, |
115 | Intrinsic::nvvm_read_ptx_sreg_tid_z |
116 | }; |
117 | Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]); |
118 | return builder.CreateCall(get_local_id, {}); |
119 | } |
120 | |
121 | Value* nvidia_cu_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) { |
122 | static std::array<Intrinsic::ID, 3> ids = { |
123 | Intrinsic::nvvm_read_ptx_sreg_nctaid_x, |
124 | Intrinsic::nvvm_read_ptx_sreg_nctaid_y, |
125 | Intrinsic::nvvm_read_ptx_sreg_nctaid_z |
126 | }; |
127 | return builder.CreateIntrinsic(ids[ax], {}, {}); |
128 | } |
129 | |
130 | // CPU |
131 | |
132 | void cpu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) { |
133 | // normal cpu functions can be kernels |
134 | } |
135 | |
136 | Instruction* cpu_target::add_barrier(Module *module, IRBuilder<>& builder) { |
137 | // no barrier on CPU |
138 | return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0)); |
139 | } |
140 | |
141 | Instruction* cpu_target::add_memfence(Module *module, IRBuilder<>& builder) { |
142 | // no barrier on CPU |
143 | return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0)); |
144 | } |
145 | |
146 | |
147 | Value* cpu_target::get_block_id(Module *module, llvm::IRBuilder<> &builder, unsigned ax) { |
148 | const Function *fn = builder.GetInsertBlock()->getParent(); |
149 | size_t num_params = fn->getFunctionType()->getNumParams(); |
150 | static std::array<const Argument*, 3> ids = { |
151 | fn->arg_begin() + num_params - 3, |
152 | fn->arg_begin() + num_params - 2, |
153 | fn->arg_begin() + num_params - 1 |
154 | }; |
155 | return (Argument*)ids[ax]; |
156 | } |
157 | |
158 | Value* cpu_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) { |
159 | throw std::runtime_error("not implemented" ); |
160 | } |
161 | |
162 | |
163 | Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) { |
164 | Value* result = builder.CreateMul(builder.getInt32(stride), get_block_id(module, builder, ax)); |
165 | return result; |
166 | } |
167 | |
168 | Value* cpu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) { |
169 | return builder.getInt32(0); |
170 | } |
171 | |
172 | } |
173 | } |
174 | |