1#include "triton/codegen/target.h"
2#include "llvm/IR/IRBuilder.h"
3#include "llvm/IR/Function.h"
4#include "llvm/IR/Intrinsics.h"
5#include "llvm/IR/IntrinsicsNVPTX.h"
6#include "llvm/IR/IntrinsicsAMDGPU.h"
7#include "llvm/IR/Value.h"
8#include "llvm/IR/IRBuilder.h"
9#include <iostream>
10
11using namespace llvm;
12
13namespace triton{
14namespace codegen{
15
16// base
17
18
19nvidia_cu_target* target::as_nvidia() {
20 return dynamic_cast<nvidia_cu_target*>(this);
21}
22
23bool target::is_gpu() const {
24 return is_gpu_;
25}
26
27// AMD
28void amd_cl_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) {
29 fn->setCallingConv(CallingConv::AMDGPU_KERNEL);
30}
31
32Instruction* amd_cl_target::add_barrier(Module *module, IRBuilder<>& builder) {
33 Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::amdgcn_s_barrier);
34 return builder.CreateIntrinsic(Intrinsic::amdgcn_s_barrier, {}, {});
35}
36
37Value* amd_cl_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) {
38 Value* group_id = get_block_id(module, builder, ax);
39 Value* result = builder.CreateMul(builder.getInt32(stride), group_id);
40 return result;
41}
42
43Instruction* amd_cl_target::add_memfence(Module *module, IRBuilder<>& builder) {
44 throw std::runtime_error("not implemented");
45}
46
47
48Value* amd_cl_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) {
49 static std::array<Intrinsic::ID, 3> ids = {
50 Intrinsic::amdgcn_workgroup_id_x,
51 Intrinsic::amdgcn_workgroup_id_y,
52 Intrinsic::amdgcn_workgroup_id_z
53 };
54 Value* group_id = builder.CreateIntrinsic(ids[ax], {}, {});
55 return group_id;
56}
57
58Value* amd_cl_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) {
59 throw std::runtime_error("not implemented on AMD");
60}
61
62Value* amd_cl_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) {
63 static std::array<Intrinsic::ID, 3> ids = {
64 Intrinsic::amdgcn_workitem_id_x,
65 Intrinsic::amdgcn_workitem_id_y,
66 Intrinsic::amdgcn_workitem_id_z
67 };
68 Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]);
69 return builder.CreateCall(get_local_id, {});
70}
71
72// NVIDIA
73
74void nvidia_cu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn){
75 // set metadata
76 Metadata *md_args[] = {
77 ValueAsMetadata::get(fn),
78 MDString::get(ctx, "kernel"),
79 ValueAsMetadata::get(builder.getInt32(1))
80 };
81 module->getOrInsertNamedMetadata("nvvm.annotations")->addOperand(MDNode::get(ctx, md_args));
82}
83
84Instruction* nvidia_cu_target::add_barrier(Module *module, IRBuilder<>& builder) {
85 Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_barrier0);
86 return builder.CreateCall(barrier, {});
87}
88
89Instruction* nvidia_cu_target::add_memfence(Module *module, IRBuilder<>& builder) {
90 Function *barrier = Intrinsic::getDeclaration(module, Intrinsic::nvvm_membar_gl);
91 return builder.CreateCall(barrier, {});
92}
93
94
95Value* nvidia_cu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) {
96 Value* group_id = get_block_id(module, builder, ax);
97 Value* result = builder.CreateMul(builder.getInt32(stride), group_id);
98 return result;
99}
100
101Value* nvidia_cu_target::get_block_id(Module *module, IRBuilder<>& builder, unsigned ax) {
102 static std::array<Intrinsic::ID, 3> cta_ids = {
103 Intrinsic::nvvm_read_ptx_sreg_ctaid_x,
104 Intrinsic::nvvm_read_ptx_sreg_ctaid_y,
105 Intrinsic::nvvm_read_ptx_sreg_ctaid_z
106 };
107 Value* cta_id = builder.CreateIntrinsic(cta_ids[ax], {}, {});
108 return cta_id;
109}
110
111Value* nvidia_cu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) {
112 static std::array<Intrinsic::ID, 3> ids = {
113 Intrinsic::nvvm_read_ptx_sreg_tid_x,
114 Intrinsic::nvvm_read_ptx_sreg_tid_y,
115 Intrinsic::nvvm_read_ptx_sreg_tid_z
116 };
117 Function *get_local_id = Intrinsic::getDeclaration(module, ids[ax]);
118 return builder.CreateCall(get_local_id, {});
119}
120
121Value* nvidia_cu_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) {
122 static std::array<Intrinsic::ID, 3> ids = {
123 Intrinsic::nvvm_read_ptx_sreg_nctaid_x,
124 Intrinsic::nvvm_read_ptx_sreg_nctaid_y,
125 Intrinsic::nvvm_read_ptx_sreg_nctaid_z
126 };
127 return builder.CreateIntrinsic(ids[ax], {}, {});
128}
129
130// CPU
131
132void cpu_target::set_kernel(IRBuilder<>& builder, LLVMContext &ctx, Module *module, Function* fn) {
133 // normal cpu functions can be kernels
134}
135
136Instruction* cpu_target::add_barrier(Module *module, IRBuilder<>& builder) {
137 // no barrier on CPU
138 return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0));
139}
140
141Instruction* cpu_target::add_memfence(Module *module, IRBuilder<>& builder) {
142 // no barrier on CPU
143 return (Instruction*)builder.CreateAdd(builder.getInt32(0), builder.getInt32(0));
144}
145
146
147Value* cpu_target::get_block_id(Module *module, llvm::IRBuilder<> &builder, unsigned ax) {
148 const Function *fn = builder.GetInsertBlock()->getParent();
149 size_t num_params = fn->getFunctionType()->getNumParams();
150 static std::array<const Argument*, 3> ids = {
151 fn->arg_begin() + num_params - 3,
152 fn->arg_begin() + num_params - 2,
153 fn->arg_begin() + num_params - 1
154 };
155 return (Argument*)ids[ax];
156}
157
158Value* cpu_target::get_num_blocks(Module *module, IRBuilder<>& builder, unsigned ax) {
159 throw std::runtime_error("not implemented");
160}
161
162
163Value* cpu_target::get_global_offset(Module *module, IRBuilder<>& builder, unsigned stride, unsigned ax) {
164 Value* result = builder.CreateMul(builder.getInt32(stride), get_block_id(module, builder, ax));
165 return result;
166}
167
168Value* cpu_target::get_local_id(Module *module, IRBuilder<>& builder, unsigned ax) {
169 return builder.getInt32(0);
170}
171
172}
173}
174