// A wrapper around the LLVM JIT compiler for CPU architectures
2 | |
3 | #include <memory> |
4 | |
5 | #ifdef TI_WITH_LLVM |
6 | #include "llvm/Analysis/TargetTransformInfo.h" |
7 | #include "llvm/ADT/StringRef.h" |
8 | #include "llvm/ExecutionEngine/ExecutionEngine.h" |
9 | #include "llvm/ExecutionEngine/JITSymbol.h" |
10 | #include "llvm/ExecutionEngine/Orc/Core.h" |
11 | #include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h" |
12 | #include "llvm/ExecutionEngine/Orc/CompileUtils.h" |
13 | #include "llvm/ExecutionEngine/Orc/ExecutionUtils.h" |
14 | #include "llvm/ExecutionEngine/Orc/IRCompileLayer.h" |
15 | #include "llvm/ExecutionEngine/Orc/IRTransformLayer.h" |
16 | // From https://github.com/JuliaLang/julia/pull/43664 |
17 | #if defined(__APPLE__) && defined(__aarch64__) |
18 | #include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h" |
19 | #else |
20 | #include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h" |
21 | #endif |
22 | #include "llvm/ExecutionEngine/RTDyldMemoryManager.h" |
23 | #include "llvm/ExecutionEngine/RuntimeDyld.h" |
24 | #include "llvm/ExecutionEngine/SectionMemoryManager.h" |
25 | #include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h" |
26 | #include "llvm/IR/Module.h" |
27 | #include "llvm/IR/DataLayout.h" |
28 | #include "llvm/IR/Verifier.h" |
29 | #include "llvm/IR/LLVMContext.h" |
30 | #include "llvm/IR/LegacyPassManager.h" |
31 | #include "llvm/Support/DynamicLibrary.h" |
32 | #include "llvm/Support/raw_ostream.h" |
33 | #include "llvm/Support/Error.h" |
34 | #include "llvm/Target/TargetMachine.h" |
35 | #include "llvm/Transforms/IPO/PassManagerBuilder.h" |
36 | #include "llvm/Transforms/InstCombine/InstCombine.h" |
37 | #include "llvm/Transforms/Scalar.h" |
38 | #include "llvm/Transforms/Scalar/GVN.h" |
39 | #include "llvm/Transforms/IPO.h" |
40 | |
41 | #include "llvm/MC/TargetRegistry.h" |
42 | #include "llvm/Support/Host.h" |
43 | |
44 | #endif |
45 | |
46 | #include "taichi/jit/jit_module.h" |
47 | #include "taichi/util/lang_util.h" |
48 | #include "taichi/program/program.h" |
49 | #include "taichi/jit/jit_session.h" |
50 | #include "taichi/util/file_sequence_writer.h" |
51 | #include "taichi/runtime/llvm/llvm_context.h" |
52 | |
53 | namespace taichi::lang { |
54 | |
55 | #ifdef TI_WITH_LLVM |
56 | using namespace llvm; |
57 | using namespace llvm::orc; |
58 | #if defined(__APPLE__) && defined(__aarch64__) |
59 | typedef orc::ObjectLinkingLayer ObjLayerT; |
60 | #else |
61 | typedef orc::RTDyldObjectLinkingLayer ObjLayerT; |
62 | #endif |
63 | #endif |
64 | |
65 | std::pair<JITTargetMachineBuilder, llvm::DataLayout> get_host_target_info() { |
66 | auto expected_jtmb = JITTargetMachineBuilder::detectHost(); |
67 | if (!expected_jtmb) |
68 | TI_ERROR("LLVM TargetMachineBuilder has failed." ); |
69 | auto jtmb = *expected_jtmb; |
70 | auto expected_data_layout = jtmb.getDefaultDataLayoutForTarget(); |
71 | if (!expected_data_layout) { |
72 | TI_ERROR("LLVM TargetMachineBuilder has failed when getting data layout." ); |
73 | } |
74 | auto data_layout = *expected_data_layout; |
75 | return std::make_pair(jtmb, data_layout); |
76 | } |
77 | |
78 | class JITSessionCPU; |
79 | |
80 | class JITModuleCPU : public JITModule { |
81 | private: |
82 | JITSessionCPU *session_; |
83 | JITDylib *dylib_; |
84 | |
85 | public: |
86 | JITModuleCPU(JITSessionCPU *session, JITDylib *dylib) |
87 | : session_(session), dylib_(dylib) { |
88 | } |
89 | |
90 | void *lookup_function(const std::string &name) override; |
91 | |
92 | bool direct_dispatch() const override { |
93 | return true; |
94 | } |
95 | }; |
96 | |
97 | class JITSessionCPU : public JITSession { |
98 | private: |
99 | ExecutionSession es_; |
100 | ObjLayerT object_layer_; |
101 | IRCompileLayer compile_layer_; |
102 | DataLayout dl_; |
103 | MangleAndInterner mangle_; |
104 | std::mutex mut_; |
105 | std::vector<llvm::orc::JITDylib *> all_libs_; |
106 | int module_counter_; |
107 | SectionMemoryManager *memory_manager_; |
108 | |
109 | public: |
110 | JITSessionCPU(TaichiLLVMContext *tlctx, |
111 | std::unique_ptr<ExecutorProcessControl> EPC, |
112 | const CompileConfig &config, |
113 | JITTargetMachineBuilder JTMB, |
114 | DataLayout DL) |
115 | : JITSession(tlctx, config), |
116 | es_(std::move(EPC)), |
117 | #if defined(__APPLE__) && defined(__aarch64__) |
118 | object_layer_(es_), |
119 | #else |
120 | object_layer_(es_, |
121 | [&]() { |
122 | auto smgr = std::make_unique<SectionMemoryManager>(); |
123 | memory_manager_ = smgr.get(); |
124 | return smgr; |
125 | }), |
126 | #endif |
127 | compile_layer_(es_, |
128 | object_layer_, |
129 | std::make_unique<ConcurrentIRCompiler>(JTMB)), |
130 | dl_(DL), |
131 | mangle_(es_, this->dl_), |
132 | module_counter_(0), |
133 | memory_manager_(nullptr) { |
134 | if (JTMB.getTargetTriple().isOSBinFormatCOFF()) { |
135 | object_layer_.setOverrideObjectFlagsWithResponsibilityFlags(true); |
136 | object_layer_.setAutoClaimResponsibilityForObjectSymbols(true); |
137 | } |
138 | } |
139 | |
140 | ~JITSessionCPU() override { |
141 | std::lock_guard<std::mutex> _(mut_); |
142 | if (memory_manager_) |
143 | memory_manager_->deregisterEHFrames(); |
144 | if (auto Err = es_.endSession()) |
145 | es_.reportError(std::move(Err)); |
146 | } |
147 | |
148 | DataLayout get_data_layout() override { |
149 | return dl_; |
150 | } |
151 | |
152 | void global_optimize_module(llvm::Module *module) override { |
153 | global_optimize_module_cpu(module); |
154 | } |
155 | |
156 | JITModule *add_module(std::unique_ptr<llvm::Module> M, int max_reg) override { |
157 | TI_ASSERT(max_reg == 0); // No need to specify max_reg on CPUs |
158 | TI_ASSERT(M); |
159 | global_optimize_module_cpu(M.get()); |
160 | std::lock_guard<std::mutex> _(mut_); |
161 | auto dylib_expect = es_.createJITDylib(fmt::format("{}" , module_counter_)); |
162 | TI_ASSERT(dylib_expect); |
163 | auto &dylib = dylib_expect.get(); |
164 | dylib.addGenerator( |
165 | cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess( |
166 | dl_.getGlobalPrefix()))); |
167 | auto *thread_safe_context = |
168 | this->tlctx_->get_this_thread_thread_safe_context(); |
169 | cantFail(compile_layer_.add( |
170 | dylib, |
171 | llvm::orc::ThreadSafeModule(std::move(M), *thread_safe_context))); |
172 | all_libs_.push_back(&dylib); |
173 | auto new_module = std::make_unique<JITModuleCPU>(this, &dylib); |
174 | auto new_module_raw_ptr = new_module.get(); |
175 | modules.push_back(std::move(new_module)); |
176 | module_counter_++; |
177 | return new_module_raw_ptr; |
178 | } |
179 | |
180 | void *lookup(const std::string Name) override { |
181 | std::lock_guard<std::mutex> _(mut_); |
182 | #ifdef __APPLE__ |
183 | auto symbol = es_.lookup(all_libs_, mangle_(Name)); |
184 | #else |
185 | auto symbol = es_.lookup(all_libs_, es_.intern(Name)); |
186 | #endif |
187 | if (!symbol) |
188 | TI_ERROR("Function \"{}\" not found" , Name); |
189 | return (void *)(symbol->getAddress()); |
190 | } |
191 | |
192 | void *lookup_in_module(JITDylib *lib, const std::string Name) { |
193 | std::lock_guard<std::mutex> _(mut_); |
194 | #ifdef __APPLE__ |
195 | auto symbol = es_.lookup({lib}, mangle_(Name)); |
196 | #else |
197 | auto symbol = es_.lookup({lib}, es_.intern(Name)); |
198 | #endif |
199 | if (!symbol) |
200 | TI_ERROR("Function \"{}\" not found" , Name); |
201 | return (void *)(symbol->getAddress()); |
202 | } |
203 | |
204 | private: |
205 | void global_optimize_module_cpu(llvm::Module *module); |
206 | }; |
207 | |
208 | void *JITModuleCPU::lookup_function(const std::string &name) { |
209 | return session_->lookup_in_module(dylib_, name); |
210 | } |
211 | |
212 | void JITSessionCPU::global_optimize_module_cpu(llvm::Module *module) { |
213 | TI_AUTO_PROF |
214 | if (llvm::verifyModule(*module, &llvm::errs())) { |
215 | module->print(llvm::errs(), nullptr); |
216 | TI_ERROR("Module broken" ); |
217 | } |
218 | |
219 | auto triple = get_host_target_info().first.getTargetTriple(); |
220 | |
221 | std::string err_str; |
222 | const llvm::Target *target = |
223 | TargetRegistry::lookupTarget(triple.str(), err_str); |
224 | TI_ERROR_UNLESS(target, err_str); |
225 | |
226 | TargetOptions options; |
227 | if (this->config_.fast_math) { |
228 | options.AllowFPOpFusion = FPOpFusion::Fast; |
229 | options.UnsafeFPMath = 1; |
230 | options.NoInfsFPMath = 1; |
231 | options.NoNaNsFPMath = 1; |
232 | } else { |
233 | options.AllowFPOpFusion = FPOpFusion::Strict; |
234 | options.UnsafeFPMath = 0; |
235 | options.NoInfsFPMath = 0; |
236 | options.NoNaNsFPMath = 0; |
237 | } |
238 | options.HonorSignDependentRoundingFPMathOption = false; |
239 | options.NoZerosInBSS = false; |
240 | options.GuaranteedTailCallOpt = false; |
241 | |
242 | legacy::FunctionPassManager function_pass_manager(module); |
243 | legacy::PassManager module_pass_manager; |
244 | |
245 | llvm::StringRef mcpu = llvm::sys::getHostCPUName(); |
246 | std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine( |
247 | triple.str(), mcpu.str(), "" , options, llvm::Reloc::PIC_, |
248 | llvm::CodeModel::Small, CodeGenOpt::Aggressive)); |
249 | |
250 | TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!" ); |
251 | |
252 | module->setDataLayout(target_machine->createDataLayout()); |
253 | |
254 | module_pass_manager.add(createTargetTransformInfoWrapperPass( |
255 | target_machine->getTargetIRAnalysis())); |
256 | function_pass_manager.add(createTargetTransformInfoWrapperPass( |
257 | target_machine->getTargetIRAnalysis())); |
258 | |
259 | PassManagerBuilder b; |
260 | b.OptLevel = 3; |
261 | b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false); |
262 | b.LoopVectorize = true; |
263 | b.SLPVectorize = true; |
264 | |
265 | target_machine->adjustPassManager(b); |
266 | |
267 | b.populateFunctionPassManager(function_pass_manager); |
268 | b.populateModulePassManager(module_pass_manager); |
269 | |
270 | { |
271 | TI_PROFILER("llvm_function_pass" ); |
272 | function_pass_manager.doInitialization(); |
273 | for (llvm::Module::iterator i = module->begin(); i != module->end(); i++) |
274 | function_pass_manager.run(*i); |
275 | |
276 | function_pass_manager.doFinalization(); |
277 | } |
278 | |
279 | /* |
280 | Optimization for llvm::GetElementPointer: |
281 | https://github.com/taichi-dev/taichi/issues/5472 The three other passes |
282 | "loop-reduce", "ind-vars", "cse" serves as preprocessing for |
283 | "separate-const-offset-gep". |
284 | |
285 | Note there's an update for "separate-const-offset-gep" in llvm-12. |
286 | */ |
287 | module_pass_manager.add(llvm::createLoopStrengthReducePass()); |
288 | module_pass_manager.add(llvm::createIndVarSimplifyPass()); |
289 | module_pass_manager.add(llvm::createSeparateConstOffsetFromGEPPass(false)); |
290 | module_pass_manager.add(llvm::createEarlyCSEPass(true)); |
291 | |
292 | { |
293 | TI_PROFILER("llvm_module_pass" ); |
294 | module_pass_manager.run(*module); |
295 | } |
296 | |
297 | if (this->config_.print_kernel_llvm_ir_optimized) { |
298 | if (false) { |
299 | TI_INFO("Functions with > 100 instructions in optimized LLVM IR:" ); |
300 | TaichiLLVMContext::print_huge_functions(module); |
301 | } |
302 | static FileSequenceWriter writer( |
303 | "taichi_kernel_cpu_llvm_ir_optimized_{:04d}.ll" , |
304 | "optimized LLVM IR (CPU)" ); |
305 | writer.write(module); |
306 | } |
307 | } |
308 | |
309 | std::unique_ptr<JITSession> create_llvm_jit_session_cpu( |
310 | TaichiLLVMContext *tlctx, |
311 | const CompileConfig &config, |
312 | Arch arch) { |
313 | TI_ASSERT(arch_is_cpu(arch)); |
314 | auto target_info = get_host_target_info(); |
315 | auto EPC = SelfExecutorProcessControl::Create(); |
316 | TI_ASSERT(EPC); |
317 | return std::make_unique<JITSessionCPU>(tlctx, std::move(*EPC), config, |
318 | target_info.first, target_info.second); |
319 | } |
320 | |
321 | } // namespace taichi::lang |
322 | |