// An LLVM JIT compiler wrapper for CPU architectures
2
3#include <memory>
4
5#ifdef TI_WITH_LLVM
6#include "llvm/Analysis/TargetTransformInfo.h"
7#include "llvm/ADT/StringRef.h"
8#include "llvm/ExecutionEngine/ExecutionEngine.h"
9#include "llvm/ExecutionEngine/JITSymbol.h"
10#include "llvm/ExecutionEngine/Orc/Core.h"
11#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
12#include "llvm/ExecutionEngine/Orc/CompileUtils.h"
13#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
14#include "llvm/ExecutionEngine/Orc/IRCompileLayer.h"
15#include "llvm/ExecutionEngine/Orc/IRTransformLayer.h"
16// From https://github.com/JuliaLang/julia/pull/43664
17#if defined(__APPLE__) && defined(__aarch64__)
18#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
19#else
20#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
21#endif
22#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
23#include "llvm/ExecutionEngine/RuntimeDyld.h"
24#include "llvm/ExecutionEngine/SectionMemoryManager.h"
25#include "llvm/ExecutionEngine/Orc/ThreadSafeModule.h"
26#include "llvm/IR/Module.h"
27#include "llvm/IR/DataLayout.h"
28#include "llvm/IR/Verifier.h"
29#include "llvm/IR/LLVMContext.h"
30#include "llvm/IR/LegacyPassManager.h"
31#include "llvm/Support/DynamicLibrary.h"
32#include "llvm/Support/raw_ostream.h"
33#include "llvm/Support/Error.h"
34#include "llvm/Target/TargetMachine.h"
35#include "llvm/Transforms/IPO/PassManagerBuilder.h"
36#include "llvm/Transforms/InstCombine/InstCombine.h"
37#include "llvm/Transforms/Scalar.h"
38#include "llvm/Transforms/Scalar/GVN.h"
39#include "llvm/Transforms/IPO.h"
40
41#include "llvm/MC/TargetRegistry.h"
42#include "llvm/Support/Host.h"
43
44#endif
45
46#include "taichi/jit/jit_module.h"
47#include "taichi/util/lang_util.h"
48#include "taichi/program/program.h"
49#include "taichi/jit/jit_session.h"
50#include "taichi/util/file_sequence_writer.h"
51#include "taichi/runtime/llvm/llvm_context.h"
52
53namespace taichi::lang {
54
#ifdef TI_WITH_LLVM
using namespace llvm;
using namespace llvm::orc;
// ObjLayerT names the ORC object-linking layer used by the CPU JIT session:
// ObjectLinkingLayer on Apple arm64 (see the Julia pull request referenced
// next to the includes), RTDyldObjectLinkingLayer on every other host.
#if defined(__APPLE__) && defined(__aarch64__)
using ObjLayerT = orc::ObjectLinkingLayer;
#else
using ObjLayerT = orc::RTDyldObjectLinkingLayer;
#endif
#endif
64
65std::pair<JITTargetMachineBuilder, llvm::DataLayout> get_host_target_info() {
66 auto expected_jtmb = JITTargetMachineBuilder::detectHost();
67 if (!expected_jtmb)
68 TI_ERROR("LLVM TargetMachineBuilder has failed.");
69 auto jtmb = *expected_jtmb;
70 auto expected_data_layout = jtmb.getDefaultDataLayoutForTarget();
71 if (!expected_data_layout) {
72 TI_ERROR("LLVM TargetMachineBuilder has failed when getting data layout.");
73 }
74 auto data_layout = *expected_data_layout;
75 return std::make_pair(jtmb, data_layout);
76}
77
78class JITSessionCPU;
79
80class JITModuleCPU : public JITModule {
81 private:
82 JITSessionCPU *session_;
83 JITDylib *dylib_;
84
85 public:
86 JITModuleCPU(JITSessionCPU *session, JITDylib *dylib)
87 : session_(session), dylib_(dylib) {
88 }
89
90 void *lookup_function(const std::string &name) override;
91
92 bool direct_dispatch() const override {
93 return true;
94 }
95};
96
97class JITSessionCPU : public JITSession {
98 private:
99 ExecutionSession es_;
100 ObjLayerT object_layer_;
101 IRCompileLayer compile_layer_;
102 DataLayout dl_;
103 MangleAndInterner mangle_;
104 std::mutex mut_;
105 std::vector<llvm::orc::JITDylib *> all_libs_;
106 int module_counter_;
107 SectionMemoryManager *memory_manager_;
108
109 public:
110 JITSessionCPU(TaichiLLVMContext *tlctx,
111 std::unique_ptr<ExecutorProcessControl> EPC,
112 const CompileConfig &config,
113 JITTargetMachineBuilder JTMB,
114 DataLayout DL)
115 : JITSession(tlctx, config),
116 es_(std::move(EPC)),
117#if defined(__APPLE__) && defined(__aarch64__)
118 object_layer_(es_),
119#else
120 object_layer_(es_,
121 [&]() {
122 auto smgr = std::make_unique<SectionMemoryManager>();
123 memory_manager_ = smgr.get();
124 return smgr;
125 }),
126#endif
127 compile_layer_(es_,
128 object_layer_,
129 std::make_unique<ConcurrentIRCompiler>(JTMB)),
130 dl_(DL),
131 mangle_(es_, this->dl_),
132 module_counter_(0),
133 memory_manager_(nullptr) {
134 if (JTMB.getTargetTriple().isOSBinFormatCOFF()) {
135 object_layer_.setOverrideObjectFlagsWithResponsibilityFlags(true);
136 object_layer_.setAutoClaimResponsibilityForObjectSymbols(true);
137 }
138 }
139
140 ~JITSessionCPU() override {
141 std::lock_guard<std::mutex> _(mut_);
142 if (memory_manager_)
143 memory_manager_->deregisterEHFrames();
144 if (auto Err = es_.endSession())
145 es_.reportError(std::move(Err));
146 }
147
148 DataLayout get_data_layout() override {
149 return dl_;
150 }
151
152 void global_optimize_module(llvm::Module *module) override {
153 global_optimize_module_cpu(module);
154 }
155
156 JITModule *add_module(std::unique_ptr<llvm::Module> M, int max_reg) override {
157 TI_ASSERT(max_reg == 0); // No need to specify max_reg on CPUs
158 TI_ASSERT(M);
159 global_optimize_module_cpu(M.get());
160 std::lock_guard<std::mutex> _(mut_);
161 auto dylib_expect = es_.createJITDylib(fmt::format("{}", module_counter_));
162 TI_ASSERT(dylib_expect);
163 auto &dylib = dylib_expect.get();
164 dylib.addGenerator(
165 cantFail(llvm::orc::DynamicLibrarySearchGenerator::GetForCurrentProcess(
166 dl_.getGlobalPrefix())));
167 auto *thread_safe_context =
168 this->tlctx_->get_this_thread_thread_safe_context();
169 cantFail(compile_layer_.add(
170 dylib,
171 llvm::orc::ThreadSafeModule(std::move(M), *thread_safe_context)));
172 all_libs_.push_back(&dylib);
173 auto new_module = std::make_unique<JITModuleCPU>(this, &dylib);
174 auto new_module_raw_ptr = new_module.get();
175 modules.push_back(std::move(new_module));
176 module_counter_++;
177 return new_module_raw_ptr;
178 }
179
180 void *lookup(const std::string Name) override {
181 std::lock_guard<std::mutex> _(mut_);
182#ifdef __APPLE__
183 auto symbol = es_.lookup(all_libs_, mangle_(Name));
184#else
185 auto symbol = es_.lookup(all_libs_, es_.intern(Name));
186#endif
187 if (!symbol)
188 TI_ERROR("Function \"{}\" not found", Name);
189 return (void *)(symbol->getAddress());
190 }
191
192 void *lookup_in_module(JITDylib *lib, const std::string Name) {
193 std::lock_guard<std::mutex> _(mut_);
194#ifdef __APPLE__
195 auto symbol = es_.lookup({lib}, mangle_(Name));
196#else
197 auto symbol = es_.lookup({lib}, es_.intern(Name));
198#endif
199 if (!symbol)
200 TI_ERROR("Function \"{}\" not found", Name);
201 return (void *)(symbol->getAddress());
202 }
203
204 private:
205 void global_optimize_module_cpu(llvm::Module *module);
206};
207
208void *JITModuleCPU::lookup_function(const std::string &name) {
209 return session_->lookup_in_module(dylib_, name);
210}
211
212void JITSessionCPU::global_optimize_module_cpu(llvm::Module *module) {
213 TI_AUTO_PROF
214 if (llvm::verifyModule(*module, &llvm::errs())) {
215 module->print(llvm::errs(), nullptr);
216 TI_ERROR("Module broken");
217 }
218
219 auto triple = get_host_target_info().first.getTargetTriple();
220
221 std::string err_str;
222 const llvm::Target *target =
223 TargetRegistry::lookupTarget(triple.str(), err_str);
224 TI_ERROR_UNLESS(target, err_str);
225
226 TargetOptions options;
227 if (this->config_.fast_math) {
228 options.AllowFPOpFusion = FPOpFusion::Fast;
229 options.UnsafeFPMath = 1;
230 options.NoInfsFPMath = 1;
231 options.NoNaNsFPMath = 1;
232 } else {
233 options.AllowFPOpFusion = FPOpFusion::Strict;
234 options.UnsafeFPMath = 0;
235 options.NoInfsFPMath = 0;
236 options.NoNaNsFPMath = 0;
237 }
238 options.HonorSignDependentRoundingFPMathOption = false;
239 options.NoZerosInBSS = false;
240 options.GuaranteedTailCallOpt = false;
241
242 legacy::FunctionPassManager function_pass_manager(module);
243 legacy::PassManager module_pass_manager;
244
245 llvm::StringRef mcpu = llvm::sys::getHostCPUName();
246 std::unique_ptr<TargetMachine> target_machine(target->createTargetMachine(
247 triple.str(), mcpu.str(), "", options, llvm::Reloc::PIC_,
248 llvm::CodeModel::Small, CodeGenOpt::Aggressive));
249
250 TI_ERROR_UNLESS(target_machine.get(), "Could not allocate target machine!");
251
252 module->setDataLayout(target_machine->createDataLayout());
253
254 module_pass_manager.add(createTargetTransformInfoWrapperPass(
255 target_machine->getTargetIRAnalysis()));
256 function_pass_manager.add(createTargetTransformInfoWrapperPass(
257 target_machine->getTargetIRAnalysis()));
258
259 PassManagerBuilder b;
260 b.OptLevel = 3;
261 b.Inliner = createFunctionInliningPass(b.OptLevel, 0, false);
262 b.LoopVectorize = true;
263 b.SLPVectorize = true;
264
265 target_machine->adjustPassManager(b);
266
267 b.populateFunctionPassManager(function_pass_manager);
268 b.populateModulePassManager(module_pass_manager);
269
270 {
271 TI_PROFILER("llvm_function_pass");
272 function_pass_manager.doInitialization();
273 for (llvm::Module::iterator i = module->begin(); i != module->end(); i++)
274 function_pass_manager.run(*i);
275
276 function_pass_manager.doFinalization();
277 }
278
279 /*
280 Optimization for llvm::GetElementPointer:
281 https://github.com/taichi-dev/taichi/issues/5472 The three other passes
282 "loop-reduce", "ind-vars", "cse" serves as preprocessing for
283 "separate-const-offset-gep".
284
285 Note there's an update for "separate-const-offset-gep" in llvm-12.
286 */
287 module_pass_manager.add(llvm::createLoopStrengthReducePass());
288 module_pass_manager.add(llvm::createIndVarSimplifyPass());
289 module_pass_manager.add(llvm::createSeparateConstOffsetFromGEPPass(false));
290 module_pass_manager.add(llvm::createEarlyCSEPass(true));
291
292 {
293 TI_PROFILER("llvm_module_pass");
294 module_pass_manager.run(*module);
295 }
296
297 if (this->config_.print_kernel_llvm_ir_optimized) {
298 if (false) {
299 TI_INFO("Functions with > 100 instructions in optimized LLVM IR:");
300 TaichiLLVMContext::print_huge_functions(module);
301 }
302 static FileSequenceWriter writer(
303 "taichi_kernel_cpu_llvm_ir_optimized_{:04d}.ll",
304 "optimized LLVM IR (CPU)");
305 writer.write(module);
306 }
307}
308
309std::unique_ptr<JITSession> create_llvm_jit_session_cpu(
310 TaichiLLVMContext *tlctx,
311 const CompileConfig &config,
312 Arch arch) {
313 TI_ASSERT(arch_is_cpu(arch));
314 auto target_info = get_host_target_info();
315 auto EPC = SelfExecutorProcessControl::Create();
316 TI_ASSERT(EPC);
317 return std::make_unique<JITSessionCPU>(tlctx, std::move(*EPC), config,
318 target_info.first, target_info.second);
319}
320
321} // namespace taichi::lang
322