#include "taichi/runtime/llvm/llvm_runtime_executor.h"

#include "taichi/runtime/llvm/llvm_offline_cache.h"
#include "taichi/runtime/llvm/runtime_module/mem_request.h"
#include "taichi/rhi/cpu/cpu_device.h"
#include "taichi/rhi/cuda/cuda_device.h"
#include "taichi/platform/cuda/detect_cuda.h"
#include "taichi/rhi/cuda/cuda_driver.h"

#if defined(TI_WITH_CUDA)
#include "taichi/rhi/cuda/cuda_context.h"
#endif

#include "taichi/platform/amdgpu/detect_amdgpu.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_device.h"
#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#endif

namespace taichi::lang {
namespace {
void assert_failed_host(const char *msg) {
  TI_ERROR("Assertion failure: {}", msg);
}

void *taichi_allocate_aligned(MemoryPool *memory_pool,
                              std::size_t size,
                              std::size_t alignment) {
  return memory_pool->allocate(size, alignment);
}
}  // namespace

LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
                                         KernelProfilerBase *profiler)
    : config_(config) {
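  // Validate the requested backend first: if the corresponding driver API or
  // a physical device is unavailable, fall back to the host architecture so
  // that program construction can still proceed on the CPU.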
  if (config.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    if (!is_cuda_api_available()) {
      TI_WARN("No CUDA driver API detected.");
      config.arch = host_arch();
    } else if (!CUDAContext::get_instance().detected()) {
      TI_WARN("No CUDA device detected.");
      config.arch = host_arch();
    } else {
      // CUDA runtime created successfully
    }
#else
    TI_WARN("Taichi is not compiled with CUDA.");
    config.arch = host_arch();
#endif

    if (config.arch != Arch::cuda) {
      TI_WARN("Falling back to {}.", arch_name(host_arch()));
    }
  } else if (config.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    if (!is_rocm_api_available()) {
      TI_WARN("No AMDGPU ROCm API detected.");
      config.arch = host_arch();
    } else if (!AMDGPUContext::get_instance().detected()) {
      TI_WARN("No AMDGPU device detected.");
      config.arch = host_arch();
    } else {
      // AMDGPU runtime created successfully
    }
#else
    TI_WARN("Taichi is not compiled with AMDGPU.");
    config.arch = host_arch();
#endif
  }

  if (config.kernel_profiler) {
    profiler_ = profiler;
  }

  snode_tree_buffer_manager_ = std::make_unique<SNodeTreeBufferManager>(this);
  thread_pool_ = std::make_unique<ThreadPool>(config.cpu_max_num_threads);
  preallocated_device_buffer_ = nullptr;

  llvm_runtime_ = nullptr;

  if (arch_is_cpu(config.arch)) {
    config.max_block_dim = 1024;
    device_ = std::make_shared<cpu::CpuDevice>();
  }
#if defined(TI_WITH_CUDA)
  else if (config.arch == Arch::cuda) {
    int num_SMs{1};
    CUDADriver::get_instance().device_get_attribute(
        &num_SMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, nullptr);
    int query_max_block_dim{1024};
    CUDADriver::get_instance().device_get_attribute(
        &query_max_block_dim, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, nullptr);
    int version{0};
    CUDADriver::get_instance().driver_get_version(&version);
    int query_max_block_per_sm{16};
    if (version >= 11000) {
      // This attribute is only exposed by CUDA 11.0 and newer drivers.
      CUDADriver::get_instance().device_get_attribute(
          &query_max_block_per_sm,
          CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR, nullptr);
    }

    if (config.max_block_dim == 0) {
      config.max_block_dim = query_max_block_dim;
    }

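    // Heuristic: size the "saturating" grid to twice the number of blocks
    // that can be resident on the whole device (SM count x max blocks per
    // SM), so grid-stride loops keep every SM occupied.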
    if (config.saturating_grid_dim == 0) {
      if (version >= 11000) {
        TI_TRACE("CUDA max blocks per SM = {}", query_max_block_per_sm);
      }
      config.saturating_grid_dim = num_SMs * query_max_block_per_sm * 2;
    }
    if (config.kernel_profiler) {
      CUDAContext::get_instance().set_profiler(profiler);
    } else {
      CUDAContext::get_instance().set_profiler(nullptr);
    }
    CUDAContext::get_instance().set_debug(config.debug);
    device_ = std::make_shared<cuda::CudaDevice>();
  }
#endif

#if defined(TI_WITH_AMDGPU)
  else if (config.arch == Arch::amdgpu) {
    int num_workgroups{1};
    AMDGPUDriver::get_instance().device_get_attribute(
        &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
    int query_max_block_dim{1024};
    AMDGPUDriver::get_instance().device_get_attribute(
        &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
    // No HIP device attribute was found for the maximum number of resident
    // blocks per CU, so fall back to a conservative default of 32.
    int query_max_block_per_cu{32};
    if (config.max_block_dim == 0) {
      config.max_block_dim = query_max_block_dim;
    }
    if (config.saturating_grid_dim == 0) {
      config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2;
    }
    AMDGPUContext::get_instance().set_debug(config.debug);
    device_ = std::make_shared<amdgpu::AmdgpuDevice>();
  }
#endif

#ifdef TI_WITH_DX12
  else if (config.arch == Arch::dx12) {
    // FIXME: add dx12 device.
    // FIXME: set value based on DX12.
    config.max_block_dim = 1024;
    device_ = std::make_shared<cpu::CpuDevice>();
  }
#endif
  else {
    TI_NOT_IMPLEMENTED
  }
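  // On CPU backends the LLVM context targets the host arch; either way, the
  // runtime bitcode module is cloned and JIT-compiled once per executor.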
  llvm_context_ = std::make_unique<TaichiLLVMContext>(
      config_, arch_is_cpu(config.arch) ? host_arch() : config.arch);
  init_runtime_jit_module(llvm_context_->clone_runtime_module());
}

TaichiLLVMContext *LlvmRuntimeExecutor::get_llvm_context() {
  return llvm_context_.get();
}

JITModule *LlvmRuntimeExecutor::create_jit_module(
    std::unique_ptr<llvm::Module> module) {
  return get_llvm_context()->jit->add_module(std::move(module));
}

JITModule *LlvmRuntimeExecutor::get_runtime_jit_module() {
  return runtime_jit_module_;
}

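// Note: runtime_query<T> (a member template, presumably declared in the
// header) calls a JIT-compiled accessor by name and reads its return value
// back through result_buffer; it is how the host inspects runtime data
// structures that may live in device memory.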
void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
                                                  uint64 *result_buffer) {
  auto list_manager_len = runtime_query<int32>("ListManager_get_num_elements",
                                               result_buffer, list_manager);

  auto element_size = runtime_query<int32>("ListManager_get_element_size",
                                           result_buffer, list_manager);

  auto elements_per_chunk =
      runtime_query<int32>("ListManager_get_max_num_elements_per_chunk",
                           result_buffer, list_manager);

  auto num_active_chunks = runtime_query<int32>(
      "ListManager_get_num_active_chunks", result_buffer, list_manager);

  auto size_MB = 1e-6f * num_active_chunks * elements_per_chunk * element_size;

  fmt::print(
      " length={:n} {:n} chunks x [{:n} x {:n} B] total={:.4f} MB\n",
      list_manager_len, num_active_chunks, elements_per_chunk, element_size,
      size_MB);
}

void LlvmRuntimeExecutor::synchronize() {
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().stream_synchronize(nullptr);
#else
    TI_ERROR("No CUDA support");
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().stream_synchronize(nullptr);
#else
    TI_ERROR("No AMDGPU support");
#endif
  }
  fflush(stdout);
}

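// The result buffer lives in device memory on CUDA/AMDGPU, so reading an
// entry requires a device-to-host copy; only CPU backends can read it
// directly.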
uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
  // TODO: We are likely doing more synchronization than necessary. Simplify
  // the sync logic when we fetch the result.
  synchronize();
  uint64 ret;
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                     sizeof(uint64));
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                       sizeof(uint64));
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    ret = result_buffer[i];
  }
  return ret;
}

std::size_t LlvmRuntimeExecutor::get_snode_num_dynamically_allocated(
    SNode *snode,
    uint64 *result_buffer) {
  TI_ASSERT(arch_uses_llvm(config_.arch));

  auto node_allocator =
      runtime_query<void *>("LLVMRuntime_get_node_allocators", result_buffer,
                            llvm_runtime_, snode->id);
  auto data_list = runtime_query<void *>("NodeManager_get_data_list",
                                         result_buffer, node_allocator);

  return (std::size_t)runtime_query<int32>("ListManager_get_num_elements",
                                           result_buffer, data_list);
}

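// Error-reporting protocol: the device-side runtime records an error code
// plus a printf-style message template and its arguments; the host retrieves
// them one result-buffer slot at a time and formats the final message.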
void LlvmRuntimeExecutor::check_runtime_error(uint64 *result_buffer) {
  synchronize();
  auto *runtime_jit_module = get_runtime_jit_module();
  runtime_jit_module->call<void *>("runtime_retrieve_and_reset_error_code",
                                   llvm_runtime_);
  auto error_code =
      fetch_result<int64>(taichi_result_buffer_error_id, result_buffer);

  if (error_code) {
    std::string error_message_template;

    // Here we fetch the error_message_template char by char. This is not
    // efficient, but fortunately we only need to do this when an assertion
    // fails. Note that we may not have unified memory here, so using
    // "fetch_result", which works across device/host memory, is necessary.
    for (int i = 0;; i++) {
      runtime_jit_module->call<void *>("runtime_retrieve_error_message",
                                       llvm_runtime_, i);
      auto c = fetch_result<char>(taichi_result_buffer_error_id, result_buffer);
      error_message_template += c;
      if (c == '\0') {
        break;
      }
    }

    if (error_code == 1) {
      const auto error_message_formatted = format_error_message(
          error_message_template,
          [runtime_jit_module, result_buffer, this](int argument_id) {
            runtime_jit_module->call<void *>(
                "runtime_retrieve_error_message_argument", llvm_runtime_,
                argument_id);
            return fetch_result<uint64>(taichi_result_buffer_error_id,
                                        result_buffer);
          });
      throw TaichiAssertionError(error_message_formatted);
    } else {
      TI_NOT_IMPLEMENTED
    }
  }
}

void LlvmRuntimeExecutor::print_memory_profiler_info(
    std::vector<std::unique_ptr<SNodeTree>> &snode_trees_,
    uint64 *result_buffer) {
  TI_ASSERT(arch_uses_llvm(config_.arch));

  fmt::print("\n[Memory Profiler]\n");

  std::locale::global(std::locale("en_US.UTF-8"));
  // So that thousand separators are added to "{:n}" slots in fmtlib.
  // E.g., 10000 is printed as "10,000".
  // TODO: is there a way to set locale only locally in this function?

  std::function<void(SNode *, int)> visit = [&](SNode *snode, int depth) {
    auto element_list =
        runtime_query<void *>("LLVMRuntime_get_element_lists", result_buffer,
                              llvm_runtime_, snode->id);

    if (snode->type != SNodeType::place) {
      fmt::print("SNode {:10}\n", snode->get_node_type_name_hinted());

      if (element_list) {
        fmt::print(" active element list:");
        print_list_manager_info(element_list, result_buffer);

        auto node_allocator =
            runtime_query<void *>("LLVMRuntime_get_node_allocators",
                                  result_buffer, llvm_runtime_, snode->id);

        if (node_allocator) {
          auto free_list = runtime_query<void *>("NodeManager_get_free_list",
                                                 result_buffer, node_allocator);
          auto recycled_list = runtime_query<void *>(
              "NodeManager_get_recycled_list", result_buffer, node_allocator);

          auto free_list_len = runtime_query<int32>(
              "ListManager_get_num_elements", result_buffer, free_list);

          auto recycled_list_len = runtime_query<int32>(
              "ListManager_get_num_elements", result_buffer, recycled_list);

          auto free_list_used = runtime_query<int32>(
              "NodeManager_get_free_list_used", result_buffer, node_allocator);

          auto data_list = runtime_query<void *>("NodeManager_get_data_list",
                                                 result_buffer, node_allocator);
          fmt::print(" data list: ");
          print_list_manager_info(data_list, result_buffer);

          fmt::print(
              " Allocated elements={:n}; free list length={:n}; recycled list "
              "length={:n}\n",
              free_list_used, free_list_len, recycled_list_len);
        }
      }
    }
    for (const auto &ch : snode->ch) {
      visit(ch.get(), depth + 1);
    }
  };

  for (auto &a : snode_trees_) {
    visit(a->root(), /*depth=*/0);
  }

  auto total_requested_memory = runtime_query<std::size_t>(
      "LLVMRuntime_get_total_requested_memory", result_buffer, llvm_runtime_);

  fmt::print(
      "Total requested dynamic memory (excluding alignment padding): {:n} B\n",
      total_requested_memory);
}

DevicePtr LlvmRuntimeExecutor::get_snode_tree_device_ptr(int tree_id) {
  DeviceAllocation tree_alloc = snode_tree_allocs_[tree_id];
  return tree_alloc.get_ptr();
}

void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
    const LlvmOfflineCache::FieldCacheData &field_cache_data,
    uint64 *result_buffer) {
  auto *const runtime_jit = get_runtime_jit_module();
  // By the time this creator is called, "this" is already destroyed.
  // Therefore it is necessary to capture members by value.
  size_t root_size = field_cache_data.root_size;
  const auto snode_metas = field_cache_data.snode_metas;
  const int tree_id = field_cache_data.tree_id;
  const int root_id = field_cache_data.root_id;

  TI_TRACE("Allocating data structure of size {} bytes", root_size);
  std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);

  Ptr root_buffer = snode_tree_buffer_manager_->allocate(
      runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id,
      result_buffer);
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    std::memset(root_buffer, 0, rounded_size);
  }

  DeviceAllocation alloc{kDeviceNullAllocation};

  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    alloc = cuda_device()->import_memory(root_buffer, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    alloc = amdgpu_device()->import_memory(root_buffer, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    alloc = cpu_device()->import_memory(root_buffer, rounded_size);
  }

  snode_tree_allocs_[tree_id] = alloc;

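  // A tree counts as "all dense" when every SNode in it is root/dense/place
  // (or when dense struct-fors are demoted), which presumably lets the
  // runtime skip sparse bookkeeping for this tree.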
  bool all_dense = config_.demote_dense_struct_fors;
  for (size_t i = 0; i < snode_metas.size(); i++) {
    if (snode_metas[i].type != SNodeType::dense &&
        snode_metas[i].type != SNodeType::place &&
        snode_metas[i].type != SNodeType::root) {
      all_dense = false;
      break;
    }
  }

  runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
      "runtime_initialize_snodes", llvm_runtime_, root_size, root_id,
      (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense);

  for (size_t i = 0; i < snode_metas.size(); i++) {
    if (is_gc_able(snode_metas[i].type)) {
      const auto snode_id = snode_metas[i].id;
      std::size_t node_size;
      auto element_size = snode_metas[i].cell_size_bytes;
      if (snode_metas[i].type == SNodeType::pointer) {
        // Pointer SNode: the allocator hands out single elements.
        node_size = element_size;
      } else {
        // Dynamic SNode: the allocator hands out whole chunks.
        node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size;
      }
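      // For example (hypothetical numbers, 64-bit host): 4 B cells with
      // chunk_size = 64 give a node size of 8 + 64 * 4 = 264 bytes; the extra
      // pointer-sized slot presumably links chunks together.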
455 | TI_TRACE("Initializing allocator for snode {} (node size {})" , snode_id, |
456 | node_size); |
457 | runtime_jit->call<void *, int, std::size_t>( |
458 | "runtime_NodeAllocator_initialize" , llvm_runtime_, snode_id, |
459 | node_size); |
460 | TI_TRACE("Allocating ambient element for snode {} (node size {})" , |
461 | snode_id, node_size); |
462 | runtime_jit->call<void *, int>("runtime_allocate_ambient" , llvm_runtime_, |
463 | snode_id, node_size); |
464 | } |
465 | } |
466 | } |

cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() {
  if (config_.arch != Arch::cuda) {
    TI_ERROR("arch is not cuda");
  }
  return static_cast<cuda::CudaDevice *>(device_.get());
}

amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() {
  if (config_.arch != Arch::amdgpu) {
    TI_ERROR("arch is not amdgpu");
  }
  return static_cast<amdgpu::AmdgpuDevice *>(device_.get());
}

cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() {
  TI_ERROR_IF(!arch_is_cpu(config_.arch), "arch is not cpu");
  return static_cast<cpu::CpuDevice *>(device_.get());
}

LlvmDevice *LlvmRuntimeExecutor::llvm_device() {
  TI_ASSERT(dynamic_cast<LlvmDevice *>(device_.get()));
  return static_cast<LlvmDevice *>(device_.get());
}

DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
    std::size_t alloc_size,
    uint64 *result_buffer) {
  return llvm_device()->allocate_memory_runtime(
      {{alloc_size, /*host_write=*/false, /*host_read=*/false,
        /*export_sharing=*/false, AllocUsage::Storage},
       config_.ndarray_use_cached_allocator,
       get_runtime_jit_module(),
       get_llvm_runtime(),
       result_buffer});
}

void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) {
  cuda_device()->dealloc_memory(handle);
}

void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
                                       std::size_t size,
                                       uint32_t data) {
  auto ptr = get_ndarray_alloc_info_ptr(alloc);
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memset((void *)ptr, data, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data);
  }
}

uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
    const DeviceAllocation &alloc) {
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr;
  }
}

void LlvmRuntimeExecutor::finalize() {
  profiler_ = nullptr;
  if (preallocated_device_buffer_ != nullptr) {
    if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
      cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
#endif
    } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
#endif
    }
  }
}

void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                              KernelProfilerBase *profiler,
                                              uint64 **result_buffer_ptr) {
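  // Materialization allocates the result buffer and, on GPU backends, a large
  // preallocated device arena; the JIT-compiled runtime_initialize then
  // constructs the LLVMRuntime object inside that memory.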
  std::size_t prealloc_size = 0;
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().malloc(
        (void **)result_buffer_ptr,
        sizeof(uint64) * taichi_result_buffer_entries);
    const auto total_mem = CUDAContext::get_instance().get_total_memory();
    if (config_.device_memory_fraction == 0) {
      TI_ASSERT(config_.device_memory_GB > 0);
      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
    } else {
      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
    }
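    // For example (hypothetical numbers): device_memory_GB = 1 preallocates
    // exactly 1 GiB, while device_memory_fraction = 0.5 on a device with
    // 8 GiB of total memory preallocates 4 GiB.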
    TI_ASSERT(prealloc_size <= total_mem);

    TI_TRACE("Allocating device memory {:.2f} GB",
             1.0 * prealloc_size / (1UL << 30));

    Device::AllocParams preallocated_device_buffer_alloc_params;
    preallocated_device_buffer_alloc_params.size = prealloc_size;
    preallocated_device_buffer_alloc_ = cuda_device()->allocate_memory(
        preallocated_device_buffer_alloc_params);
    cuda::CudaDevice::AllocInfo preallocated_device_buffer_alloc_info =
        cuda_device()->get_alloc_info(preallocated_device_buffer_alloc_);
    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;

    CUDADriver::get_instance().memset(preallocated_device_buffer_, 0,
                                      prealloc_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().malloc(
        (void **)result_buffer_ptr,
        sizeof(uint64) * taichi_result_buffer_entries);
    const auto total_mem = AMDGPUContext::get_instance().get_total_memory();
    if (config_.device_memory_fraction == 0) {
      TI_ASSERT(config_.device_memory_GB > 0);
      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
    } else {
      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
    }
    TI_ASSERT(prealloc_size <= total_mem);

    TI_TRACE("Allocating device memory {:.2f} GB",
             1.0 * prealloc_size / (1UL << 30));

    Device::AllocParams preallocated_device_buffer_alloc_params;
    preallocated_device_buffer_alloc_params.size = prealloc_size;
    preallocated_device_buffer_alloc_ = amdgpu_device()->allocate_memory(
        preallocated_device_buffer_alloc_params);
    amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info =
        amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_);
    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;

    AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
                                        prealloc_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    *result_buffer_ptr = (uint64 *)memory_pool->allocate(
        sizeof(uint64) * taichi_result_buffer_entries, 8);
  }
  auto *const runtime_jit = get_runtime_jit_module();

  // Starting random state for the program calculated using the random seed.
  // The seed is multiplied by 1048391 so that two programs with different
  // seeds will not have overlapping random states in any thread.
  int starting_rand_state = config_.random_seed * 1048391;

  // Number of random states. One per CPU/CUDA thread.
  int num_rand_states = 0;

  if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
    // It is important to make sure that every CUDA thread has its own random
    // state so that we do not need expensive per-state locks.
    num_rand_states = config_.saturating_grid_dim * config_.max_block_dim;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    num_rand_states = config_.cpu_max_num_threads;
  }
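  // With the defaults above, num_rand_states covers every thread that can be
  // resident at once: saturating_grid_dim * max_block_dim threads on GPUs,
  // and one state per worker thread on CPUs.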

  TI_TRACE("Launching runtime_initialize");

  runtime_jit
      ->call<void *, void *, std::size_t, void *, int, void *, void *, void *>(
          "runtime_initialize", *result_buffer_ptr, memory_pool, prealloc_size,
          preallocated_device_buffer_, num_rand_states,
          (void *)&taichi_allocate_aligned, (void *)std::printf,
          (void *)std::vsnprintf);

  TI_TRACE("LLVMRuntime initialized (excluding `root`)");
  llvm_runtime_ = fetch_result<void *>(taichi_result_buffer_ret_value_id,
                                       *result_buffer_ptr);
  TI_TRACE("LLVMRuntime pointer fetched");

  if (config_.arch == Arch::cuda) {
    TI_TRACE("Initializing {} random states using CUDA", num_rand_states);
    runtime_jit->launch<void *, int>(
        "runtime_initialize_rand_states_cuda", config_.saturating_grid_dim,
        config_.max_block_dim, 0, llvm_runtime_, starting_rand_state);
  } else {
    TI_TRACE("Initializing {} random states (serially)", num_rand_states);
    runtime_jit->call<void *, int>("runtime_initialize_rand_states_serial",
                                   llvm_runtime_, starting_rand_state);
  }

  if (arch_use_host_memory(config_.arch)) {
    runtime_jit->call<void *>("runtime_get_mem_req_queue", llvm_runtime_);
    auto mem_req_queue = fetch_result<void *>(taichi_result_buffer_ret_value_id,
                                              *result_buffer_ptr);
    memory_pool->set_queue((MemRequestQueue *)mem_req_queue);
  }

  if (arch_use_host_memory(config_.arch)) {
    runtime_jit->call<void *, void *, void *>(
        "LLVMRuntime_initialize_thread_pool", llvm_runtime_, thread_pool_.get(),
        (void *)ThreadPool::static_run);

    runtime_jit->call<void *, void *>("LLVMRuntime_set_assert_failed",
                                      llvm_runtime_,
                                      (void *)assert_failed_host);
  }
  if (arch_is_cpu(config_.arch) && (profiler != nullptr)) {
    // Profiler functions can only be called on CPU kernels.
    runtime_jit->call<void *, void *>("LLVMRuntime_set_profiler", llvm_runtime_,
                                      profiler);
    runtime_jit->call<void *, void *>(
        "LLVMRuntime_set_profiler_start", llvm_runtime_,
        (void *)&KernelProfilerBase::profiler_start);
    runtime_jit->call<void *, void *>(
        "LLVMRuntime_set_profiler_stop", llvm_runtime_,
        (void *)&KernelProfilerBase::profiler_stop);
  }
  if (arch_is_cpu(config_.arch) || config_.arch == Arch::cuda) {
    runtime_jit->call<void *>("runtime_initialize_runtime_context_buffer",
                              llvm_runtime_);
  }
}

void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {
  get_llvm_context()->delete_snode_tree(snode_tree->id());
  snode_tree_buffer_manager_->destroy(snode_tree);
}

Device *LlvmRuntimeExecutor::get_compute_device() {
  return device_.get();
}

LLVMRuntime *LlvmRuntimeExecutor::get_llvm_runtime() {
  return static_cast<LLVMRuntime *>(llvm_runtime_);
}

void LlvmRuntimeExecutor::prepare_runtime_context(RuntimeContext *ctx) {
  ctx->runtime = get_llvm_runtime();
}

void LlvmRuntimeExecutor::init_runtime_jit_module(
    std::unique_ptr<llvm::Module> module) {
  llvm_context_->init_runtime_module(module.get());
  runtime_jit_module_ = create_jit_module(std::move(module));
}

void LlvmRuntimeExecutor::fetch_result_impl(void *dest,
                                            char *result_buffer,
                                            int offset,
                                            int size) {
  synchronize();
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memcpy_device_to_host(
        dest, result_buffer + offset, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memcpy_device_to_host(
        dest, result_buffer + offset, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    memcpy(dest, result_buffer + offset, size);
  }
}

}  // namespace taichi::lang