#include "taichi/runtime/llvm/llvm_runtime_executor.h"

#include "taichi/runtime/llvm/llvm_offline_cache.h"
#include "taichi/runtime/llvm/runtime_module/mem_request.h"
#include "taichi/rhi/cpu/cpu_device.h"
#include "taichi/rhi/cuda/cuda_device.h"
#include "taichi/platform/cuda/detect_cuda.h"
#include "taichi/rhi/cuda/cuda_driver.h"

#if defined(TI_WITH_CUDA)
#include "taichi/rhi/cuda/cuda_context.h"
#endif

#include "taichi/platform/amdgpu/detect_amdgpu.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_device.h"
#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#endif

namespace taichi::lang {
namespace {
void assert_failed_host(const char *msg) {
  TI_ERROR("Assertion failure: {}", msg);
}

void *taichi_allocate_aligned(MemoryPool *memory_pool,
                              std::size_t size,
                              std::size_t alignment) {
  return memory_pool->allocate(size, alignment);
}
}  // namespace

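// Selects the backend: falls back to the host arch when the requested GPU
// backend is unavailable, picks launch-dimension defaults from the device,
// creates the matching RHI device and thread pool, and finally builds the
// LLVM context and runtime JIT module.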
LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
                                         KernelProfilerBase *profiler)
    : config_(config) {
  if (config.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    if (!is_cuda_api_available()) {
      TI_WARN("No CUDA driver API detected.");
      config.arch = host_arch();
    } else if (!CUDAContext::get_instance().detected()) {
      TI_WARN("No CUDA device detected.");
      config.arch = host_arch();
    } else {
      // CUDA runtime created successfully
    }
#else
    TI_WARN("Taichi is not compiled with CUDA.");
    config.arch = host_arch();
#endif

    if (config.arch != Arch::cuda) {
      TI_WARN("Falling back to {}.", arch_name(host_arch()));
    }
  } else if (config.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    if (!is_rocm_api_available()) {
      TI_WARN("No AMDGPU ROCm API detected.");
      config.arch = host_arch();
    } else if (!AMDGPUContext::get_instance().detected()) {
      TI_WARN("No AMDGPU device detected.");
      config.arch = host_arch();
    } else {
      // AMDGPU runtime created successfully
    }
#else
    TI_WARN("Taichi is not compiled with AMDGPU.");
    config.arch = host_arch();
#endif
  }

  if (config.kernel_profiler) {
    profiler_ = profiler;
  }

  snode_tree_buffer_manager_ = std::make_unique<SNodeTreeBufferManager>(this);
  thread_pool_ = std::make_unique<ThreadPool>(config.cpu_max_num_threads);
  preallocated_device_buffer_ = nullptr;

  llvm_runtime_ = nullptr;

  if (arch_is_cpu(config.arch)) {
    config.max_block_dim = 1024;
    device_ = std::make_shared<cpu::CpuDevice>();
  }
#if defined(TI_WITH_CUDA)
  else if (config.arch == Arch::cuda) {
    int num_SMs{1};
    CUDADriver::get_instance().device_get_attribute(
        &num_SMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, nullptr);
    int query_max_block_dim{1024};
    CUDADriver::get_instance().device_get_attribute(
        &query_max_block_dim, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, nullptr);
    int version{0};
    CUDADriver::get_instance().driver_get_version(&version);
    int query_max_block_per_sm{16};
    if (version >= 11000) {
      // Query this attribute only when the CUDA version is at least 11.0.
      CUDADriver::get_instance().device_get_attribute(
          &query_max_block_per_sm,
          CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR, nullptr);
    }

    if (config.max_block_dim == 0) {
      config.max_block_dim = query_max_block_dim;
    }

    if (config.saturating_grid_dim == 0) {
      if (version >= 11000) {
        TI_TRACE("CUDA max blocks per SM = {}", query_max_block_per_sm);
      }
      config.saturating_grid_dim = num_SMs * query_max_block_per_sm * 2;
    }
    if (config.kernel_profiler) {
      CUDAContext::get_instance().set_profiler(profiler);
    } else {
      CUDAContext::get_instance().set_profiler(nullptr);
    }
    CUDAContext::get_instance().set_debug(config.debug);
    device_ = std::make_shared<cuda::CudaDevice>();
  }
#endif

#if defined(TI_WITH_AMDGPU)
  else if (config.arch == Arch::amdgpu) {
    int num_workgroups{1};
    AMDGPUDriver::get_instance().device_get_attribute(
        &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
    int query_max_block_dim{1024};
    AMDGPUDriver::get_instance().device_get_attribute(
        &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
    // There is no obvious HIP device attribute for the maximum number of
    // blocks per CU, so fall back to a conservative constant of 32.
    int query_max_block_per_cu{32};
    if (config.max_block_dim == 0) {
      config.max_block_dim = query_max_block_dim;
    }
    if (config.saturating_grid_dim == 0) {
      config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2;
    }
    AMDGPUContext::get_instance().set_debug(config.debug);
    device_ = std::make_shared<amdgpu::AmdgpuDevice>();
  }
#endif

#ifdef TI_WITH_DX12
  else if (config.arch == Arch::dx12) {
    // FIXME: add dx12 device.
    // FIXME: set value based on DX12.
    config.max_block_dim = 1024;
    device_ = std::make_shared<cpu::CpuDevice>();
  }
#endif
  else {
    TI_NOT_IMPLEMENTED
  }
  llvm_context_ = std::make_unique<TaichiLLVMContext>(
      config_, arch_is_cpu(config.arch) ? host_arch() : config.arch);
  init_runtime_jit_module(llvm_context_->clone_runtime_module());
}

TaichiLLVMContext *LlvmRuntimeExecutor::get_llvm_context() {
  return llvm_context_.get();
}

JITModule *LlvmRuntimeExecutor::create_jit_module(
    std::unique_ptr<llvm::Module> module) {
  return get_llvm_context()->jit->add_module(std::move(module));
}

JITModule *LlvmRuntimeExecutor::get_runtime_jit_module() {
  return runtime_jit_module_;
}

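// Debug helper: prints the length, chunk layout, and total size of a runtime
// ListManager, using runtime queries that go through the result buffer.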
void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
                                                  uint64 *result_buffer) {
  auto list_manager_len = runtime_query<int32>("ListManager_get_num_elements",
                                               result_buffer, list_manager);

  auto element_size = runtime_query<int32>("ListManager_get_element_size",
                                           result_buffer, list_manager);

  auto elements_per_chunk =
      runtime_query<int32>("ListManager_get_max_num_elements_per_chunk",
                           result_buffer, list_manager);

  auto num_active_chunks = runtime_query<int32>(
      "ListManager_get_num_active_chunks", result_buffer, list_manager);

  auto size_MB = 1e-6f * num_active_chunks * elements_per_chunk * element_size;

  fmt::print(
      " length={:n} {:n} chunks x [{:n} x {:n} B] total={:.4f} MB\n",
      list_manager_len, num_active_chunks, elements_per_chunk, element_size,
      size_MB);
}

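// Blocks until outstanding work on the CUDA/AMDGPU stream has finished, then
// flushes stdout. On CPU backends only the flush happens.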
void LlvmRuntimeExecutor::synchronize() {
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().stream_synchronize(nullptr);
#else
    TI_ERROR("No CUDA support");
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().stream_synchronize(nullptr);
#else
    TI_ERROR("No AMDGPU support");
#endif
  }
  fflush(stdout);
}

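// Reads the i-th uint64 slot of the result buffer, copying device-to-host
// when the buffer lives in GPU memory.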
uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
  // TODO: We are likely doing more synchronization than necessary. Simplify the
  // sync logic when we fetch the result.
  synchronize();
  uint64 ret;
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                     sizeof(uint64));
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                       sizeof(uint64));
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    ret = result_buffer[i];
  }
  return ret;
}

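// Returns the number of elements currently in the SNode's NodeManager data
// list, i.e. how many cells have been dynamically allocated so far.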
std::size_t LlvmRuntimeExecutor::get_snode_num_dynamically_allocated(
    SNode *snode,
    uint64 *result_buffer) {
  TI_ASSERT(arch_uses_llvm(config_.arch));

  auto node_allocator =
      runtime_query<void *>("LLVMRuntime_get_node_allocators", result_buffer,
                            llvm_runtime_, snode->id);
  auto data_list = runtime_query<void *>("NodeManager_get_data_list",
                                         result_buffer, node_allocator);

  return (std::size_t)runtime_query<int32>("ListManager_get_num_elements",
                                           result_buffer, data_list);
}

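// Retrieves the error code set by device-side assertions, reconstructs the
// error message (fetched byte by byte through the result buffer), and throws
// a TaichiAssertionError for assertion failures (error code 1).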
void LlvmRuntimeExecutor::check_runtime_error(uint64 *result_buffer) {
  synchronize();
  auto *runtime_jit_module = get_runtime_jit_module();
  runtime_jit_module->call<void *>("runtime_retrieve_and_reset_error_code",
                                   llvm_runtime_);
  auto error_code =
      fetch_result<int64>(taichi_result_buffer_error_id, result_buffer);

  if (error_code) {
    std::string error_message_template;

    // Here we fetch the error_message_template char by char.
    // This is not efficient, but fortunately we only need to do this when an
    // assertion fails. Note that we may not have unified memory here, so using
    // "fetch_result" that works across device/host memory is necessary.
    for (int i = 0;; i++) {
      runtime_jit_module->call<void *>("runtime_retrieve_error_message",
                                       llvm_runtime_, i);
      auto c = fetch_result<char>(taichi_result_buffer_error_id, result_buffer);
      error_message_template += c;
      if (c == '\0') {
        break;
      }
    }

    if (error_code == 1) {
      const auto error_message_formatted = format_error_message(
          error_message_template,
          [runtime_jit_module, result_buffer, this](int argument_id) {
            runtime_jit_module->call<void *>(
                "runtime_retrieve_error_message_argument", llvm_runtime_,
                argument_id);
            return fetch_result<uint64>(taichi_result_buffer_error_id,
                                        result_buffer);
          });
      throw TaichiAssertionError(error_message_formatted);
    } else {
      TI_NOT_IMPLEMENTED
    }
  }
}

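// Walks every SNode tree and prints, per non-place SNode, its element list
// and (when present) the NodeManager's data/free/recycled lists, followed by
// the total dynamic memory requested from the LLVM runtime.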
void LlvmRuntimeExecutor::print_memory_profiler_info(
    std::vector<std::unique_ptr<SNodeTree>> &snode_trees_,
    uint64 *result_buffer) {
  TI_ASSERT(arch_uses_llvm(config_.arch));

  fmt::print("\n[Memory Profiler]\n");

  std::locale::global(std::locale("en_US.UTF-8"));
  // So that thousand separators are added to "{:n}" slots in fmtlib.
  // E.g., 10000 is printed as "10,000".
  // TODO: is there a way to set locale only locally in this function?

  std::function<void(SNode *, int)> visit = [&](SNode *snode, int depth) {
    auto element_list =
        runtime_query<void *>("LLVMRuntime_get_element_lists", result_buffer,
                              llvm_runtime_, snode->id);

    if (snode->type != SNodeType::place) {
      fmt::print("SNode {:10}\n", snode->get_node_type_name_hinted());

      if (element_list) {
        fmt::print(" active element list:");
        print_list_manager_info(element_list, result_buffer);

        auto node_allocator =
            runtime_query<void *>("LLVMRuntime_get_node_allocators",
                                  result_buffer, llvm_runtime_, snode->id);

        if (node_allocator) {
          auto free_list = runtime_query<void *>("NodeManager_get_free_list",
                                                 result_buffer, node_allocator);
          auto recycled_list = runtime_query<void *>(
              "NodeManager_get_recycled_list", result_buffer, node_allocator);

          auto free_list_len = runtime_query<int32>(
              "ListManager_get_num_elements", result_buffer, free_list);

          auto recycled_list_len = runtime_query<int32>(
              "ListManager_get_num_elements", result_buffer, recycled_list);

          auto free_list_used = runtime_query<int32>(
              "NodeManager_get_free_list_used", result_buffer, node_allocator);

          auto data_list = runtime_query<void *>("NodeManager_get_data_list",
                                                 result_buffer, node_allocator);
          fmt::print(" data list: ");
          print_list_manager_info(data_list, result_buffer);

          fmt::print(
              " Allocated elements={:n}; free list length={:n}; recycled list "
              "length={:n}\n",
              free_list_used, free_list_len, recycled_list_len);
        }
      }
    }
    for (const auto &ch : snode->ch) {
      visit(ch.get(), depth + 1);
    }
  };

  for (auto &a : snode_trees_) {
    visit(a->root(), /*depth=*/0);
  }

  auto total_requested_memory = runtime_query<std::size_t>(
      "LLVMRuntime_get_total_requested_memory", result_buffer, llvm_runtime_);

  fmt::print(
      "Total requested dynamic memory (excluding alignment padding): {:n} B\n",
      total_requested_memory);
}

DevicePtr LlvmRuntimeExecutor::get_snode_tree_device_ptr(int tree_id) {
  DeviceAllocation tree_alloc = snode_tree_allocs_[tree_id];
  return tree_alloc.get_ptr();
}

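// Allocates and zero-fills the root buffer for one SNode tree, registers it
// with the LLVM runtime, and sets up a NodeAllocator (plus an ambient
// element) for every GC-able SNode in the tree.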
void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
    const LlvmOfflineCache::FieldCacheData &field_cache_data,
    uint64 *result_buffer) {
  auto *const runtime_jit = get_runtime_jit_module();
  // By the time this creator is called, "this" is already destroyed.
  // Therefore it is necessary to capture members by values.
  size_t root_size = field_cache_data.root_size;
  const auto snode_metas = field_cache_data.snode_metas;
  const int tree_id = field_cache_data.tree_id;
  const int root_id = field_cache_data.root_id;

  TI_TRACE("Allocating data structure of size {} bytes", root_size);
  std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);

  Ptr root_buffer = snode_tree_buffer_manager_->allocate(
      runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id,
      result_buffer);
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    std::memset(root_buffer, 0, rounded_size);
  }

  DeviceAllocation alloc{kDeviceNullAllocation};

  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    alloc = cuda_device()->import_memory(root_buffer, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    alloc = amdgpu_device()->import_memory(root_buffer, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    alloc = cpu_device()->import_memory(root_buffer, rounded_size);
  }

  snode_tree_allocs_[tree_id] = alloc;

  bool all_dense = config_.demote_dense_struct_fors;
  for (size_t i = 0; i < snode_metas.size(); i++) {
    if (snode_metas[i].type != SNodeType::dense &&
        snode_metas[i].type != SNodeType::place &&
        snode_metas[i].type != SNodeType::root) {
      all_dense = false;
      break;
    }
  }

  runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
      "runtime_initialize_snodes", llvm_runtime_, root_size, root_id,
      (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense);

  for (size_t i = 0; i < snode_metas.size(); i++) {
    if (is_gc_able(snode_metas[i].type)) {
      const auto snode_id = snode_metas[i].id;
      std::size_t node_size;
      auto element_size = snode_metas[i].cell_size_bytes;
      if (snode_metas[i].type == SNodeType::pointer) {
        // pointer. Allocators are for single elements
        node_size = element_size;
      } else {
        // dynamic. Allocators are for the chunks
        node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size;
      }
      TI_TRACE("Initializing allocator for snode {} (node size {})", snode_id,
               node_size);
      runtime_jit->call<void *, int, std::size_t>(
          "runtime_NodeAllocator_initialize", llvm_runtime_, snode_id,
          node_size);
      TI_TRACE("Allocating ambient element for snode {} (node size {})",
               snode_id, node_size);
      runtime_jit->call<void *, int>("runtime_allocate_ambient", llvm_runtime_,
                                     snode_id, node_size);
    }
  }
}

cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() {
  if (config_.arch != Arch::cuda) {
    TI_ERROR("arch is not cuda");
  }
  return static_cast<cuda::CudaDevice *>(device_.get());
}

amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() {
  if (config_.arch != Arch::amdgpu) {
    TI_ERROR("arch is not amdgpu");
  }
  return static_cast<amdgpu::AmdgpuDevice *>(device_.get());
}

cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() {
  TI_ERROR_IF(!arch_is_cpu(config_.arch), "arch is not cpu");
  return static_cast<cpu::CpuDevice *>(device_.get());
}

LlvmDevice *LlvmRuntimeExecutor::llvm_device() {
  TI_ASSERT(dynamic_cast<LlvmDevice *>(device_.get()));
  return static_cast<LlvmDevice *>(device_.get());
}

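// Allocates ndarray storage via the backend's LlvmDevice; the runtime JIT
// module and LLVMRuntime pointer are forwarded as part of the allocation
// parameters.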
DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
    std::size_t alloc_size,
    uint64 *result_buffer) {
  return llvm_device()->allocate_memory_runtime(
      {{alloc_size, /*host_write=*/false, /*host_read=*/false,
        /*export_sharing=*/false, AllocUsage::Storage},
       config_.ndarray_use_cached_allocator,
       get_runtime_jit_module(),
       get_llvm_runtime(),
       result_buffer});
}

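// Note: this currently frees through the CUDA device only; cuda_device()
// raises an error on other backends.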
void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) {
  cuda_device()->dealloc_memory(handle);
}

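// Fills `size` 32-bit elements of an ndarray allocation with `data`. The CUDA
// path uses memsetd32 and the CPU path std::fill; the AMDGPU path currently
// reuses the driver's generic memset.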
void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
                                       std::size_t size,
                                       uint32_t data) {
  auto ptr = get_ndarray_alloc_info_ptr(alloc);
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memset((void *)ptr, data, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data);
  }
}

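// Resolves a DeviceAllocation handle to the raw pointer recorded by the
// owning device (CUDA, AMDGPU, or CPU).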
uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
    const DeviceAllocation &alloc) {
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr;
  }
}

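// Releases the preallocated GPU buffer (if one was created) and drops the
// profiler reference.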
void LlvmRuntimeExecutor::finalize() {
  profiler_ = nullptr;
  if (preallocated_device_buffer_ != nullptr) {
    if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
      cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
#endif
    } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
#endif
    }
  }
}

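// Sets up the LLVMRuntime instance: allocates the result buffer and (on GPU
// backends) a preallocated device memory pool, calls runtime_initialize,
// seeds the per-thread random states, and wires up the thread pool, assertion
// handler, and profiler callbacks where applicable.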
void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                              KernelProfilerBase *profiler,
                                              uint64 **result_buffer_ptr) {
  std::size_t prealloc_size = 0;
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().malloc(
        (void **)result_buffer_ptr,
        sizeof(uint64) * taichi_result_buffer_entries);
    const auto total_mem = CUDAContext::get_instance().get_total_memory();
    if (config_.device_memory_fraction == 0) {
      TI_ASSERT(config_.device_memory_GB > 0);
      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
    } else {
      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
    }
    TI_ASSERT(prealloc_size <= total_mem);

    TI_TRACE("Allocating device memory {:.2f} GB",
             1.0 * prealloc_size / (1UL << 30));

    Device::AllocParams preallocated_device_buffer_alloc_params;
    preallocated_device_buffer_alloc_params.size = prealloc_size;
    preallocated_device_buffer_alloc_ =
        cuda_device()->allocate_memory(preallocated_device_buffer_alloc_params);
    cuda::CudaDevice::AllocInfo preallocated_device_buffer_alloc_info =
        cuda_device()->get_alloc_info(preallocated_device_buffer_alloc_);
    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;

    CUDADriver::get_instance().memset(preallocated_device_buffer_, 0,
                                      prealloc_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().malloc(
        (void **)result_buffer_ptr,
        sizeof(uint64) * taichi_result_buffer_entries);
    const auto total_mem = AMDGPUContext::get_instance().get_total_memory();
    if (config_.device_memory_fraction == 0) {
      TI_ASSERT(config_.device_memory_GB > 0);
      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
    } else {
      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
    }
    TI_ASSERT(prealloc_size <= total_mem);

    TI_TRACE("Allocating device memory {:.2f} GB",
             1.0 * prealloc_size / (1UL << 30));

    Device::AllocParams preallocated_device_buffer_alloc_params;
    preallocated_device_buffer_alloc_params.size = prealloc_size;
    preallocated_device_buffer_alloc_ = amdgpu_device()->allocate_memory(
        preallocated_device_buffer_alloc_params);
    amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info =
        amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_);
    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;

    AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
                                        prealloc_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    *result_buffer_ptr = (uint64 *)memory_pool->allocate(
        sizeof(uint64) * taichi_result_buffer_entries, 8);
  }
  auto *const runtime_jit = get_runtime_jit_module();

  // Starting random state for the program calculated using the random seed.
  // The seed is multiplied by 1048391 so that two programs with different seeds
  // will not have overlapping random states in any thread.
  int starting_rand_state = config_.random_seed * 1048391;

  // Number of random states. One per CPU/CUDA thread.
  int num_rand_states = 0;

  if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
    // It is important to make sure that every CUDA thread has its own random
    // state so that we do not need expensive per-state locks.
    num_rand_states = config_.saturating_grid_dim * config_.max_block_dim;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    num_rand_states = config_.cpu_max_num_threads;
  }

  TI_TRACE("Launching runtime_initialize");

  runtime_jit
      ->call<void *, void *, std::size_t, void *, int, void *, void *, void *>(
          "runtime_initialize", *result_buffer_ptr, memory_pool, prealloc_size,
          preallocated_device_buffer_, num_rand_states,
          (void *)&taichi_allocate_aligned, (void *)std::printf,
          (void *)std::vsnprintf);

  TI_TRACE("LLVMRuntime initialized (excluding `root`)");
  llvm_runtime_ = fetch_result<void *>(taichi_result_buffer_ret_value_id,
                                       *result_buffer_ptr);
  TI_TRACE("LLVMRuntime pointer fetched");

  if (config_.arch == Arch::cuda) {
    TI_TRACE("Initializing {} random states using CUDA", num_rand_states);
    runtime_jit->launch<void *, int>(
        "runtime_initialize_rand_states_cuda", config_.saturating_grid_dim,
        config_.max_block_dim, 0, llvm_runtime_, starting_rand_state);
  } else {
    TI_TRACE("Initializing {} random states (serially)", num_rand_states);
    runtime_jit->call<void *, int>("runtime_initialize_rand_states_serial",
                                   llvm_runtime_, starting_rand_state);
  }

  if (arch_use_host_memory(config_.arch)) {
    runtime_jit->call<void *>("runtime_get_mem_req_queue", llvm_runtime_);
    auto mem_req_queue = fetch_result<void *>(taichi_result_buffer_ret_value_id,
                                              *result_buffer_ptr);
    memory_pool->set_queue((MemRequestQueue *)mem_req_queue);
  }

  if (arch_use_host_memory(config_.arch)) {
    runtime_jit->call<void *, void *, void *>(
        "LLVMRuntime_initialize_thread_pool", llvm_runtime_, thread_pool_.get(),
        (void *)ThreadPool::static_run);

    runtime_jit->call<void *, void *>("LLVMRuntime_set_assert_failed",
                                      llvm_runtime_,
                                      (void *)assert_failed_host);
  }
  if (arch_is_cpu(config_.arch) && (profiler != nullptr)) {
    // Profiler functions can only be called on CPU kernels
    runtime_jit->call<void *, void *>("LLVMRuntime_set_profiler", llvm_runtime_,
                                      profiler);
    runtime_jit->call<void *, void *>(
        "LLVMRuntime_set_profiler_start", llvm_runtime_,
        (void *)&KernelProfilerBase::profiler_start);
    runtime_jit->call<void *, void *>(
        "LLVMRuntime_set_profiler_stop", llvm_runtime_,
        (void *)&KernelProfilerBase::profiler_stop);
  }
  if (arch_is_cpu(config_.arch) || config_.arch == Arch::cuda) {
    runtime_jit->call<void *>("runtime_initialize_runtime_context_buffer",
                              llvm_runtime_);
  }
}

void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {
  get_llvm_context()->delete_snode_tree(snode_tree->id());
  snode_tree_buffer_manager_->destroy(snode_tree);
}

Device *LlvmRuntimeExecutor::get_compute_device() {
  return device_.get();
}

LLVMRuntime *LlvmRuntimeExecutor::get_llvm_runtime() {
  return static_cast<LLVMRuntime *>(llvm_runtime_);
}

void LlvmRuntimeExecutor::prepare_runtime_context(RuntimeContext *ctx) {
  ctx->runtime = get_llvm_runtime();
}

void LlvmRuntimeExecutor::init_runtime_jit_module(
    std::unique_ptr<llvm::Module> module) {
  llvm_context_->init_runtime_module(module.get());
  runtime_jit_module_ = create_jit_module(std::move(module));
}

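// Copies `size` bytes starting at `offset` out of the result buffer into host
// memory, going through the driver when the buffer is device-resident.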
void LlvmRuntimeExecutor::fetch_result_impl(void *dest,
                                            char *result_buffer,
                                            int offset,
                                            int size) {
  synchronize();
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memcpy_device_to_host(
        dest, result_buffer + offset, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memcpy_device_to_host(
        dest, result_buffer + offset, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    memcpy(dest, result_buffer + offset, size);
  }
}

}  // namespace taichi::lang