#include "taichi/runtime/llvm/llvm_runtime_executor.h"

#include "taichi/runtime/llvm/llvm_offline_cache.h"
#include "taichi/runtime/llvm/runtime_module/mem_request.h"
#include "taichi/rhi/cpu/cpu_device.h"
#include "taichi/rhi/cuda/cuda_device.h"
#include "taichi/platform/cuda/detect_cuda.h"
#include "taichi/rhi/cuda/cuda_driver.h"

#if defined(TI_WITH_CUDA)
#include "taichi/rhi/cuda/cuda_context.h"
#endif

#include "taichi/platform/amdgpu/detect_amdgpu.h"
#include "taichi/rhi/amdgpu/amdgpu_driver.h"
#include "taichi/rhi/amdgpu/amdgpu_device.h"
#if defined(TI_WITH_AMDGPU)
#include "taichi/rhi/amdgpu/amdgpu_context.h"
#endif

namespace taichi::lang {
namespace {
void assert_failed_host(const char *msg) {
  TI_ERROR("Assertion failure: {}", msg);
}

void *taichi_allocate_aligned(MemoryPool *memory_pool,
                              std::size_t size,
                              std::size_t alignment) {
  return memory_pool->allocate(size, alignment);
}
}  // namespace

LlvmRuntimeExecutor::LlvmRuntimeExecutor(CompileConfig &config,
                                         KernelProfilerBase *profiler)
    : config_(config) {
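  // Validate the requested backend first: if the corresponding driver API or
  // a physical device is unavailable, fall back to the host architecture so
  // that program construction can still proceed on the CPU.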
  if (config.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    if (!is_cuda_api_available()) {
      TI_WARN("No CUDA driver API detected.");
      config.arch = host_arch();
    } else if (!CUDAContext::get_instance().detected()) {
      TI_WARN("No CUDA device detected.");
      config.arch = host_arch();
    } else {
      // CUDA runtime created successfully
    }
#else
    TI_WARN("Taichi is not compiled with CUDA.");
    config.arch = host_arch();
#endif

    if (config.arch != Arch::cuda) {
      TI_WARN("Falling back to {}.", arch_name(host_arch()));
    }
  } else if (config.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    if (!is_rocm_api_available()) {
      TI_WARN("No AMDGPU ROCm API detected.");
      config.arch = host_arch();
    } else if (!AMDGPUContext::get_instance().detected()) {
      TI_WARN("No AMDGPU device detected.");
      config.arch = host_arch();
    } else {
      // AMDGPU runtime created successfully
    }
#else
    TI_WARN("Taichi is not compiled with AMDGPU.");
    config.arch = host_arch();
#endif
  }

  if (config.kernel_profiler) {
    profiler_ = profiler;
  }

  snode_tree_buffer_manager_ = std::make_unique<SNodeTreeBufferManager>(this);
  thread_pool_ = std::make_unique<ThreadPool>(config.cpu_max_num_threads);
  preallocated_device_buffer_ = nullptr;

  llvm_runtime_ = nullptr;

  if (arch_is_cpu(config.arch)) {
    config.max_block_dim = 1024;
    device_ = std::make_shared<cpu::CpuDevice>();
  }
#if defined(TI_WITH_CUDA)
  else if (config.arch == Arch::cuda) {
    int num_SMs{1};
    CUDADriver::get_instance().device_get_attribute(
        &num_SMs, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, nullptr);
    int query_max_block_dim{1024};
    CUDADriver::get_instance().device_get_attribute(
        &query_max_block_dim, CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, nullptr);
    int version{0};
    CUDADriver::get_instance().driver_get_version(&version);
    int query_max_block_per_sm{16};
    if (version >= 11000) {
      // This attribute is only exposed by CUDA 11.0 and newer drivers.
      CUDADriver::get_instance().device_get_attribute(
          &query_max_block_per_sm,
          CU_DEVICE_ATTRIBUTE_MAX_BLOCKS_PER_MULTIPROCESSOR, nullptr);
    }

    if (config.max_block_dim == 0) {
      config.max_block_dim = query_max_block_dim;
    }

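    // Heuristic: size the "saturating" grid to twice the number of blocks
    // that can be resident on the whole device (SM count x max blocks per
    // SM), so grid-stride loops keep every SM occupied.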
    if (config.saturating_grid_dim == 0) {
      if (version >= 11000) {
        TI_TRACE("CUDA max blocks per SM = {}", query_max_block_per_sm);
      }
      config.saturating_grid_dim = num_SMs * query_max_block_per_sm * 2;
    }
    if (config.kernel_profiler) {
      CUDAContext::get_instance().set_profiler(profiler);
    } else {
      CUDAContext::get_instance().set_profiler(nullptr);
    }
    CUDAContext::get_instance().set_debug(config.debug);
    device_ = std::make_shared<cuda::CudaDevice>();
  }
#endif

#if defined(TI_WITH_AMDGPU)
  else if (config.arch == Arch::amdgpu) {
    int num_workgroups{1};
    AMDGPUDriver::get_instance().device_get_attribute(
        &num_workgroups, HIP_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, 0);
    int query_max_block_dim{1024};
    AMDGPUDriver::get_instance().device_get_attribute(
        &query_max_block_dim, HIP_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X, 0);
    // No HIP device attribute was found for the maximum number of resident
    // blocks per CU, so fall back to a conservative default of 32.
    int query_max_block_per_cu{32};
    if (config.max_block_dim == 0) {
      config.max_block_dim = query_max_block_dim;
    }
    if (config.saturating_grid_dim == 0) {
      config.saturating_grid_dim = num_workgroups * query_max_block_per_cu * 2;
    }
    AMDGPUContext::get_instance().set_debug(config.debug);
    device_ = std::make_shared<amdgpu::AmdgpuDevice>();
  }
#endif

#ifdef TI_WITH_DX12
  else if (config.arch == Arch::dx12) {
    // FIXME: add dx12 device.
    // FIXME: set value based on DX12.
    config.max_block_dim = 1024;
    device_ = std::make_shared<cpu::CpuDevice>();
  }
#endif
  else {
    TI_NOT_IMPLEMENTED
  }
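  // On CPU backends the LLVM context targets the host arch; either way, the
  // runtime bitcode module is cloned and JIT-compiled once per executor.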
  llvm_context_ = std::make_unique<TaichiLLVMContext>(
      config_, arch_is_cpu(config.arch) ? host_arch() : config.arch);
  init_runtime_jit_module(llvm_context_->clone_runtime_module());
}

TaichiLLVMContext *LlvmRuntimeExecutor::get_llvm_context() {
  return llvm_context_.get();
}

JITModule *LlvmRuntimeExecutor::create_jit_module(
    std::unique_ptr<llvm::Module> module) {
  return get_llvm_context()->jit->add_module(std::move(module));
}

JITModule *LlvmRuntimeExecutor::get_runtime_jit_module() {
  return runtime_jit_module_;
}

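// Note: runtime_query<T> (a member template, presumably declared in the
// header) calls a JIT-compiled accessor by name and reads its return value
// back through result_buffer; it is how the host inspects runtime data
// structures that may live in device memory.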
void LlvmRuntimeExecutor::print_list_manager_info(void *list_manager,
                                                  uint64 *result_buffer) {
  auto list_manager_len = runtime_query<int32>("ListManager_get_num_elements",
                                               result_buffer, list_manager);

  auto element_size = runtime_query<int32>("ListManager_get_element_size",
                                           result_buffer, list_manager);

  auto elements_per_chunk =
      runtime_query<int32>("ListManager_get_max_num_elements_per_chunk",
                           result_buffer, list_manager);

  auto num_active_chunks = runtime_query<int32>(
      "ListManager_get_num_active_chunks", result_buffer, list_manager);

  auto size_MB = 1e-6f * num_active_chunks * elements_per_chunk * element_size;

  fmt::print(
      " length={:n} {:n} chunks x [{:n} x {:n} B] total={:.4f} MB\n",
      list_manager_len, num_active_chunks, elements_per_chunk, element_size,
      size_MB);
}

void LlvmRuntimeExecutor::synchronize() {
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().stream_synchronize(nullptr);
#else
    TI_ERROR("No CUDA support");
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().stream_synchronize(nullptr);
#else
    TI_ERROR("No AMDGPU support");
#endif
  }
  fflush(stdout);
}

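// The result buffer lives in device memory on CUDA/AMDGPU, so reading an
// entry requires a device-to-host copy; only CPU backends can read it
// directly.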
uint64 LlvmRuntimeExecutor::fetch_result_uint64(int i, uint64 *result_buffer) {
  // TODO: We are likely doing more synchronization than necessary. Simplify
  // the sync logic when we fetch the result.
  synchronize();
  uint64 ret;
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                     sizeof(uint64));
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memcpy_device_to_host(&ret, result_buffer + i,
                                                       sizeof(uint64));
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    ret = result_buffer[i];
  }
  return ret;
}

std::size_t LlvmRuntimeExecutor::get_snode_num_dynamically_allocated(
    SNode *snode,
    uint64 *result_buffer) {
  TI_ASSERT(arch_uses_llvm(config_.arch));

  auto node_allocator =
      runtime_query<void *>("LLVMRuntime_get_node_allocators", result_buffer,
                            llvm_runtime_, snode->id);
  auto data_list = runtime_query<void *>("NodeManager_get_data_list",
                                         result_buffer, node_allocator);

  return (std::size_t)runtime_query<int32>("ListManager_get_num_elements",
                                           result_buffer, data_list);
}

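// Error-reporting protocol: the device-side runtime records an error code
// plus a printf-style message template and its arguments; the host retrieves
// them one result-buffer slot at a time and formats the final message.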
void LlvmRuntimeExecutor::check_runtime_error(uint64 *result_buffer) {
  synchronize();
  auto *runtime_jit_module = get_runtime_jit_module();
  runtime_jit_module->call<void *>("runtime_retrieve_and_reset_error_code",
                                   llvm_runtime_);
  auto error_code =
      fetch_result<int64>(taichi_result_buffer_error_id, result_buffer);

  if (error_code) {
    std::string error_message_template;

    // Here we fetch the error_message_template char by char. This is not
    // efficient, but fortunately we only need to do this when an assertion
    // fails. Note that we may not have unified memory here, so using
    // "fetch_result", which works across device/host memory, is necessary.
    for (int i = 0;; i++) {
      runtime_jit_module->call<void *>("runtime_retrieve_error_message",
                                       llvm_runtime_, i);
      auto c = fetch_result<char>(taichi_result_buffer_error_id, result_buffer);
      error_message_template += c;
      if (c == '\0') {
        break;
      }
    }

    if (error_code == 1) {
      const auto error_message_formatted = format_error_message(
          error_message_template,
          [runtime_jit_module, result_buffer, this](int argument_id) {
            runtime_jit_module->call<void *>(
                "runtime_retrieve_error_message_argument", llvm_runtime_,
                argument_id);
            return fetch_result<uint64>(taichi_result_buffer_error_id,
                                        result_buffer);
          });
      throw TaichiAssertionError(error_message_formatted);
    } else {
      TI_NOT_IMPLEMENTED
    }
  }
}

void LlvmRuntimeExecutor::print_memory_profiler_info(
    std::vector<std::unique_ptr<SNodeTree>> &snode_trees_,
    uint64 *result_buffer) {
  TI_ASSERT(arch_uses_llvm(config_.arch));

  fmt::print("\n[Memory Profiler]\n");

  std::locale::global(std::locale("en_US.UTF-8"));
  // So that thousand separators are added to "{:n}" slots in fmtlib.
  // E.g., 10000 is printed as "10,000".
  // TODO: is there a way to set locale only locally in this function?

  std::function<void(SNode *, int)> visit = [&](SNode *snode, int depth) {
    auto element_list =
        runtime_query<void *>("LLVMRuntime_get_element_lists", result_buffer,
                              llvm_runtime_, snode->id);

    if (snode->type != SNodeType::place) {
      fmt::print("SNode {:10}\n", snode->get_node_type_name_hinted());

      if (element_list) {
        fmt::print(" active element list:");
        print_list_manager_info(element_list, result_buffer);

        auto node_allocator =
            runtime_query<void *>("LLVMRuntime_get_node_allocators",
                                  result_buffer, llvm_runtime_, snode->id);

        if (node_allocator) {
          auto free_list = runtime_query<void *>("NodeManager_get_free_list",
                                                 result_buffer, node_allocator);
          auto recycled_list = runtime_query<void *>(
              "NodeManager_get_recycled_list", result_buffer, node_allocator);

          auto free_list_len = runtime_query<int32>(
              "ListManager_get_num_elements", result_buffer, free_list);

          auto recycled_list_len = runtime_query<int32>(
              "ListManager_get_num_elements", result_buffer, recycled_list);

          auto free_list_used = runtime_query<int32>(
              "NodeManager_get_free_list_used", result_buffer, node_allocator);

          auto data_list = runtime_query<void *>("NodeManager_get_data_list",
                                                 result_buffer, node_allocator);
          fmt::print(" data list: ");
          print_list_manager_info(data_list, result_buffer);

          fmt::print(
              " Allocated elements={:n}; free list length={:n}; recycled list "
              "length={:n}\n",
              free_list_used, free_list_len, recycled_list_len);
        }
      }
    }
    for (const auto &ch : snode->ch) {
      visit(ch.get(), depth + 1);
    }
  };

  for (auto &a : snode_trees_) {
    visit(a->root(), /*depth=*/0);
  }

  auto total_requested_memory = runtime_query<std::size_t>(
      "LLVMRuntime_get_total_requested_memory", result_buffer, llvm_runtime_);

  fmt::print(
      "Total requested dynamic memory (excluding alignment padding): {:n} B\n",
      total_requested_memory);
}

DevicePtr LlvmRuntimeExecutor::get_snode_tree_device_ptr(int tree_id) {
  DeviceAllocation tree_alloc = snode_tree_allocs_[tree_id];
  return tree_alloc.get_ptr();
}

void LlvmRuntimeExecutor::initialize_llvm_runtime_snodes(
    const LlvmOfflineCache::FieldCacheData &field_cache_data,
    uint64 *result_buffer) {
  auto *const runtime_jit = get_runtime_jit_module();
  // By the time this creator is called, "this" is already destroyed.
  // Therefore it is necessary to capture members by value.
  size_t root_size = field_cache_data.root_size;
  const auto snode_metas = field_cache_data.snode_metas;
  const int tree_id = field_cache_data.tree_id;
  const int root_id = field_cache_data.root_id;

  TI_TRACE("Allocating data structure of size {} bytes", root_size);
  std::size_t rounded_size = taichi::iroundup(root_size, taichi_page_size);

  Ptr root_buffer = snode_tree_buffer_manager_->allocate(
      runtime_jit, llvm_runtime_, rounded_size, taichi_page_size, tree_id,
      result_buffer);
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memset(root_buffer, 0, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memset(root_buffer, 0, rounded_size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    std::memset(root_buffer, 0, rounded_size);
  }

  DeviceAllocation alloc{kDeviceNullAllocation};

  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    alloc = cuda_device()->import_memory(root_buffer, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    alloc = amdgpu_device()->import_memory(root_buffer, rounded_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    alloc = cpu_device()->import_memory(root_buffer, rounded_size);
  }

  snode_tree_allocs_[tree_id] = alloc;

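  // A tree counts as "all dense" when every SNode in it is root/dense/place
  // (or when dense struct-fors are demoted), which presumably lets the
  // runtime skip sparse bookkeeping for this tree.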
  bool all_dense = config_.demote_dense_struct_fors;
  for (size_t i = 0; i < snode_metas.size(); i++) {
    if (snode_metas[i].type != SNodeType::dense &&
        snode_metas[i].type != SNodeType::place &&
        snode_metas[i].type != SNodeType::root) {
      all_dense = false;
      break;
    }
  }

  runtime_jit->call<void *, std::size_t, int, int, int, std::size_t, Ptr>(
      "runtime_initialize_snodes", llvm_runtime_, root_size, root_id,
      (int)snode_metas.size(), tree_id, rounded_size, root_buffer, all_dense);

  for (size_t i = 0; i < snode_metas.size(); i++) {
    if (is_gc_able(snode_metas[i].type)) {
      const auto snode_id = snode_metas[i].id;
      std::size_t node_size;
      auto element_size = snode_metas[i].cell_size_bytes;
      if (snode_metas[i].type == SNodeType::pointer) {
        // Pointer SNode: the allocator hands out single elements.
        node_size = element_size;
      } else {
        // Dynamic SNode: the allocator hands out whole chunks.
        node_size = sizeof(void *) + element_size * snode_metas[i].chunk_size;
      }
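      // For example (hypothetical numbers, 64-bit host): 4 B cells with
      // chunk_size = 64 give a node size of 8 + 64 * 4 = 264 bytes; the extra
      // pointer-sized slot presumably links chunks together.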
455 | TI_TRACE("Initializing allocator for snode {} (node size {})" , snode_id, |
456 | node_size); |
457 | runtime_jit->call<void *, int, std::size_t>( |
458 | "runtime_NodeAllocator_initialize" , llvm_runtime_, snode_id, |
459 | node_size); |
460 | TI_TRACE("Allocating ambient element for snode {} (node size {})" , |
461 | snode_id, node_size); |
462 | runtime_jit->call<void *, int>("runtime_allocate_ambient" , llvm_runtime_, |
463 | snode_id, node_size); |
464 | } |
465 | } |
466 | } |

cuda::CudaDevice *LlvmRuntimeExecutor::cuda_device() {
  if (config_.arch != Arch::cuda) {
    TI_ERROR("arch is not cuda");
  }
  return static_cast<cuda::CudaDevice *>(device_.get());
}

amdgpu::AmdgpuDevice *LlvmRuntimeExecutor::amdgpu_device() {
  if (config_.arch != Arch::amdgpu) {
    TI_ERROR("arch is not amdgpu");
  }
  return static_cast<amdgpu::AmdgpuDevice *>(device_.get());
}

cpu::CpuDevice *LlvmRuntimeExecutor::cpu_device() {
  TI_ERROR_IF(!arch_is_cpu(config_.arch), "arch is not cpu");
  return static_cast<cpu::CpuDevice *>(device_.get());
}

LlvmDevice *LlvmRuntimeExecutor::llvm_device() {
  TI_ASSERT(dynamic_cast<LlvmDevice *>(device_.get()));
  return static_cast<LlvmDevice *>(device_.get());
}

DeviceAllocation LlvmRuntimeExecutor::allocate_memory_ndarray(
    std::size_t alloc_size,
    uint64 *result_buffer) {
  return llvm_device()->allocate_memory_runtime(
      {{alloc_size, /*host_write=*/false, /*host_read=*/false,
        /*export_sharing=*/false, AllocUsage::Storage},
       config_.ndarray_use_cached_allocator,
       get_runtime_jit_module(),
       get_llvm_runtime(),
       result_buffer});
}

void LlvmRuntimeExecutor::deallocate_memory_ndarray(DeviceAllocation handle) {
  cuda_device()->dealloc_memory(handle);
}

void LlvmRuntimeExecutor::fill_ndarray(const DeviceAllocation &alloc,
                                       std::size_t size,
                                       uint32_t data) {
  auto ptr = get_ndarray_alloc_info_ptr(alloc);
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memsetd32((void *)ptr, data, size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memset((void *)ptr, data, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    std::fill((uint32_t *)ptr, (uint32_t *)ptr + size, data);
  }
}

uint64_t *LlvmRuntimeExecutor::get_ndarray_alloc_info_ptr(
    const DeviceAllocation &alloc) {
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    return (uint64_t *)cuda_device()->get_alloc_info(alloc).ptr;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    return (uint64_t *)amdgpu_device()->get_alloc_info(alloc).ptr;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    return (uint64_t *)cpu_device()->get_alloc_info(alloc).ptr;
  }
}

void LlvmRuntimeExecutor::finalize() {
  profiler_ = nullptr;
  if (preallocated_device_buffer_ != nullptr) {
    if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
      cuda_device()->dealloc_memory(preallocated_device_buffer_alloc_);
#endif
    } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
      amdgpu_device()->dealloc_memory(preallocated_device_buffer_alloc_);
#endif
    }
  }
}

void LlvmRuntimeExecutor::materialize_runtime(MemoryPool *memory_pool,
                                              KernelProfilerBase *profiler,
                                              uint64 **result_buffer_ptr) {
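  // Materialization allocates the result buffer and, on GPU backends, a large
  // preallocated device arena; the JIT-compiled runtime_initialize then
  // constructs the LLVMRuntime object inside that memory.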
  std::size_t prealloc_size = 0;
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().malloc(
        (void **)result_buffer_ptr,
        sizeof(uint64) * taichi_result_buffer_entries);
    const auto total_mem = CUDAContext::get_instance().get_total_memory();
    if (config_.device_memory_fraction == 0) {
      TI_ASSERT(config_.device_memory_GB > 0);
      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
    } else {
      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
    }
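    // For example (hypothetical numbers): device_memory_GB = 1 preallocates
    // exactly 1 GiB, while device_memory_fraction = 0.5 on a device with
    // 8 GiB of total memory preallocates 4 GiB.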
    TI_ASSERT(prealloc_size <= total_mem);

    TI_TRACE("Allocating device memory {:.2f} GB",
             1.0 * prealloc_size / (1UL << 30));

    Device::AllocParams preallocated_device_buffer_alloc_params;
    preallocated_device_buffer_alloc_params.size = prealloc_size;
    preallocated_device_buffer_alloc_ = cuda_device()->allocate_memory(
        preallocated_device_buffer_alloc_params);
    cuda::CudaDevice::AllocInfo preallocated_device_buffer_alloc_info =
        cuda_device()->get_alloc_info(preallocated_device_buffer_alloc_);
    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;

    CUDADriver::get_instance().memset(preallocated_device_buffer_, 0,
                                      prealloc_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().malloc(
        (void **)result_buffer_ptr,
        sizeof(uint64) * taichi_result_buffer_entries);
    const auto total_mem = AMDGPUContext::get_instance().get_total_memory();
    if (config_.device_memory_fraction == 0) {
      TI_ASSERT(config_.device_memory_GB > 0);
      prealloc_size = std::size_t(config_.device_memory_GB * (1UL << 30));
    } else {
      prealloc_size = std::size_t(config_.device_memory_fraction * total_mem);
    }
    TI_ASSERT(prealloc_size <= total_mem);

    TI_TRACE("Allocating device memory {:.2f} GB",
             1.0 * prealloc_size / (1UL << 30));

    Device::AllocParams preallocated_device_buffer_alloc_params;
    preallocated_device_buffer_alloc_params.size = prealloc_size;
    preallocated_device_buffer_alloc_ = amdgpu_device()->allocate_memory(
        preallocated_device_buffer_alloc_params);
    amdgpu::AmdgpuDevice::AllocInfo preallocated_device_buffer_alloc_info =
        amdgpu_device()->get_alloc_info(preallocated_device_buffer_alloc_);
    preallocated_device_buffer_ = preallocated_device_buffer_alloc_info.ptr;

    AMDGPUDriver::get_instance().memset(preallocated_device_buffer_, 0,
                                        prealloc_size);
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    *result_buffer_ptr = (uint64 *)memory_pool->allocate(
        sizeof(uint64) * taichi_result_buffer_entries, 8);
  }
  auto *const runtime_jit = get_runtime_jit_module();

  // Starting random state for the program calculated using the random seed.
  // The seed is multiplied by 1048391 so that two programs with different
  // seeds will not have overlapping random states in any thread.
  int starting_rand_state = config_.random_seed * 1048391;

  // Number of random states. One per CPU/CUDA thread.
  int num_rand_states = 0;

  if (config_.arch == Arch::cuda || config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_CUDA) || defined(TI_WITH_AMDGPU)
    // It is important to make sure that every CUDA thread has its own random
    // state so that we do not need expensive per-state locks.
    num_rand_states = config_.saturating_grid_dim * config_.max_block_dim;
#else
    TI_NOT_IMPLEMENTED
#endif
  } else {
    num_rand_states = config_.cpu_max_num_threads;
  }
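  // With the defaults above, num_rand_states covers every thread that can be
  // resident at once: saturating_grid_dim * max_block_dim threads on GPUs,
  // and one state per worker thread on CPUs.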

  TI_TRACE("Launching runtime_initialize");

  runtime_jit
      ->call<void *, void *, std::size_t, void *, int, void *, void *, void *>(
          "runtime_initialize", *result_buffer_ptr, memory_pool, prealloc_size,
          preallocated_device_buffer_, num_rand_states,
          (void *)&taichi_allocate_aligned, (void *)std::printf,
          (void *)std::vsnprintf);

  TI_TRACE("LLVMRuntime initialized (excluding `root`)");
  llvm_runtime_ = fetch_result<void *>(taichi_result_buffer_ret_value_id,
                                       *result_buffer_ptr);
  TI_TRACE("LLVMRuntime pointer fetched");

  if (config_.arch == Arch::cuda) {
    TI_TRACE("Initializing {} random states using CUDA", num_rand_states);
    runtime_jit->launch<void *, int>(
        "runtime_initialize_rand_states_cuda", config_.saturating_grid_dim,
        config_.max_block_dim, 0, llvm_runtime_, starting_rand_state);
  } else {
    TI_TRACE("Initializing {} random states (serially)", num_rand_states);
    runtime_jit->call<void *, int>("runtime_initialize_rand_states_serial",
                                   llvm_runtime_, starting_rand_state);
  }

  if (arch_use_host_memory(config_.arch)) {
    runtime_jit->call<void *>("runtime_get_mem_req_queue", llvm_runtime_);
    auto mem_req_queue = fetch_result<void *>(taichi_result_buffer_ret_value_id,
                                              *result_buffer_ptr);
    memory_pool->set_queue((MemRequestQueue *)mem_req_queue);
  }

  if (arch_use_host_memory(config_.arch)) {
    runtime_jit->call<void *, void *, void *>(
        "LLVMRuntime_initialize_thread_pool", llvm_runtime_, thread_pool_.get(),
        (void *)ThreadPool::static_run);

    runtime_jit->call<void *, void *>("LLVMRuntime_set_assert_failed",
                                      llvm_runtime_,
                                      (void *)assert_failed_host);
  }
  if (arch_is_cpu(config_.arch) && (profiler != nullptr)) {
    // Profiler functions can only be called on CPU kernels.
    runtime_jit->call<void *, void *>("LLVMRuntime_set_profiler", llvm_runtime_,
                                      profiler);
    runtime_jit->call<void *, void *>(
        "LLVMRuntime_set_profiler_start", llvm_runtime_,
        (void *)&KernelProfilerBase::profiler_start);
    runtime_jit->call<void *, void *>(
        "LLVMRuntime_set_profiler_stop", llvm_runtime_,
        (void *)&KernelProfilerBase::profiler_stop);
  }
  if (arch_is_cpu(config_.arch) || config_.arch == Arch::cuda) {
    runtime_jit->call<void *>("runtime_initialize_runtime_context_buffer",
                              llvm_runtime_);
  }
}

void LlvmRuntimeExecutor::destroy_snode_tree(SNodeTree *snode_tree) {
  get_llvm_context()->delete_snode_tree(snode_tree->id());
  snode_tree_buffer_manager_->destroy(snode_tree);
}

Device *LlvmRuntimeExecutor::get_compute_device() {
  return device_.get();
}

LLVMRuntime *LlvmRuntimeExecutor::get_llvm_runtime() {
  return static_cast<LLVMRuntime *>(llvm_runtime_);
}

void LlvmRuntimeExecutor::prepare_runtime_context(RuntimeContext *ctx) {
  ctx->runtime = get_llvm_runtime();
}

void LlvmRuntimeExecutor::init_runtime_jit_module(
    std::unique_ptr<llvm::Module> module) {
  llvm_context_->init_runtime_module(module.get());
  runtime_jit_module_ = create_jit_module(std::move(module));
}

void LlvmRuntimeExecutor::fetch_result_impl(void *dest,
                                            char *result_buffer,
                                            int offset,
                                            int size) {
  synchronize();
  if (config_.arch == Arch::cuda) {
#if defined(TI_WITH_CUDA)
    CUDADriver::get_instance().memcpy_device_to_host(
        dest, result_buffer + offset, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else if (config_.arch == Arch::amdgpu) {
#if defined(TI_WITH_AMDGPU)
    AMDGPUDriver::get_instance().memcpy_device_to_host(
        dest, result_buffer + offset, size);
#else
    TI_NOT_IMPLEMENTED;
#endif
  } else {
    memcpy(dest, result_buffer + offset, size);
  }
}

}  // namespace taichi::lang