// Generated from "/code/pytorch/third_party/nvfuser/runtime/block_sync_atomic.cu"
// 2023-02-12 08:01:26

namespace nvfuser_resources {

constexpr const char* block_sync_atomic_cu = R"(

// Counter-based block synchronization. Only meant to be used for
// debugging and validating synchronization. This should be replaced
// with cuda::barrier::arrive_and_wait as that should be more robust.

namespace block_sync {

using CounterType = unsigned int;
static constexpr CounterType COUNTER_TYPE_MAX = ~(CounterType)0;
__shared__ CounterType sync_counter;

__device__ void init() {
  const unsigned int tid = threadIdx.x + threadIdx.y * blockDim.x +
      threadIdx.z * blockDim.x * blockDim.y;
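  // Only linear thread 0 resets the shared counter; the __syncthreads below
  // ensures every thread sees the reset before the counter is used.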
  if (tid == 0) {
    sync_counter = 0;
  }
  __syncthreads();
}

// Emulate __syncthreads() with a synchronization counter
__device__ void sync() {
  unsigned int backoff = 8;
  const unsigned int backoff_max = 256;
  const unsigned int num_threads = blockDim.x * blockDim.y * blockDim.z;

  __threadfence_block();

  // Use the counter range only up to a multiple of num_threads so that the
  // next value computed below cannot overflow.

  const auto counter_max = (COUNTER_TYPE_MAX / num_threads) * num_threads;
  const auto old = atomicInc(&sync_counter, counter_max - 1);
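  // atomicInc returns the previous value and wraps sync_counter back to 0
  // once it reaches counter_max - 1, so the counter always stays within
  // whole generations of num_threads arrivals.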

  const auto next = (old / num_threads) * num_threads + num_threads;
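  // next is the counter value at which all num_threads threads of the
  // current generation have arrived at this sync point.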

  auto local_sync_counter = *(volatile CounterType*)(&sync_counter);
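  // Reading through a volatile pointer (here and inside the loop below)
  // forces a fresh load of the shared counter on every poll instead of a
  // value cached in a register.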

  // sync_counter may wrap around, which means local_sync_counter
  // becomes smaller than old. In that case, it is guaranteed that all
  // threads have already incremented the counter, so we can stop waiting.
  while (local_sync_counter < next && old < local_sync_counter) {
#if __CUDA_ARCH__ >= 700
    // __nanosleep is only available on compute capability 7.0 or higher
    __nanosleep(backoff); // avoids busy waiting
#endif
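    // Exponential backoff: double the sleep interval each iteration, up to
    // backoff_max nanoseconds, to throttle the polling loop.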
    if (backoff < backoff_max) {
      backoff *= 2;
    }
    local_sync_counter = *(volatile CounterType*)(&sync_counter);
  }
}

} // namespace block_sync
)";

} // namespace nvfuser_resources