1 | #pragma once |
2 | #ifndef FXDIV_H |
3 | #define FXDIV_H |
4 | |
5 | #if defined(__cplusplus) && (__cplusplus >= 201103L) |
6 | #include <cstddef> |
7 | #include <cstdint> |
8 | #include <climits> |
9 | #elif !defined(__OPENCL_VERSION__) |
10 | #include <stddef.h> |
11 | #include <stdint.h> |
12 | #include <limits.h> |
13 | #endif |
14 | |
15 | #if defined(_MSC_VER) |
16 | #include <intrin.h> |
17 | #if defined(_M_IX86) || defined(_M_X64) |
18 | #include <immintrin.h> |
19 | #endif |
20 | #endif |
21 | |
22 | #ifndef FXDIV_USE_INLINE_ASSEMBLY |
23 | #define FXDIV_USE_INLINE_ASSEMBLY 0 |
24 | #endif |
25 | |
/*
 * Computes the full 64-bit product of two 32-bit unsigned integers.
 * On 32-bit MSVC targets the __emulu intrinsic is used to guarantee a
 * single 32x32->64 multiply; elsewhere the operands are widened to 64 bits
 * and the compiler is trusted to emit the same instruction.
 */
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
#if defined(_MSC_VER) && defined(_M_IX86)
	return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
#else
	return (uint64_t) a * (uint64_t) b;
#endif
}
33 | |
/*
 * Returns the high 32 bits of the 64-bit product of two 32-bit unsigned
 * integers, using a platform-specific high-multiply intrinsic when one is
 * available and a widening multiply plus shift otherwise.
 */
static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
#elif defined(_MSC_VER) && defined(_M_IX86)
	return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
#elif defined(_MSC_VER) && defined(_M_ARM)
	return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
#else
	return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
#endif
}
47 | |
/*
 * Returns the high 64 bits of the 128-bit product of two 64-bit unsigned
 * integers, via a platform intrinsic or native 128-bit arithmetic where
 * available, with a portable 32-bit-halves fallback otherwise.
 */
static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
#elif defined(_MSC_VER) && defined(_M_X64)
	return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
	return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
#else
	/* Portable fallback: schoolbook multiplication on 32-bit halves,
	 * assembling only the high 64 bits of the 128-bit product. */
	const uint32_t a_lo = (uint32_t) a;
	const uint32_t a_hi = (uint32_t) (a >> 32);
	const uint32_t b_lo = (uint32_t) b;
	const uint32_t b_hi = (uint32_t) (b >> 32);

	/* t = a_hi * b_lo + carry out of the low partial product a_lo * b_lo;
	 * the sum cannot overflow 64 bits. */
	const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
		(uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
	return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
		((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
#endif
}
69 | |
/*
 * Returns the high half of the double-width product of two size_t values,
 * dispatching at compile time to the implementation matching size_t's width.
 */
static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
#if SIZE_MAX == UINT32_MAX
	return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
#elif SIZE_MAX == UINT64_MAX
	return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
#else
	#error Unsupported platform
#endif
}
79 | |
/*
 * Precomputed parameters for fast division by a 32-bit constant.
 * Produced by fxdiv_init_uint32_t and consumed by fxdiv_quotient_uint32_t,
 * which evaluates n / value as (t + ((n - t) >> s1)) >> s2 with
 * t = mulhi(n, m).
 */
struct fxdiv_divisor_uint32_t {
	uint32_t value; /* the original divisor */
	uint32_t m;     /* magic multiplier */
	uint8_t s1;     /* first shift amount (0 when value == 1, otherwise 1) */
	uint8_t s2;     /* second shift amount */
};
86 | |
/* Quotient and remainder of a 32-bit division, returned by fxdiv_divide_uint32_t. */
struct fxdiv_result_uint32_t {
	uint32_t quotient;
	uint32_t remainder;
};
91 | |
/*
 * Precomputed parameters for fast division by a 64-bit constant.
 * Produced by fxdiv_init_uint64_t and consumed by fxdiv_quotient_uint64_t.
 */
struct fxdiv_divisor_uint64_t {
	uint64_t value; /* the original divisor */
	uint64_t m;     /* magic multiplier */
	uint8_t s1;     /* first shift amount (0 when value == 1, otherwise 1) */
	uint8_t s2;     /* second shift amount */
};
98 | |
/* Quotient and remainder of a 64-bit division, returned by fxdiv_divide_uint64_t. */
struct fxdiv_result_uint64_t {
	uint64_t quotient;
	uint64_t remainder;
};
103 | |
/*
 * Precomputed parameters for fast division by a size_t constant.
 * Produced by fxdiv_init_size_t and consumed by fxdiv_quotient_size_t.
 */
struct fxdiv_divisor_size_t {
	size_t value; /* the original divisor */
	size_t m;     /* magic multiplier */
	uint8_t s1;   /* first shift amount (0 when value == 1, otherwise 1) */
	uint8_t s2;   /* second shift amount */
};
110 | |
/* Quotient and remainder of a size_t division, returned by fxdiv_divide_size_t. */
struct fxdiv_result_size_t {
	size_t quotient;
	size_t remainder;
};
115 | |
/*
 * Precomputes magic-multiplier parameters for fast division by d, so that
 * n / d can later be evaluated as (t + ((n - t) >> s1)) >> s2 with
 * t = mulhi(n, m) (see fxdiv_quotient_uint32_t).
 *
 * NOTE(review): d == 0 makes d - 1 wrap around and leads to a division by
 * zero below; callers presumably must guarantee d >= 1 — confirm contract.
 */
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
	struct fxdiv_divisor_uint32_t result = { d };
	if (d == 1) {
		/* Trivial divisor: multiply by 1, no shifts. */
		result.m = UINT32_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* Compute l_minus_1 = index of the highest set bit of d - 1
		 * (equal to ceil(log2(d)) - 1 for d >= 2), using the best
		 * available bit-scan primitive. */
#if defined(__OPENCL_VERSION__)
		const uint32_t l_minus_1 = 31 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
		const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
		unsigned long l_minus_1;
		_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
		uint32_t l_minus_1;
		__asm__("BSRL %[d_minus_1], %[l_minus_1]"
			: [l_minus_1] "=r" (l_minus_1)
			: [d_minus_1] "r" (d - 1)
			: "cc" );
#elif defined(__GNUC__)
		const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
#else
		/* Based on Algorithm 2 from Hacker's delight */

		/* Binary search for the highest set bit: 16, 8, 4, 2, 1 bits at a time. */
		uint32_t l_minus_1 = 0;
		uint32_t x = d - 1;
		uint32_t y = x >> 16;
		if (y != 0) {
			l_minus_1 += 16;
			x = y;
		}
		y = x >> 8;
		if (y != 0) {
			l_minus_1 += 8;
			x = y;
		}
		y = x >> 4;
		if (y != 0) {
			l_minus_1 += 4;
			x = y;
		}
		y = x >> 2;
		if (y != 0) {
			l_minus_1 += 2;
			x = y;
		}
		if ((x & 2) != 0) {
			l_minus_1 += 1;
		}
#endif
		/* u_hi = 2^l - d where l = l_minus_1 + 1. */
		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;

		/* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
		uint32_t q;
		__asm__("DIVL %[d]"
			: "=a" (q), "+d" (u_hi)
			: [d] "r" (d), "a" (0)
			: "cc" );
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
		unsigned int remainder;
		const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
#else
		const uint32_t q = ((uint64_t) u_hi << 32) / d;
#endif

		/* m = floor(2^(32 + l) / d) - 2^32 + 1, truncated to 32 bits. */
		result.m = q + UINT32_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
189 | |
/*
 * Precomputes magic-multiplier parameters for fast 64-bit division by d;
 * the 64-bit analogue of fxdiv_init_uint32_t.
 *
 * NOTE(review): d == 0 makes d - 1 wrap around and leads to a division by
 * zero below; callers presumably must guarantee d >= 1 — confirm contract.
 */
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
	struct fxdiv_divisor_uint64_t result = { d };
	if (d == 1) {
		/* Trivial divisor: multiply by 1, no shifts. */
		result.m = UINT64_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* Compute l_minus_1 = index of the highest set bit of d - 1, and,
		 * on paths that may reach the software long-division fallback
		 * below, nlz_d = number of leading zero bits of d (used there to
		 * normalize the divisor). */
#if defined(__OPENCL_VERSION__)
		const uint32_t nlz_d = clz(d);
		const uint32_t l_minus_1 = 63 - clz(d - 1);
#elif defined(__CUDA_ARCH__)
		const uint32_t nlz_d = __clzll((long long) d);
		const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
		unsigned long l_minus_1;
		_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
		unsigned long bsr_d;
		_BitScanReverse64(&bsr_d, (unsigned __int64) d);
		/* bsr_d ^ 0x3F == 63 - bsr_d */
		const uint32_t nlz_d = bsr_d ^ 0x3F;
#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
		/* 32-bit MSVC: synthesize a 64-bit bit-scan from 32-bit _BitScanReverse. */
		const uint64_t d_minus_1 = d - 1;
		const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
		unsigned long l_minus_1;
		if ((uint32_t) (d_minus_1 >> 32) == 0) {
			_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
		} else {
			_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
			l_minus_1 += 32;
		}
		/* When d is a power of 2 its highest bit is one above that of d - 1,
		 * so the leading-zero count needs an extra -1 correction. */
		const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
		/* nlz_d is not needed on this path: the inline-assembly DIVQ
		 * branch below performs the division directly. */
		uint64_t l_minus_1;
		__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
			: [l_minus_1] "=r" (l_minus_1)
			: [d_minus_1] "r" (d - 1)
			: "cc" );
#elif defined(__GNUC__)
		const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
		const uint32_t nlz_d = __builtin_clzll(d);
#else
		/* Based on Algorithm 2 from Hacker's delight */
		const uint64_t d_minus_1 = d - 1;
		const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
		/* Binary search for the highest set bit: 32, 16, 8, 4, 2, 1 bits at a time. */
		uint32_t l_minus_1 = 0;
		uint32_t x = (uint32_t) d_minus_1;
		uint32_t y = d_minus_1 >> 32;
		if (y != 0) {
			l_minus_1 += 32;
			x = y;
		}
		y = x >> 16;
		if (y != 0) {
			l_minus_1 += 16;
			x = y;
		}
		y = x >> 8;
		if (y != 0) {
			l_minus_1 += 8;
			x = y;
		}
		y = x >> 4;
		if (y != 0) {
			l_minus_1 += 4;
			x = y;
		}
		y = x >> 2;
		if (y != 0) {
			l_minus_1 += 2;
			x = y;
		}
		if ((x & 2) != 0) {
			l_minus_1 += 1;
		}
		/* Same power-of-2 correction as in the 32-bit MSVC branch above. */
		const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
#endif
		/* u_hi = 2^l - d where l = l_minus_1 + 1. */
		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;

		/* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
		uint64_t q;
		__asm__("DIVQ %[d]"
			: "=a" (q), "+d" (u_hi)
			: [d] "r" (d), "a" (UINT64_C(0))
			: "cc" );
#elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
		/* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */
		const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
		unsigned __int64 remainder;
		const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
#else
		/* Implementation based on code from Hacker's delight */

		/* Normalize divisor and shift dividend left */
		d <<= nlz_d;
		u_hi <<= nlz_d;
		/* Break divisor up into two 32-bit digits */
		const uint64_t d_hi = (uint32_t) (d >> 32);
		const uint32_t d_lo = (uint32_t) d;

		/* Compute the first quotient digit, q1 */
		uint64_t q1 = u_hi / d_hi;
		uint64_t r1 = u_hi - q1 * d_hi;

		/* Correct the digit estimate downward while it is provably too large. */
		while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
			q1 -= 1;
			r1 += d_hi;
			if ((r1 >> 32) != 0) {
				break;
			}
		}

		/* Multiply and subtract. */
		u_hi = (u_hi << 32) - q1 * d;

		/* Compute the second quotient digit, q0 */
		uint64_t q0 = u_hi / d_hi;
		uint64_t r0 = u_hi - q0 * d_hi;

		/* Same downward correction for the second digit. */
		while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
			q0 -= 1;
			r0 += d_hi;
			if ((r0 >> 32) != 0) {
				break;
			}
		}
		const uint64_t q = (q1 << 32) | (uint32_t) q0;
#endif
		/* m = floor(2^(64 + l) / d) - 2^64 + 1, truncated to 64 bits. */
		result.m = q + UINT64_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
324 | |
325 | static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) { |
326 | #if SIZE_MAX == UINT32_MAX |
327 | const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d); |
328 | #elif SIZE_MAX == UINT64_MAX |
329 | const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d); |
330 | #else |
331 | #error Unsupported platform |
332 | #endif |
333 | struct fxdiv_divisor_size_t size_result = { |
334 | (size_t) uint_result.value, |
335 | (size_t) uint_result.m, |
336 | uint_result.s1, |
337 | uint_result.s2 |
338 | }; |
339 | return size_result; |
340 | } |
341 | |
342 | static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { |
343 | const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m); |
344 | return (t + ((n - t) >> divisor.s1)) >> divisor.s2; |
345 | } |
346 | |
347 | static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { |
348 | const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m); |
349 | return (t + ((n - t) >> divisor.s1)) >> divisor.s2; |
350 | } |
351 | |
352 | static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { |
353 | #if SIZE_MAX == UINT32_MAX |
354 | const struct fxdiv_divisor_uint32_t uint32_divisor = { |
355 | (uint32_t) divisor.value, |
356 | (uint32_t) divisor.m, |
357 | divisor.s1, |
358 | divisor.s2 |
359 | }; |
360 | return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor); |
361 | #elif SIZE_MAX == UINT64_MAX |
362 | const struct fxdiv_divisor_uint64_t uint64_divisor = { |
363 | (uint64_t) divisor.value, |
364 | (uint64_t) divisor.m, |
365 | divisor.s1, |
366 | divisor.s2 |
367 | }; |
368 | return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor); |
369 | #else |
370 | #error Unsupported platform |
371 | #endif |
372 | } |
373 | |
374 | static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { |
375 | const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor); |
376 | return n - quotient * divisor.value; |
377 | } |
378 | |
379 | static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { |
380 | const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor); |
381 | return n - quotient * divisor.value; |
382 | } |
383 | |
384 | static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { |
385 | const size_t quotient = fxdiv_quotient_size_t(n, divisor); |
386 | return n - quotient * divisor.value; |
387 | } |
388 | |
389 | static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) { |
390 | const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity); |
391 | return quotient * granularity.value; |
392 | } |
393 | |
394 | static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) { |
395 | const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity); |
396 | return quotient * granularity.value; |
397 | } |
398 | |
399 | static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) { |
400 | const size_t quotient = fxdiv_quotient_size_t(n, granularity); |
401 | return quotient * granularity.value; |
402 | } |
403 | |
404 | static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) { |
405 | const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor); |
406 | const uint32_t remainder = n - quotient * divisor.value; |
407 | struct fxdiv_result_uint32_t result = { quotient, remainder }; |
408 | return result; |
409 | } |
410 | |
411 | static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) { |
412 | const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor); |
413 | const uint64_t remainder = n - quotient * divisor.value; |
414 | struct fxdiv_result_uint64_t result = { quotient, remainder }; |
415 | return result; |
416 | } |
417 | |
418 | static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) { |
419 | const size_t quotient = fxdiv_quotient_size_t(n, divisor); |
420 | const size_t remainder = n - quotient * divisor.value; |
421 | struct fxdiv_result_size_t result = { quotient, remainder }; |
422 | return result; |
423 | } |
424 | |
425 | #endif /* FXDIV_H */ |
426 | |