1#pragma once
2#ifndef FXDIV_H
3#define FXDIV_H
4
5#if defined(__cplusplus) && (__cplusplus >= 201103L)
6 #include <cstddef>
7 #include <cstdint>
8 #include <climits>
9#elif !defined(__OPENCL_VERSION__)
10 #include <stddef.h>
11 #include <stdint.h>
12 #include <limits.h>
13#endif
14
15#if defined(_MSC_VER)
16 #include <intrin.h>
17 #if defined(_M_IX86) || defined(_M_X64)
18 #include <immintrin.h>
19 #endif
20#endif
21
22#ifndef FXDIV_USE_INLINE_ASSEMBLY
23 #define FXDIV_USE_INLINE_ASSEMBLY 0
24#endif
25
/*
 * Computes the full 64-bit product of two 32-bit unsigned integers.
 *
 * On 32-bit x86 MSVC the __emulu intrinsic is used so the compiler emits a
 * single 32x32->64 multiply; elsewhere the portable widening multiply is used.
 */
static inline uint64_t fxdiv_mulext_uint32_t(uint32_t a, uint32_t b) {
#if defined(_MSC_VER) && defined(_M_IX86)
	return (uint64_t) __emulu((unsigned int) a, (unsigned int) b);
#else
	return (uint64_t) a * (uint64_t) b;
#endif
}
33
/*
 * Returns the high 32 bits of the 64-bit product of two 32-bit unsigned
 * integers, i.e. (a * b) >> 32 evaluated in full 64-bit precision.
 *
 * Platform intrinsics (OpenCL mul_hi, CUDA __umulhi, MSVC __emulu /
 * _MulUnsignedHigh) are preferred where available; the last branch is the
 * portable fallback.
 */
static inline uint32_t fxdiv_mulhi_uint32_t(uint32_t a, uint32_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint32_t) __umulhi((unsigned int) a, (unsigned int) b);
#elif defined(_MSC_VER) && defined(_M_IX86)
	return (uint32_t) (__emulu((unsigned int) a, (unsigned int) b) >> 32);
#elif defined(_MSC_VER) && defined(_M_ARM)
	return (uint32_t) _MulUnsignedHigh((unsigned long) a, (unsigned long) b);
#else
	return (uint32_t) (((uint64_t) a * (uint64_t) b) >> 32);
#endif
}
47
/*
 * Returns the high 64 bits of the 128-bit product of two 64-bit unsigned
 * integers, i.e. (a * b) >> 64 evaluated in full 128-bit precision.
 */
static inline uint64_t fxdiv_mulhi_uint64_t(uint64_t a, uint64_t b) {
#if defined(__OPENCL_VERSION__)
	return mul_hi(a, b);
#elif defined(__CUDA_ARCH__)
	return (uint64_t) __umul64hi((unsigned long long) a, (unsigned long long) b);
#elif defined(_MSC_VER) && defined(_M_X64)
	return (uint64_t) __umulh((unsigned __int64) a, (unsigned __int64) b);
#elif defined(__GNUC__) && defined(__SIZEOF_INT128__)
	return (uint64_t) (((((unsigned __int128) a) * ((unsigned __int128) b))) >> 64);
#else
	/*
	 * Portable fallback: schoolbook multiplication on 32-bit digits.
	 * Only the partial products that contribute to the upper 64 bits of
	 * the 128-bit result are summed.
	 */
	const uint32_t a_lo = (uint32_t) a;
	const uint32_t a_hi = (uint32_t) (a >> 32);
	const uint32_t b_lo = (uint32_t) b;
	const uint32_t b_hi = (uint32_t) (b >> 32);

	/* t = a_hi * b_lo plus the carry (high half) out of a_lo * b_lo */
	const uint64_t t = fxdiv_mulext_uint32_t(a_hi, b_lo) +
		(uint64_t) fxdiv_mulhi_uint32_t(a_lo, b_lo);
	/* High result = a_hi * b_hi + upper digit of t + carry from the remaining middle partial products */
	return fxdiv_mulext_uint32_t(a_hi, b_hi) + (t >> 32) +
		((fxdiv_mulext_uint32_t(a_lo, b_hi) + (uint64_t) (uint32_t) t) >> 32);
#endif
}
69
/*
 * Returns the high half of the double-width product of two size_t values,
 * dispatching at compile time to the 32- or 64-bit implementation depending
 * on the width of size_t (detected via SIZE_MAX).
 */
static inline size_t fxdiv_mulhi_size_t(size_t a, size_t b) {
#if SIZE_MAX == UINT32_MAX
	return (size_t) fxdiv_mulhi_uint32_t((uint32_t) a, (uint32_t) b);
#elif SIZE_MAX == UINT64_MAX
	return (size_t) fxdiv_mulhi_uint64_t((uint64_t) a, (uint64_t) b);
#else
	#error Unsupported platform
#endif
}
79
/*
 * Precomputed parameters for fast division of uint32_t values by a fixed
 * divisor, as produced by fxdiv_init_uint32_t. The quotient is evaluated
 * without a hardware divide as
 *   t = mulhi(n, m); q = (t + ((n - t) >> s1)) >> s2
 * (see fxdiv_quotient_uint32_t).
 */
struct fxdiv_divisor_uint32_t {
	uint32_t value; /* the original divisor */
	uint32_t m;     /* magic multiplier */
	uint8_t s1;     /* first correction shift (0 when value == 1, else 1) */
	uint8_t s2;     /* final shift */
};
86
/* Quotient and remainder pair returned by fxdiv_divide_uint32_t. */
struct fxdiv_result_uint32_t {
	uint32_t quotient;
	uint32_t remainder;
};
91
/*
 * Precomputed parameters for fast division of uint64_t values by a fixed
 * divisor, as produced by fxdiv_init_uint64_t. Fields have the same meaning
 * as in struct fxdiv_divisor_uint32_t (see fxdiv_quotient_uint64_t).
 */
struct fxdiv_divisor_uint64_t {
	uint64_t value; /* the original divisor */
	uint64_t m;     /* magic multiplier */
	uint8_t s1;     /* first correction shift (0 when value == 1, else 1) */
	uint8_t s2;     /* final shift */
};
98
/* Quotient and remainder pair returned by fxdiv_divide_uint64_t. */
struct fxdiv_result_uint64_t {
	uint64_t quotient;
	uint64_t remainder;
};
103
/*
 * Precomputed parameters for fast division of size_t values by a fixed
 * divisor, as produced by fxdiv_init_size_t. A thin size_t-typed wrapper
 * around the uint32_t/uint64_t divisor structure matching SIZE_MAX.
 */
struct fxdiv_divisor_size_t {
	size_t value; /* the original divisor */
	size_t m;     /* magic multiplier */
	uint8_t s1;   /* first correction shift (0 when value == 1, else 1) */
	uint8_t s2;   /* final shift */
};
110
/* Quotient and remainder pair returned by fxdiv_divide_size_t. */
struct fxdiv_result_size_t {
	size_t quotient;
	size_t remainder;
};
115
/*
 * Precomputes the magic multiplier and shifts for fast division by d.
 *
 * Round-up variant of division by invariant integers (Granlund & Montgomery;
 * see also Hacker's Delight): with l = ceil(log2(d)), the multiplier is
 *   m = floor(2^32 * (2^l - d) / d) + 1
 * and quotients are then evaluated as
 *   t = mulhi(n, m); q = (t + ((n - t) >> s1)) >> s2
 * (see fxdiv_quotient_uint32_t).
 *
 * Precondition: d != 0 (d == 0 leads to a division by zero below).
 * d == 1 is special-cased so that m = 1, s1 = s2 = 0 yields q == n.
 */
static inline struct fxdiv_divisor_uint32_t fxdiv_init_uint32_t(uint32_t d) {
	struct fxdiv_divisor_uint32_t result = { d };
	if (d == 1) {
		result.m = UINT32_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* l_minus_1 = ceil(log2(d)) - 1 = index of the highest set bit of d - 1 */
		#if defined(__OPENCL_VERSION__)
			const uint32_t l_minus_1 = 31 - clz(d - 1);
		#elif defined(__CUDA_ARCH__)
			const uint32_t l_minus_1 = 31 - __clz((int) (d - 1));
		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64))
			unsigned long l_minus_1;
			_BitScanReverse(&l_minus_1, (unsigned long) (d - 1));
		#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__)) && FXDIV_USE_INLINE_ASSEMBLY
			uint32_t l_minus_1;
			__asm__("BSRL %[d_minus_1], %[l_minus_1]"
				: [l_minus_1] "=r" (l_minus_1)
				: [d_minus_1] "r" (d - 1)
				: "cc");
		#elif defined(__GNUC__)
			const uint32_t l_minus_1 = 31 - __builtin_clz(d - 1);
		#else
			/* Based on Algorithm 2 from Hacker's delight */

			/* Binary search for the highest set bit of d - 1 */
			uint32_t l_minus_1 = 0;
			uint32_t x = d - 1;
			uint32_t y = x >> 16;
			if (y != 0) {
				l_minus_1 += 16;
				x = y;
			}
			y = x >> 8;
			if (y != 0) {
				l_minus_1 += 8;
				x = y;
			}
			y = x >> 4;
			if (y != 0) {
				l_minus_1 += 4;
				x = y;
			}
			y = x >> 2;
			if (y != 0) {
				l_minus_1 += 2;
				x = y;
			}
			if ((x & 2) != 0) {
				l_minus_1 += 1;
			}
		#endif
		/* u_hi = 2^l - d; the magic multiplier is floor(2^32 * u_hi / d) + 1 */
		uint32_t u_hi = (UINT32_C(2) << (uint32_t) l_minus_1) - d;

		/* Division of 64-bit number u_hi:UINT32_C(0) by 32-bit number d, 32-bit quotient output q */
		#if defined(__GNUC__) && defined(__i386__) && FXDIV_USE_INLINE_ASSEMBLY
			uint32_t q;
			__asm__("DIVL %[d]"
				: "=a" (q), "+d" (u_hi)
				: [d] "r" (d), "a" (0)
				: "cc");
		#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && (defined(_M_IX86) || defined(_M_X64))
			unsigned int remainder;
			const uint32_t q = (uint32_t) _udiv64((unsigned __int64) ((uint64_t) u_hi << 32), (unsigned int) d, &remainder);
		#else
			const uint32_t q = ((uint64_t) u_hi << 32) / d;
		#endif

		result.m = q + UINT32_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
189
/*
 * Precomputes the magic multiplier and shifts for fast 64-bit division by d.
 *
 * Same round-up variant of division by invariant integers as in
 * fxdiv_init_uint32_t: with l = ceil(log2(d)),
 *   m = floor(2^64 * (2^l - d) / d) + 1.
 * Branches that fall through to the portable 128-by-64-bit long division
 * additionally compute nlz_d (the number of leading zero bits of d) to
 * normalize the divisor.
 *
 * Precondition: d != 0.
 * d == 1 is special-cased so that m = 1, s1 = s2 = 0 yields q == n.
 */
static inline struct fxdiv_divisor_uint64_t fxdiv_init_uint64_t(uint64_t d) {
	struct fxdiv_divisor_uint64_t result = { d };
	if (d == 1) {
		result.m = UINT64_C(1);
		result.s1 = 0;
		result.s2 = 0;
	} else {
		/* l_minus_1 = index of the highest set bit of d - 1 */
		#if defined(__OPENCL_VERSION__)
			const uint32_t nlz_d = clz(d);
			const uint32_t l_minus_1 = 63 - clz(d - 1);
		#elif defined(__CUDA_ARCH__)
			const uint32_t nlz_d = __clzll((long long) d);
			const uint32_t l_minus_1 = 63 - __clzll((long long) (d - 1));
		#elif defined(_MSC_VER) && (defined(_M_X64) || defined(_M_ARM64))
			unsigned long l_minus_1;
			_BitScanReverse64(&l_minus_1, (unsigned __int64) (d - 1));
			unsigned long bsr_d;
			_BitScanReverse64(&bsr_d, (unsigned __int64) d);
			/* nlz(d) = 63 - bsr(d) = bsr(d) ^ 0x3F for any non-zero d */
			const uint32_t nlz_d = bsr_d ^ 0x3F;
		#elif defined(_MSC_VER) && (defined(_M_IX86) || defined(_M_ARM))
			/* 32-bit MSVC: scan the two 32-bit halves of d - 1 separately */
			const uint64_t d_minus_1 = d - 1;
			const uint8_t d_is_power_of_2 = (d & d_minus_1) == 0;
			unsigned long l_minus_1;
			if ((uint32_t) (d_minus_1 >> 32) == 0) {
				_BitScanReverse(&l_minus_1, (unsigned long) d_minus_1);
			} else {
				_BitScanReverse(&l_minus_1, (unsigned long) (uint32_t) (d_minus_1 >> 32));
				l_minus_1 += 32;
			}
			/* bsr(d) = bsr(d - 1) + 1 exactly when d is a power of two */
			const uint32_t nlz_d = ((uint8_t) l_minus_1 ^ UINT8_C(0x3F)) - d_is_power_of_2;
		#elif defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
			/* nlz_d is not needed here: the matching DIVQ inline-assembly branch below is taken */
			uint64_t l_minus_1;
			__asm__("BSRQ %[d_minus_1], %[l_minus_1]"
				: [l_minus_1] "=r" (l_minus_1)
				: [d_minus_1] "r" (d - 1)
				: "cc");
		#elif defined(__GNUC__)
			const uint32_t l_minus_1 = 63 - __builtin_clzll(d - 1);
			const uint32_t nlz_d = __builtin_clzll(d);
		#else
			/* Based on Algorithm 2 from Hacker's delight */
			const uint64_t d_minus_1 = d - 1;
			const uint32_t d_is_power_of_2 = (d & d_minus_1) == 0;
			/* Binary search for the highest set bit of d - 1 */
			uint32_t l_minus_1 = 0;
			uint32_t x = (uint32_t) d_minus_1;
			uint32_t y = d_minus_1 >> 32;
			if (y != 0) {
				l_minus_1 += 32;
				x = y;
			}
			y = x >> 16;
			if (y != 0) {
				l_minus_1 += 16;
				x = y;
			}
			y = x >> 8;
			if (y != 0) {
				l_minus_1 += 8;
				x = y;
			}
			y = x >> 4;
			if (y != 0) {
				l_minus_1 += 4;
				x = y;
			}
			y = x >> 2;
			if (y != 0) {
				l_minus_1 += 2;
				x = y;
			}
			if ((x & 2) != 0) {
				l_minus_1 += 1;
			}
			/* bsr(d) = bsr(d - 1) + 1 exactly when d is a power of two */
			const uint32_t nlz_d = (l_minus_1 ^ UINT32_C(0x3F)) - d_is_power_of_2;
		#endif
		/* u_hi = 2^l - d; the magic multiplier is floor(2^64 * u_hi / d) + 1 */
		uint64_t u_hi = (UINT64_C(2) << (uint32_t) l_minus_1) - d;

		/* Division of 128-bit number u_hi:UINT64_C(0) by 64-bit number d, 64-bit quotient output q */
		#if defined(__GNUC__) && defined(__x86_64__) && FXDIV_USE_INLINE_ASSEMBLY
			uint64_t q;
			__asm__("DIVQ %[d]"
				: "=a" (q), "+d" (u_hi)
				: [d] "r" (d), "a" (UINT64_C(0))
				: "cc");
		#elif 0 && defined(__GNUC__) && defined(__SIZEOF_INT128__)
			/* GCC, Clang, and Intel Compiler fail to inline optimized implementation and call into support library for 128-bit division */
			const uint64_t q = (uint64_t) (((unsigned __int128) u_hi << 64) / ((unsigned __int128) d));
		#elif (defined(_MSC_VER) && _MSC_VER >= 1920) && !defined(__clang__) && !defined(__INTEL_COMPILER) && defined(_M_X64)
			unsigned __int64 remainder;
			const uint64_t q = (uint64_t) _udiv128((unsigned __int64) u_hi, 0, (unsigned __int64) d, &remainder);
		#else
			/* Implementation based on code from Hacker's delight */

			/* Normalize divisor and shift divident left */
			d <<= nlz_d;
			u_hi <<= nlz_d;
			/* Break divisor up into two 32-bit digits */
			const uint64_t d_hi = (uint32_t) (d >> 32);
			const uint32_t d_lo = (uint32_t) d;

			/* Compute the first quotient digit, q1 */
			uint64_t q1 = u_hi / d_hi;
			uint64_t r1 = u_hi - q1 * d_hi;

			/* Correct the estimated digit downward while it is too large */
			while ((q1 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q1, d_lo) > (r1 << 32)) {
				q1 -= 1;
				r1 += d_hi;
				if ((r1 >> 32) != 0) {
					break;
				}
			}

			/* Multiply and subtract. */
			u_hi = (u_hi << 32) - q1 * d;

			/* Compute the second quotient digit, q0 */
			uint64_t q0 = u_hi / d_hi;
			uint64_t r0 = u_hi - q0 * d_hi;

			/* Same downward correction for the second digit */
			while ((q0 >> 32) != 0 || fxdiv_mulext_uint32_t((uint32_t) q0, d_lo) > (r0 << 32)) {
				q0 -= 1;
				r0 += d_hi;
				if ((r0 >> 32) != 0) {
					break;
				}
			}
			const uint64_t q = (q1 << 32) | (uint32_t) q0;
		#endif
		result.m = q + UINT64_C(1);
		result.s1 = 1;
		result.s2 = (uint8_t) l_minus_1;
	}
	return result;
}
324
325static inline struct fxdiv_divisor_size_t fxdiv_init_size_t(size_t d) {
326#if SIZE_MAX == UINT32_MAX
327 const struct fxdiv_divisor_uint32_t uint_result = fxdiv_init_uint32_t((uint32_t) d);
328#elif SIZE_MAX == UINT64_MAX
329 const struct fxdiv_divisor_uint64_t uint_result = fxdiv_init_uint64_t((uint64_t) d);
330#else
331 #error Unsupported platform
332#endif
333 struct fxdiv_divisor_size_t size_result = {
334 (size_t) uint_result.value,
335 (size_t) uint_result.m,
336 uint_result.s1,
337 uint_result.s2
338 };
339 return size_result;
340}
341
342static inline uint32_t fxdiv_quotient_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
343 const uint32_t t = fxdiv_mulhi_uint32_t(n, divisor.m);
344 return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
345}
346
347static inline uint64_t fxdiv_quotient_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
348 const uint64_t t = fxdiv_mulhi_uint64_t(n, divisor.m);
349 return (t + ((n - t) >> divisor.s1)) >> divisor.s2;
350}
351
352static inline size_t fxdiv_quotient_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
353#if SIZE_MAX == UINT32_MAX
354 const struct fxdiv_divisor_uint32_t uint32_divisor = {
355 (uint32_t) divisor.value,
356 (uint32_t) divisor.m,
357 divisor.s1,
358 divisor.s2
359 };
360 return fxdiv_quotient_uint32_t((uint32_t) n, uint32_divisor);
361#elif SIZE_MAX == UINT64_MAX
362 const struct fxdiv_divisor_uint64_t uint64_divisor = {
363 (uint64_t) divisor.value,
364 (uint64_t) divisor.m,
365 divisor.s1,
366 divisor.s2
367 };
368 return fxdiv_quotient_uint64_t((uint64_t) n, uint64_divisor);
369#else
370 #error Unsupported platform
371#endif
372}
373
374static inline uint32_t fxdiv_remainder_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
375 const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
376 return n - quotient * divisor.value;
377}
378
379static inline uint64_t fxdiv_remainder_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
380 const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
381 return n - quotient * divisor.value;
382}
383
384static inline size_t fxdiv_remainder_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
385 const size_t quotient = fxdiv_quotient_size_t(n, divisor);
386 return n - quotient * divisor.value;
387}
388
389static inline uint32_t fxdiv_round_down_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t granularity) {
390 const uint32_t quotient = fxdiv_quotient_uint32_t(n, granularity);
391 return quotient * granularity.value;
392}
393
394static inline uint64_t fxdiv_round_down_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t granularity) {
395 const uint64_t quotient = fxdiv_quotient_uint64_t(n, granularity);
396 return quotient * granularity.value;
397}
398
399static inline size_t fxdiv_round_down_size_t(size_t n, const struct fxdiv_divisor_size_t granularity) {
400 const size_t quotient = fxdiv_quotient_size_t(n, granularity);
401 return quotient * granularity.value;
402}
403
404static inline struct fxdiv_result_uint32_t fxdiv_divide_uint32_t(uint32_t n, const struct fxdiv_divisor_uint32_t divisor) {
405 const uint32_t quotient = fxdiv_quotient_uint32_t(n, divisor);
406 const uint32_t remainder = n - quotient * divisor.value;
407 struct fxdiv_result_uint32_t result = { quotient, remainder };
408 return result;
409}
410
411static inline struct fxdiv_result_uint64_t fxdiv_divide_uint64_t(uint64_t n, const struct fxdiv_divisor_uint64_t divisor) {
412 const uint64_t quotient = fxdiv_quotient_uint64_t(n, divisor);
413 const uint64_t remainder = n - quotient * divisor.value;
414 struct fxdiv_result_uint64_t result = { quotient, remainder };
415 return result;
416}
417
418static inline struct fxdiv_result_size_t fxdiv_divide_size_t(size_t n, const struct fxdiv_divisor_size_t divisor) {
419 const size_t quotient = fxdiv_quotient_size_t(n, divisor);
420 const size_t remainder = n - quotient * divisor.value;
421 struct fxdiv_result_size_t result = { quotient, remainder };
422 return result;
423}
424
425#endif /* FXDIV_H */
426