#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtins and posix ffs functions, our ffs requires a non-zero
 * input, and returns the position of the lowest bit set (as opposed to the
 * posix versions, which return 1 larger than that position and use a return
 * value of zero as a sentinel).  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
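
/*
 * For example (a hand-checkable illustration, not additional API):
 * ffs_u(0x18) returns 3, the index of the lowest set bit of 0b11000,
 * whereas posix ffs(0x18) would return 4 and posix ffs(0) would return
 * the sentinel 0.
 */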
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}

#define DO_FLS_SLOW(x, suffix) do { \
	util_assume(x != 0); \
	x |= (x >> 1); \
	x |= (x >> 2); \
	x |= (x >> 4); \
	x |= (x >> 8); \
	x |= (x >> 16); \
	if (sizeof(x) > 4) { \
		/* \
		 * If sizeof(x) is 4, then the expression "x >> 32" \
		 * will generate compiler warnings even if the code \
		 * never executes.  This circumvents the warning, and \
		 * gets compiled out in optimized builds. \
		 */ \
		int constant_32 = sizeof(x) * 4; \
		x |= (x >> constant_32); \
	} \
	x++; \
	if (x == 0) { \
		return 8 * sizeof(x) - 1; \
	} \
	return ffs_##suffix(x) - 1; \
} while (0)
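
/*
 * A worked example of the slow path, with x = 0x4a (0b1001010): the
 * shift-or cascade smears the highest set bit downward, giving
 * 0b1111111; incrementing yields 0b10000000, whose sole set bit sits
 * one position above the original high bit, so the result is
 * ffs(0x80) - 1 == 7 - 1 == 6 == fls(0x4a).  The x == 0 test catches
 * the wrap-around case where the smeared value was all-ones.
 */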

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW

#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction;
	 * the index of the last bit set is the number of bits in the type,
	 * minus one, minus the number of leading zero bits.  But GCC
	 * implements that as:
	 *    bsr edi, edi
	 *    mov eax, 31
	 *    xor edi, 31
	 *    sub eax, edi
	 * If we write it as xor instead, then we get
	 *    bsr eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}
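
/*
 * A concrete check of the identity (hand arithmetic, not extra API):
 * for x = 0x4a as an unsigned long long, __builtin_clzll(x) == 57 and
 * 63 ^ 57 == 63 - 57 == 6, the index of the highest set bit.  The xor
 * and the subtraction agree because 63 is all-ones in the six bits a
 * clz result can occupy, so the subtraction never borrows.
 */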

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error for the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do { \
	if (x == 0) { \
		return 8 * sizeof(x); \
	} \
	unsigned long bit; \
	if (sizeof(x) == 4) { \
		_BitScanReverse(&bit, (unsigned)x); \
		return (unsigned)bit; \
	} \
	if (sizeof(x) == 8 && sizeof(void *) == 8) { \
		DO_BSR64(bit, x); \
		return (unsigned)bit; \
	} \
	if (sizeof(x) == 8 && sizeof(void *) == 4) { \
		/* Dodge a compiler warning, as above. */ \
		int constant_32 = sizeof(x) * 4; \
		if (_BitScanReverse(&bit, \
		    (unsigned)(x >> constant_32))) { \
			return 32 + (unsigned)bit; \
		} else { \
			_BitScanReverse(&bit, (unsigned)x); \
			return (unsigned)bit; \
		} \
	} \
	unreachable(); \
} while (0)
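
/*
 * On 32-bit MSVC targets, _BitScanReverse64 is unavailable, so 64-bit
 * inputs are scanned in halves.  For instance, x = 0x380000000 has
 * high half 0x3; scanning that half yields bit 1, so the result is
 * 32 + 1 == 33, matching the highest set bit of x.
 */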

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif

#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do { \
	/* \
	 * Algorithm from an old AMD optimization reference manual. \
	 * We're putting a little bit more work than you might expect \
	 * into the no-intrinsic case, since we only support the \
	 * GCC intrinsics spelling of popcount (for now).  Detecting \
	 * whether or not the popcount builtin is actually usable in \
	 * MSVC is nontrivial. \
	 */ \
	\
	type bmul = (type)0x0101010101010101ULL; \
	\
	/* \
	 * Replace each 2 bits with the sideways sum of the original \
	 * values.  0x5 = 0b0101. \
	 * \
	 * You might expect this to be: \
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...). \
	 * That costs an extra mask relative to this, though. \
	 */ \
	x = x - ((x >> 1) & (0x55U * bmul)); \
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */ \
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U)); \
	/* \
	 * Replace each 8 bits with their sideways sum.  Note that we \
	 * can't overflow within each 4-bit sum here, so we can skip \
	 * the initial mask. \
	 */ \
	x = (x + (x >> 4)) & (bmul * 0x0FU); \
	/* \
	 * None of the partial sums in this multiplication (viewed in \
	 * base-256) can overflow into the next digit.  So the least \
	 * significant byte of the product will be the least \
	 * significant byte of the original value, the second least \
	 * significant byte will be the sum of the two least \
	 * significant bytes of the original value, and so on. \
	 * Importantly, the high byte will be the byte-wise sum of all \
	 * the bytes of the original value. \
	 */ \
	x = x * bmul; \
	x >>= ((sizeof(x) - 1) * 8); \
	return (unsigned)x; \
} while (0)
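
/*
 * Tracing the 32-bit case with x = 0xd6 (0b11010110, five bits set):
 * the first step leaves the 2-bit sums 0x95, the second the 4-bit
 * sums 0x32 (3 bits in the high nibble, 2 in the low), the third the
 * byte sum 0x05, and the multiply-and-shift propagates that byte sum
 * into the top byte, so the macro returns 5.
 */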

static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT

static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the lowest set bit in *bitmap and returns its position.
 * *bitmap *must not* be 0.
 */
static inline size_t
cfs_lu(unsigned long* bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}
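
/*
 * A typical caller iterates over the set bits of a word; a sketch
 * only, with process_bit() standing in for a hypothetical consumer:
 *
 *	unsigned long mask = ...;
 *	while (mask != 0) {
 *		process_bit(cfs_lu(&mask));
 *	}
 */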

static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}

static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
}

static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}
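
/*
 * For instance, pow2_ceil_u64(5) computes fls(4) == 2 and returns
 * 1 << 3 == 8, while pow2_ceil_u64(8) computes fls(7) == 2 and also
 * returns 8; starting from x - 1 is what keeps exact powers of two
 * fixed points.
 */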

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}

/* Compute the smallest power of 2 that is >= x. */
static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}

static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}
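
/*
 * Examples: lg_floor(9) == 3 and lg_ceil(9) == 4, while for an exact
 * power of two such as 8 the x & (x - 1) test is zero and both
 * return 3.
 */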

/* A compile-time version of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
# define LG_FLOOR(x) LG_FLOOR_32((x))
#else
# define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
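
/*
 * These expand to integer constant expressions (e.g., LG_FLOOR(9)
 * evaluates to 3), so they can appear where a compile-time constant
 * is required; a hypothetical use, with MAX_ITEMS standing in for
 * some project constant:
 *
 *	static void *slots[1 << LG_CEIL(MAX_ITEMS)];
 */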

#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */