#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtins and POSIX ffs functions, our ffs requires a non-zero
 * input, and returns the position of the lowest bit set (as opposed to the
 * POSIX versions, which return 1 larger than that position and use a return
 * value of zero as a sentinel).  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}
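
/*
 * Illustrative example (not part of the API): for x = 0x18 (0b11000), the
 * lowest set bit is bit 3, so ffs_u(0x18) == 3, whereas the POSIX-style
 * ffs(0x18) would return 4, and ffs(0) would return the sentinel 0.
 */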

#define DO_FLS_SLOW(x, suffix) do {					\
	util_assume(x != 0);						\
	x |= (x >> 1);							\
	x |= (x >> 2);							\
	x |= (x >> 4);							\
	x |= (x >> 8);							\
	x |= (x >> 16);							\
	if (sizeof(x) > 4) {						\
		/*							\
		 * If sizeof(x) is 4, then the expression "x >> 32"	\
		 * will generate compiler warnings even if the code	\
		 * never executes.  This circumvents the warning, and	\
		 * gets compiled out in optimized builds.		\
		 */							\
		int constant_32 = sizeof(x) * 4;			\
		x |= (x >> constant_32);				\
	}								\
	x++;								\
	if (x == 0) {							\
		return 8 * sizeof(x) - 1;				\
	}								\
	return ffs_##suffix(x) - 1;					\
} while (0)
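
/*
 * A quick worked sketch of the trick above (illustrative only): for
 * x = 0b101000, the shift-or cascade smears the highest set bit downward,
 * giving 0b111111; incrementing yields 0b1000000, whose single set bit sits
 * one past the original msb, so ffs(...) - 1 recovers fls(x) == 5.  The
 * x == 0 check catches the all-ones input, where the increment wraps to zero.
 */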

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW

#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction:
	 * the index of the highest set bit is the width of the type, minus
	 * one, minus the number of leading zero bits.  But GCC implements
	 * that as:
	 *     bsr     edi, edi
	 *     mov     eax, 31
	 *     xor     edi, 31
	 *     sub     eax, edi
	 * If we write it as xor instead, then we get
	 *     bsr     eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
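
/*
 * Sanity example (illustrative): with a 64-bit long long, x = 1ULL << 40 has
 * __builtin_clzll(x) == 23, and 63 ^ 23 == 40 == 63 - 23.  The xor and the
 * subtraction agree whenever x != 0, since clz is then at most 63, i.e. the
 * subtraction never borrows out of the low six bits.
 */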
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error in the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do {							\
	if (x == 0) {							\
		return 8 * sizeof(x);					\
	}								\
	unsigned long bit;						\
	if (sizeof(x) == 4) {						\
		_BitScanReverse(&bit, (unsigned)x);			\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
		DO_BSR64(bit, x);					\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
		/* Dodge a compiler warning, as above. */		\
		int constant_32 = sizeof(x) * 4;			\
		if (_BitScanReverse(&bit,				\
		    (unsigned)(x >> constant_32))) {			\
			return 32 + (unsigned)bit;			\
		} else {						\
			_BitScanReverse(&bit, (unsigned)x);		\
			return (unsigned)bit;				\
		}							\
	}								\
	unreachable();							\
} while (0)
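
/*
 * Worked example for the 32-bit-pointer path (illustrative): for
 * x = 1ULL << 40, the high word x >> 32 is 1 << 8, so the first
 * _BitScanReverse sets bit = 8 and we return 32 + 8 == 40; a value with an
 * empty high word falls through to a scan of the low word instead.
 */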

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif

#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do {					\
	/*								\
	 * Algorithm from an old AMD optimization reference manual.	\
	 * We're putting a little bit more work than you might expect	\
	 * into the no-intrinsic case, since we only support the	\
	 * GCC intrinsics spelling of popcount (for now).  Detecting	\
	 * whether or not the popcount builtin is actually usable in	\
	 * MSVC is nontrivial.						\
	 */								\
									\
	type bmul = (type)0x0101010101010101ULL;			\
									\
	/*								\
	 * Replace each 2 bits with the sideways sum of the original	\
	 * values.  0x5 = 0b0101.					\
	 *								\
	 * You might expect this to be:					\
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...).			\
	 * That costs an extra mask relative to this, though.		\
	 */								\
	x = x - ((x >> 1) & (0x55U * bmul));				\
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */ \
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));	\
	/*								\
	 * Replace each 8 bits with their sideways sum.  Note that we	\
	 * can't overflow within each 4-bit sum here, so we can skip	\
	 * the initial mask.						\
	 */								\
	x = (x + (x >> 4)) & (bmul * 0x0FU);				\
	/*								\
	 * None of the partial sums in this multiplication (viewed in	\
	 * base-256) can overflow into the next digit.  So the least	\
	 * significant byte of the product will be the least		\
	 * significant byte of the original value, the second least	\
	 * significant byte will be the sum of the two least		\
	 * significant bytes of the original value, and so on.		\
	 * Importantly, the high byte will be the byte-wise sum of all	\
	 * the bytes of the original value.				\
	 */								\
	x = x * bmul;							\
	x >>= ((sizeof(x) - 1) * 8);					\
	return (unsigned)x;						\
} while (0)
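
/*
 * A small worked trace (illustrative): for x = 0b01101101, the first step
 * leaves each 2-bit field holding the popcount of its pair (1, 1, 2, 1 from
 * the high end), the second step sums adjacent pairs into 4-bit fields
 * (2 and 3), the third sums nibbles into bytes (5), and the final
 * multiply/shift by bmul accumulates every byte's count into the top byte,
 * so 5, the number of set bits, is returned.
 */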

static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT

static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the lowest set bit in *bitmap, and returns its position.  *bitmap
 * must not be 0.
 */
static inline size_t
cfs_lu(unsigned long *bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}
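
/*
 * Typical usage (an illustrative sketch, not code used elsewhere in this
 * header): visiting every set bit of a word, lowest first.
 *
 *	unsigned long mask = ...;
 *	while (mask != 0) {
 *		size_t bit = cfs_lu(&mask);
 *		process(bit);	// hypothetical per-bit callback
 *	}
 */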

static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}

static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
}

/* Compute the smallest power of 2 that is >= x. */
static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}
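
/*
 * Illustrative: pow2_ceil_u64(5) computes fls_u64(4) == 2 and returns
 * 1 << 3 == 8, while pow2_ceil_u64(8) computes fls_u64(7) == 2 and returns 8
 * unchanged; subtracting one first is what keeps exact powers of two fixed.
 */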

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}

static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}

static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}
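
/*
 * Illustrative: lg_floor(12) == 3 and lg_ceil(12) == 4, while for an exact
 * power of two such as 8 the (x & (x - 1)) == 0 test makes both return 3.
 */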

/* Compile-time versions of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
# define LG_FLOOR(x) LG_FLOOR_32((x))
#else
# define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
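
/*
 * Example use (illustrative): since LG_CEIL expands to an integer constant
 * expression for constant arguments, it can size arrays or feed static
 * asserts, e.g.:
 *
 *	char lg_table[LG_CEIL(100) + 1];	// LG_CEIL(100) == 7
 */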

#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */