#ifndef JEMALLOC_INTERNAL_BIT_UTIL_H
#define JEMALLOC_INTERNAL_BIT_UTIL_H

#include "jemalloc/internal/assert.h"

/* Sanity check. */
#if !defined(JEMALLOC_INTERNAL_FFSLL) || !defined(JEMALLOC_INTERNAL_FFSL) \
    || !defined(JEMALLOC_INTERNAL_FFS)
# error JEMALLOC_INTERNAL_FFS{,L,LL} should have been defined by configure
#endif

/*
 * Unlike the builtins and POSIX ffs functions, our ffs requires a non-zero
 * input, and returns the position of the lowest bit set (as opposed to the
 * POSIX versions, which return 1 larger than that position and use a return
 * value of zero as a sentinel).  This tends to simplify logic in callers, and
 * allows for consistency with the builtins we build fls on top of.
 */
static inline unsigned
ffs_llu(unsigned long long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSLL(x) - 1;
}

static inline unsigned
ffs_lu(unsigned long x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFSL(x) - 1;
}

static inline unsigned
ffs_u(unsigned x) {
	util_assume(x != 0);
	return JEMALLOC_INTERNAL_FFS(x) - 1;
}
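
/*
 * Illustrative example (not part of the API): for x = 0x18 (0b11000), the
 * lowest set bit is bit 3, so ffs_u(0x18) == 3, whereas the POSIX-style
 * ffs(0x18) would return 4, and ffs(0) would return the sentinel 0.
 */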

#define DO_FLS_SLOW(x, suffix) do {					\
	util_assume(x != 0);						\
	x |= (x >> 1);							\
	x |= (x >> 2);							\
	x |= (x >> 4);							\
	x |= (x >> 8);							\
	x |= (x >> 16);							\
	if (sizeof(x) > 4) {						\
		/*							\
		 * If sizeof(x) is 4, then the expression "x >> 32"	\
		 * will generate compiler warnings even if the code	\
		 * never executes.  This circumvents the warning, and	\
		 * gets compiled out in optimized builds.		\
		 */							\
		int constant_32 = sizeof(x) * 4;			\
		x |= (x >> constant_32);				\
	}								\
	x++;								\
	if (x == 0) {							\
		return 8 * sizeof(x) - 1;				\
	}								\
	return ffs_##suffix(x) - 1;					\
} while (0)
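
/*
 * A quick worked sketch of the trick above (illustrative only): for
 * x = 0b101000, the shift-or cascade smears the highest set bit downward,
 * giving 0b111111; incrementing yields 0b1000000, whose single set bit sits
 * one past the original msb, so ffs(...) - 1 recovers fls(x) == 5.  The
 * x == 0 check catches the all-ones input, where the increment wraps to zero.
 */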

static inline unsigned
fls_llu_slow(unsigned long long x) {
	DO_FLS_SLOW(x, llu);
}

static inline unsigned
fls_lu_slow(unsigned long x) {
	DO_FLS_SLOW(x, lu);
}

static inline unsigned
fls_u_slow(unsigned x) {
	DO_FLS_SLOW(x, u);
}

#undef DO_FLS_SLOW

#ifdef JEMALLOC_HAVE_BUILTIN_CLZ
static inline unsigned
fls_llu(unsigned long long x) {
	util_assume(x != 0);
	/*
	 * Note that the xor here is more naturally written as subtraction:
	 * the index of the highest set bit is the width of the type, minus
	 * one, minus the number of leading zero bits.  But GCC implements
	 * that as:
	 *     bsr     edi, edi
	 *     mov     eax, 31
	 *     xor     edi, 31
	 *     sub     eax, edi
	 * If we write it as xor instead, then we get
	 *     bsr     eax, edi
	 * as desired.
	 */
	return (8 * sizeof(x) - 1) ^ __builtin_clzll(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clzl(x);
}

static inline unsigned
fls_u(unsigned x) {
	util_assume(x != 0);
	return (8 * sizeof(x) - 1) ^ __builtin_clz(x);
}
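
/*
 * Sanity example (illustrative): with a 64-bit long long, x = 1ULL << 40 has
 * __builtin_clzll(x) == 23, and 63 ^ 23 == 40 == 63 - 23.  The xor and the
 * subtraction agree whenever x != 0, since clz is then at most 63, i.e. the
 * subtraction never borrows out of the low six bits.
 */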
#elif defined(_MSC_VER)

#if LG_SIZEOF_PTR == 3
#define DO_BSR64(bit, x) _BitScanReverse64(&bit, x)
#else
/*
 * This never actually runs; we're just dodging a compiler error in the
 * never-taken branch where sizeof(void *) == 8.
 */
#define DO_BSR64(bit, x) bit = 0; unreachable()
#endif

#define DO_FLS(x) do {							\
	if (x == 0) {							\
		return 8 * sizeof(x);					\
	}								\
	unsigned long bit;						\
	if (sizeof(x) == 4) {						\
		_BitScanReverse(&bit, (unsigned)x);			\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 8) {			\
		DO_BSR64(bit, x);					\
		return (unsigned)bit;					\
	}								\
	if (sizeof(x) == 8 && sizeof(void *) == 4) {			\
		/* Dodge a compiler warning, as above. */		\
		int constant_32 = sizeof(x) * 4;			\
		if (_BitScanReverse(&bit,				\
		    (unsigned)(x >> constant_32))) {			\
			return 32 + (unsigned)bit;			\
		} else {						\
			_BitScanReverse(&bit, (unsigned)x);		\
			return (unsigned)bit;				\
		}							\
	}								\
	unreachable();							\
} while (0)
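
/*
 * Worked example for the 32-bit-pointer path (illustrative): for
 * x = 1ULL << 40, the high word x >> 32 is 1 << 8, so the first
 * _BitScanReverse sets bit = 8 and we return 32 + 8 == 40; a value with an
 * empty high word falls through to a scan of the low word instead.
 */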

static inline unsigned
fls_llu(unsigned long long x) {
	DO_FLS(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	DO_FLS(x);
}

static inline unsigned
fls_u(unsigned x) {
	DO_FLS(x);
}

#undef DO_FLS
#undef DO_BSR64
#else

static inline unsigned
fls_llu(unsigned long long x) {
	return fls_llu_slow(x);
}

static inline unsigned
fls_lu(unsigned long x) {
	return fls_lu_slow(x);
}

static inline unsigned
fls_u(unsigned x) {
	return fls_u_slow(x);
}
#endif

#if LG_SIZEOF_LONG_LONG > 3
# error "Haven't implemented popcount for 16-byte ints."
#endif

#define DO_POPCOUNT(x, type) do {					\
	/*								\
	 * Algorithm from an old AMD optimization reference manual.	\
	 * We're putting a little bit more work than you might expect	\
	 * into the no-intrinsic case, since we only support the	\
	 * GCC intrinsics spelling of popcount (for now).  Detecting	\
	 * whether or not the popcount builtin is actually usable in	\
	 * MSVC is nontrivial.						\
	 */								\
									\
	type bmul = (type)0x0101010101010101ULL;			\
									\
	/*								\
	 * Replace each 2 bits with the sideways sum of the original	\
	 * values.  0x5 = 0b0101.					\
	 *								\
	 * You might expect this to be:					\
	 *   x = (x & 0x55...) + ((x >> 1) & 0x55...).			\
	 * That costs an extra mask relative to this, though.		\
	 */								\
	x = x - ((x >> 1) & (0x55U * bmul));				\
	/* Replace each 4 bits with their sideways sum.  0x3 = 0b0011. */ \
	x = (x & (bmul * 0x33U)) + ((x >> 2) & (bmul * 0x33U));	\
	/*								\
	 * Replace each 8 bits with their sideways sum.  Note that we	\
	 * can't overflow within each 4-bit sum here, so we can skip	\
	 * the initial mask.						\
	 */								\
	x = (x + (x >> 4)) & (bmul * 0x0FU);				\
	/*								\
	 * None of the partial sums in this multiplication (viewed in	\
	 * base-256) can overflow into the next digit.  So the least	\
	 * significant byte of the product will be the least		\
	 * significant byte of the original value, the second least	\
	 * significant byte will be the sum of the two least		\
	 * significant bytes of the original value, and so on.		\
	 * Importantly, the high byte will be the byte-wise sum of all	\
	 * the bytes of the original value.				\
	 */								\
	x = x * bmul;							\
	x >>= ((sizeof(x) - 1) * 8);					\
	return (unsigned)x;						\
} while (0)
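
/*
 * A small worked trace (illustrative): for x = 0b01101101, the first step
 * leaves each 2-bit field holding the popcount of its pair (1, 1, 2, 1 from
 * the high end), the second step sums adjacent pairs into 4-bit fields
 * (2 and 3), the third sums nibbles into bytes (5), and the final
 * multiply/shift by bmul accumulates every byte's count into the top byte,
 * so 5, the number of set bits, is returned.
 */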

static inline unsigned
popcount_u_slow(unsigned bitmap) {
	DO_POPCOUNT(bitmap, unsigned);
}

static inline unsigned
popcount_lu_slow(unsigned long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long);
}

static inline unsigned
popcount_llu_slow(unsigned long long bitmap) {
	DO_POPCOUNT(bitmap, unsigned long long);
}

#undef DO_POPCOUNT

static inline unsigned
popcount_u(unsigned bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNT
	return JEMALLOC_INTERNAL_POPCOUNT(bitmap);
#else
	return popcount_u_slow(bitmap);
#endif
}

static inline unsigned
popcount_lu(unsigned long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTL
	return JEMALLOC_INTERNAL_POPCOUNTL(bitmap);
#else
	return popcount_lu_slow(bitmap);
#endif
}

static inline unsigned
popcount_llu(unsigned long long bitmap) {
#ifdef JEMALLOC_INTERNAL_POPCOUNTLL
	return JEMALLOC_INTERNAL_POPCOUNTLL(bitmap);
#else
	return popcount_llu_slow(bitmap);
#endif
}

/*
 * Clears the lowest set bit in *bitmap, and returns its position.  *bitmap
 * must not be 0.
 */
static inline size_t
cfs_lu(unsigned long *bitmap) {
	util_assume(*bitmap != 0);
	size_t bit = ffs_lu(*bitmap);
	*bitmap ^= ZU(1) << bit;
	return bit;
}
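
/*
 * Typical usage (an illustrative sketch, not code used elsewhere in this
 * header): visiting every set bit of a word, lowest first.
 *
 *	unsigned long mask = ...;
 *	while (mask != 0) {
 *		size_t bit = cfs_lu(&mask);
 *		process(bit);	// hypothetical per-bit callback
 *	}
 */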

static inline unsigned
ffs_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return ffs_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return ffs_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return ffs_llu(x);
#else
#error No implementation for size_t ffs()
#endif
}

static inline unsigned
fls_zu(size_t x) {
#if LG_SIZEOF_PTR == LG_SIZEOF_INT
	return fls_u(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG
	return fls_lu(x);
#elif LG_SIZEOF_PTR == LG_SIZEOF_LONG_LONG
	return fls_llu(x);
#else
#error No implementation for size_t fls()
#endif
}

static inline unsigned
ffs_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return ffs_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return ffs_llu(x);
#else
#error No implementation for 64-bit ffs()
#endif
}

static inline unsigned
fls_u64(uint64_t x) {
#if LG_SIZEOF_LONG == 3
	return fls_lu(x);
#elif LG_SIZEOF_LONG_LONG == 3
	return fls_llu(x);
#else
#error No implementation for 64-bit fls()
#endif
}

static inline unsigned
ffs_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return ffs_u(x);
#else
#error No implementation for 32-bit ffs()
#endif
}

static inline unsigned
fls_u32(uint32_t x) {
#if LG_SIZEOF_INT == 2
	return fls_u(x);
#else
#error No implementation for 32-bit fls()
#endif
}

/* Compute the smallest power of 2 that is >= x. */
static inline uint64_t
pow2_ceil_u64(uint64_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u64(x - 1);
	/*
	 * Range-check; it's on the callers to ensure that the result of this
	 * call won't overflow.
	 */
	assert(msb_on_index < 63);
	return 1ULL << (msb_on_index + 1);
}
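
/*
 * Illustrative: pow2_ceil_u64(5) computes fls_u64(4) == 2 and returns
 * 1 << 3 == 8, while pow2_ceil_u64(8) computes fls_u64(7) == 2 and returns 8
 * unchanged; subtracting one first is what keeps exact powers of two fixed.
 */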

static inline uint32_t
pow2_ceil_u32(uint32_t x) {
	if (unlikely(x <= 1)) {
		return x;
	}
	size_t msb_on_index = fls_u32(x - 1);
	/* As above. */
	assert(msb_on_index < 31);
	return 1U << (msb_on_index + 1);
}

static inline size_t
pow2_ceil_zu(size_t x) {
#if (LG_SIZEOF_PTR == 3)
	return pow2_ceil_u64(x);
#else
	return pow2_ceil_u32(x);
#endif
}

static inline unsigned
lg_floor(size_t x) {
	util_assume(x != 0);
#if (LG_SIZEOF_PTR == 3)
	return fls_u64(x);
#else
	return fls_u32(x);
#endif
}

static inline unsigned
lg_ceil(size_t x) {
	return lg_floor(x) + ((x & (x - 1)) == 0 ? 0 : 1);
}
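
/*
 * Illustrative: lg_floor(12) == 3 and lg_ceil(12) == 4, while for an exact
 * power of two such as 8 the (x & (x - 1)) == 0 test makes both return 3.
 */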

/* Compile-time versions of lg_floor and lg_ceil. */
#define LG_FLOOR_1(x) 0
#define LG_FLOOR_2(x) (x < (1ULL << 1) ? LG_FLOOR_1(x) : 1 + LG_FLOOR_1(x >> 1))
#define LG_FLOOR_4(x) (x < (1ULL << 2) ? LG_FLOOR_2(x) : 2 + LG_FLOOR_2(x >> 2))
#define LG_FLOOR_8(x) (x < (1ULL << 4) ? LG_FLOOR_4(x) : 4 + LG_FLOOR_4(x >> 4))
#define LG_FLOOR_16(x) (x < (1ULL << 8) ? LG_FLOOR_8(x) : 8 + LG_FLOOR_8(x >> 8))
#define LG_FLOOR_32(x) (x < (1ULL << 16) ? LG_FLOOR_16(x) : 16 + LG_FLOOR_16(x >> 16))
#define LG_FLOOR_64(x) (x < (1ULL << 32) ? LG_FLOOR_32(x) : 32 + LG_FLOOR_32(x >> 32))
#if LG_SIZEOF_PTR == 2
# define LG_FLOOR(x) LG_FLOOR_32((x))
#else
# define LG_FLOOR(x) LG_FLOOR_64((x))
#endif

#define LG_CEIL(x) (LG_FLOOR(x) + (((x) & ((x) - 1)) == 0 ? 0 : 1))
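
/*
 * Example use (illustrative): since LG_CEIL expands to an integer constant
 * expression for constant arguments, it can size arrays or feed static
 * asserts, e.g.:
 *
 *	char lg_table[LG_CEIL(100) + 1];	// LG_CEIL(100) == 7
 */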

#endif /* JEMALLOC_INTERNAL_BIT_UTIL_H */