1 | /* Bit and bytes utilities. |
2 | |
3 | Bytes swap functions, reverse order of bytes: |
4 | |
5 | - _Py_bswap16(uint16_t) |
6 | - _Py_bswap32(uint32_t) |
7 | - _Py_bswap64(uint64_t) |
8 | */ |
9 | |
10 | #ifndef Py_INTERNAL_BITUTILS_H |
11 | #define Py_INTERNAL_BITUTILS_H |
12 | #ifdef __cplusplus |
13 | extern "C" { |
14 | #endif |
15 | |
16 | #ifndef Py_BUILD_CORE |
17 | # error "this header requires Py_BUILD_CORE define" |
18 | #endif |
19 | |
20 | #if defined(__GNUC__) \ |
21 | && ((__GNUC__ >= 5) || (__GNUC__ == 4) && (__GNUC_MINOR__ >= 8)) |
22 | /* __builtin_bswap16() is available since GCC 4.8, |
23 | __builtin_bswap32() is available since GCC 4.3, |
24 | __builtin_bswap64() is available since GCC 4.3. */ |
25 | # define _PY_HAVE_BUILTIN_BSWAP |
26 | #endif |
27 | |
28 | #ifdef _MSC_VER |
29 | /* Get _byteswap_ushort(), _byteswap_ulong(), _byteswap_uint64() */ |
30 | # include <intrin.h> |
31 | #endif |
32 | |
33 | static inline uint16_t |
34 | _Py_bswap16(uint16_t word) |
35 | { |
36 | #if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap16) |
37 | return __builtin_bswap16(word); |
38 | #elif defined(_MSC_VER) |
39 | Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned short)); |
40 | return _byteswap_ushort(word); |
41 | #else |
42 | // Portable implementation which doesn't rely on circular bit shift |
43 | return ( ((word & UINT16_C(0x00FF)) << 8) |
44 | | ((word & UINT16_C(0xFF00)) >> 8)); |
45 | #endif |
46 | } |
47 | |
48 | static inline uint32_t |
49 | _Py_bswap32(uint32_t word) |
50 | { |
51 | #if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap32) |
52 | return __builtin_bswap32(word); |
53 | #elif defined(_MSC_VER) |
54 | Py_BUILD_ASSERT(sizeof(word) == sizeof(unsigned long)); |
55 | return _byteswap_ulong(word); |
56 | #else |
57 | // Portable implementation which doesn't rely on circular bit shift |
58 | return ( ((word & UINT32_C(0x000000FF)) << 24) |
59 | | ((word & UINT32_C(0x0000FF00)) << 8) |
60 | | ((word & UINT32_C(0x00FF0000)) >> 8) |
61 | | ((word & UINT32_C(0xFF000000)) >> 24)); |
62 | #endif |
63 | } |
64 | |
65 | static inline uint64_t |
66 | _Py_bswap64(uint64_t word) |
67 | { |
68 | #if defined(_PY_HAVE_BUILTIN_BSWAP) || _Py__has_builtin(__builtin_bswap64) |
69 | return __builtin_bswap64(word); |
70 | #elif defined(_MSC_VER) |
71 | return _byteswap_uint64(word); |
72 | #else |
73 | // Portable implementation which doesn't rely on circular bit shift |
74 | return ( ((word & UINT64_C(0x00000000000000FF)) << 56) |
75 | | ((word & UINT64_C(0x000000000000FF00)) << 40) |
76 | | ((word & UINT64_C(0x0000000000FF0000)) << 24) |
77 | | ((word & UINT64_C(0x00000000FF000000)) << 8) |
78 | | ((word & UINT64_C(0x000000FF00000000)) >> 8) |
79 | | ((word & UINT64_C(0x0000FF0000000000)) >> 24) |
80 | | ((word & UINT64_C(0x00FF000000000000)) >> 40) |
81 | | ((word & UINT64_C(0xFF00000000000000)) >> 56)); |
82 | #endif |
83 | } |
84 | |
85 | |
86 | // Population count: count the number of 1's in 'x' |
87 | // (number of bits set to 1), also known as the hamming weight. |
88 | // |
89 | // Implementation note. CPUID is not used, to test if x86 POPCNT instruction |
90 | // can be used, to keep the implementation simple. For example, Visual Studio |
91 | // __popcnt() is not used this reason. The clang and GCC builtin function can |
92 | // use the x86 POPCNT instruction if the target architecture has SSE4a or |
93 | // newer. |
94 | static inline int |
95 | _Py_popcount32(uint32_t x) |
96 | { |
97 | #if (defined(__clang__) || defined(__GNUC__)) |
98 | |
99 | #if SIZEOF_INT >= 4 |
100 | Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned int)); |
101 | return __builtin_popcount(x); |
102 | #else |
103 | // The C standard guarantees that unsigned long will always be big enough |
104 | // to hold a uint32_t value without losing information. |
105 | Py_BUILD_ASSERT(sizeof(x) <= sizeof(unsigned long)); |
106 | return __builtin_popcountl(x); |
107 | #endif |
108 | |
109 | #else |
110 | // 32-bit SWAR (SIMD Within A Register) popcount |
111 | |
112 | // Binary: 0 1 0 1 ... |
113 | const uint32_t M1 = 0x55555555; |
114 | // Binary: 00 11 00 11. .. |
115 | const uint32_t M2 = 0x33333333; |
116 | // Binary: 0000 1111 0000 1111 ... |
117 | const uint32_t M4 = 0x0F0F0F0F; |
118 | // 256**4 + 256**3 + 256**2 + 256**1 |
119 | const uint32_t SUM = 0x01010101; |
120 | |
121 | // Put count of each 2 bits into those 2 bits |
122 | x = x - ((x >> 1) & M1); |
123 | // Put count of each 4 bits into those 4 bits |
124 | x = (x & M2) + ((x >> 2) & M2); |
125 | // Put count of each 8 bits into those 8 bits |
126 | x = (x + (x >> 4)) & M4; |
127 | // Sum of the 4 byte counts |
128 | return (uint32_t)((uint64_t)x * (uint64_t)SUM) >> 24; |
129 | #endif |
130 | } |
131 | |
132 | |
133 | // Return the index of the most significant 1 bit in 'x'. This is the smallest |
134 | // integer k such that x < 2**k. Equivalent to floor(log2(x)) + 1 for x != 0. |
135 | static inline int |
136 | _Py_bit_length(unsigned long x) |
137 | { |
138 | #if (defined(__clang__) || defined(__GNUC__)) |
139 | if (x != 0) { |
140 | // __builtin_clzl() is available since GCC 3.4. |
141 | // Undefined behavior for x == 0. |
142 | return (int)sizeof(unsigned long) * 8 - __builtin_clzl(x); |
143 | } |
144 | else { |
145 | return 0; |
146 | } |
147 | #elif defined(_MSC_VER) |
148 | // _BitScanReverse() is documented to search 32 bits. |
149 | Py_BUILD_ASSERT(sizeof(unsigned long) <= 4); |
150 | unsigned long msb; |
151 | if (_BitScanReverse(&msb, x)) { |
152 | return (int)msb + 1; |
153 | } |
154 | else { |
155 | return 0; |
156 | } |
157 | #else |
158 | const int BIT_LENGTH_TABLE[32] = { |
159 | 0, 1, 2, 2, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, |
160 | 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5 |
161 | }; |
162 | int msb = 0; |
163 | while (x >= 32) { |
164 | msb += 6; |
165 | x >>= 6; |
166 | } |
167 | msb += BIT_LENGTH_TABLE[x]; |
168 | return msb; |
169 | #endif |
170 | } |
171 | |
172 | |
173 | #ifdef __cplusplus |
174 | } |
175 | #endif |
176 | #endif /* !Py_INTERNAL_BITUTILS_H */ |
177 | |