1 | #pragma once |
2 | #ifndef FP16_FP16_H |
3 | #define FP16_FP16_H |
4 | |
5 | #if defined(__cplusplus) && (__cplusplus >= 201103L) |
6 | #include <cstdint> |
7 | #include <cmath> |
8 | #elif !defined(__OPENCL_VERSION__) |
9 | #include <stdint.h> |
10 | #include <math.h> |
11 | #endif |
12 | |
13 | #ifdef _MSC_VER |
14 | #include <intrin.h> |
15 | #endif |
16 | |
17 | #include <fp16/bitcasts.h> |
18 | |
19 | |
20 | /* |
21 | * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to |
22 | * a 32-bit floating-point number in IEEE single-precision format, in bit representation. |
23 | * |
24 | * @note The implementation doesn't use any floating-point operations. |
25 | */ |
26 | static inline uint32_t fp16_ieee_to_fp32_bits(uint16_t h) { |
27 | /* |
28 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: |
29 | * +---+-----+------------+-------------------+ |
30 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| |
31 | * +---+-----+------------+-------------------+ |
32 | * Bits 31 26-30 16-25 0-15 |
33 | * |
34 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. |
35 | */ |
36 | const uint32_t w = (uint32_t) h << 16; |
37 | /* |
38 | * Extract the sign of the input number into the high bit of the 32-bit word: |
39 | * |
40 | * +---+----------------------------------+ |
41 | * | S |0000000 00000000 00000000 00000000| |
42 | * +---+----------------------------------+ |
43 | * Bits 31 0-31 |
44 | */ |
45 | const uint32_t sign = w & UINT32_C(0x80000000); |
46 | /* |
47 | * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: |
48 | * |
49 | * +---+-----+------------+-------------------+ |
50 | * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| |
51 | * +---+-----+------------+-------------------+ |
52 | * Bits 30 27-31 17-26 0-16 |
53 | */ |
54 | const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); |
55 | /* |
56 | * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. |
57 | * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. |
58 | * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift |
59 | * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the |
60 | * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). |
61 | */ |
62 | #ifdef _MSC_VER |
63 | unsigned long nonsign_bsr; |
64 | _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); |
65 | uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; |
66 | #else |
67 | uint32_t renorm_shift = __builtin_clz(nonsign); |
68 | #endif |
69 | renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; |
70 | /* |
71 | * Iff half-precision number has exponent of 15, the addition overflows it into bit 31, |
72 | * and the subsequent shift turns the high 9 bits into 1. Thus |
73 | * inf_nan_mask == |
74 | * 0x7F800000 if the half-precision number had exponent of 15 (i.e. was NaN or infinity) |
75 | * 0x00000000 otherwise |
76 | */ |
77 | const int32_t inf_nan_mask = ((int32_t) (nonsign + 0x04000000) >> 8) & INT32_C(0x7F800000); |
78 | /* |
79 | * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. |
80 | * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus |
81 | * zero_mask == |
82 | * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) |
83 | * 0x00000000 otherwise |
84 | */ |
85 | const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; |
86 | /* |
87 | * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) |
88 | * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa |
89 | * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. |
90 | * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias |
91 | * (0x7F for single-precision number less 0xF for half-precision number). |
92 | * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift |
93 | * is less than 0x70, this can be combined with step 3. |
94 | * 5. Binary OR with inf_nan_mask to turn the exponent into 0xFF if the input was NaN or infinity. |
95 | * 6. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. |
96 | * 7. Combine with the sign of the input number. |
97 | */ |
98 | return sign | ((((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) | inf_nan_mask) & ~zero_mask); |
99 | } |
100 | |
101 | /* |
102 | * Convert a 16-bit floating-point number in IEEE half-precision format, in bit representation, to |
103 | * a 32-bit floating-point number in IEEE single-precision format. |
104 | * |
105 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) |
106 | * floating-point operations and bitcasts between integer and floating-point variables. |
107 | */ |
108 | static inline float fp16_ieee_to_fp32_value(uint16_t h) { |
109 | /* |
110 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: |
111 | * +---+-----+------------+-------------------+ |
112 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| |
113 | * +---+-----+------------+-------------------+ |
114 | * Bits 31 26-30 16-25 0-15 |
115 | * |
116 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. |
117 | */ |
118 | const uint32_t w = (uint32_t) h << 16; |
119 | /* |
120 | * Extract the sign of the input number into the high bit of the 32-bit word: |
121 | * |
122 | * +---+----------------------------------+ |
123 | * | S |0000000 00000000 00000000 00000000| |
124 | * +---+----------------------------------+ |
125 | * Bits 31 0-31 |
126 | */ |
127 | const uint32_t sign = w & UINT32_C(0x80000000); |
128 | /* |
129 | * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: |
130 | * |
131 | * +-----+------------+---------------------+ |
132 | * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| |
133 | * +-----+------------+---------------------+ |
134 | * Bits 27-31 17-26 0-16 |
135 | */ |
136 | const uint32_t two_w = w + w; |
137 | |
138 | /* |
139 | * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent |
140 | * of a single-precision floating-point number: |
141 | * |
142 | * S|Exponent | Mantissa |
143 | * +-+---+-----+------------+----------------+ |
144 | * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| |
145 | * +-+---+-----+------------+----------------+ |
146 | * Bits | 23-31 | 0-22 |
147 | * |
148 | * Next, there are some adjustments to the exponent: |
149 | * - The exponent needs to be corrected by the difference in exponent bias between single-precision and half-precision |
150 | * formats (0x7F - 0xF = 0x70) |
151 | * - Inf and NaN values in the inputs should become Inf and NaN values after conversion to the single-precision number. |
152 | * Therefore, if the biased exponent of the half-precision input was 0x1F (max possible value), the biased exponent |
153 | * of the single-precision output must be 0xFF (max possible value). We do this correction in two steps: |
154 | * - First, we adjust the exponent by (0xFF - 0x1F) = 0xE0 (see exp_offset below) rather than by 0x70 suggested |
155 | * by the difference in the exponent bias (see above). |
156 | * - Then we multiply the single-precision result of exponent adjustment by 2**(-112) to reverse the effect of |
157 | * exponent adjustment by 0xE0 less the necessary exponent adjustment by 0x70 due to difference in exponent bias. |
158 | * The floating-point multiplication hardware would ensure than Inf and NaN would retain their value on at least |
159 | * partially IEEE754-compliant implementations. |
160 | * |
161 | * Note that the above operations do not handle denormal inputs (where biased exponent == 0). However, they also do not |
162 | * operate on denormal inputs, and do not produce denormal results. |
163 | */ |
164 | const uint32_t exp_offset = UINT32_C(0xE0) << 23; |
165 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) |
166 | const float exp_scale = 0x1.0p-112f; |
167 | #else |
168 | const float exp_scale = fp32_from_bits(UINT32_C(0x7800000)); |
169 | #endif |
170 | const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset) * exp_scale; |
171 | |
172 | /* |
173 | * Convert denormalized half-precision inputs into single-precision results (always normalized). |
174 | * Zero inputs are also handled here. |
175 | * |
176 | * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. |
177 | * First, we shift mantissa into bits 0-9 of the 32-bit word. |
178 | * |
179 | * zeros | mantissa |
180 | * +---------------------------+------------+ |
181 | * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| |
182 | * +---------------------------+------------+ |
183 | * Bits 10-31 0-9 |
184 | * |
185 | * Now, remember that denormalized half-precision numbers are represented as: |
186 | * FP16 = mantissa * 2**(-24). |
187 | * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input |
188 | * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). |
189 | * A normalized single-precision floating-point number is represented as: |
190 | * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) |
191 | * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision |
192 | * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. |
193 | * |
194 | * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number |
195 | * is zero, the constructed single-precision number has the value of |
196 | * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 |
197 | * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of |
198 | * the input half-precision number. |
199 | */ |
200 | const uint32_t magic_mask = UINT32_C(126) << 23; |
201 | const float magic_bias = 0.5f; |
202 | const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; |
203 | |
204 | /* |
205 | * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the |
206 | * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the |
207 | * input is either a denormal number, or zero. |
208 | * - Combine the result of conversion of exponent and mantissa with the sign of the input number. |
209 | */ |
210 | const uint32_t denormalized_cutoff = UINT32_C(1) << 27; |
211 | const uint32_t result = sign | |
212 | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); |
213 | return fp32_from_bits(result); |
214 | } |
215 | |
216 | /* |
217 | * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in |
218 | * IEEE half-precision format, in bit representation. |
219 | * |
220 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) |
221 | * floating-point operations and bitcasts between integer and floating-point variables. |
222 | */ |
223 | static inline uint16_t fp16_ieee_from_fp32_value(float f) { |
224 | #if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) || defined(__GNUC__) && !defined(__STRICT_ANSI__) |
225 | const float scale_to_inf = 0x1.0p+112f; |
226 | const float scale_to_zero = 0x1.0p-110f; |
227 | #else |
228 | const float scale_to_inf = fp32_from_bits(UINT32_C(0x77800000)); |
229 | const float scale_to_zero = fp32_from_bits(UINT32_C(0x08800000)); |
230 | #endif |
231 | float base = (fabsf(f) * scale_to_inf) * scale_to_zero; |
232 | |
233 | const uint32_t w = fp32_to_bits(f); |
234 | const uint32_t shl1_w = w + w; |
235 | const uint32_t sign = w & UINT32_C(0x80000000); |
236 | uint32_t bias = shl1_w & UINT32_C(0xFF000000); |
237 | if (bias < UINT32_C(0x71000000)) { |
238 | bias = UINT32_C(0x71000000); |
239 | } |
240 | |
241 | base = fp32_from_bits((bias >> 1) + UINT32_C(0x07800000)) + base; |
242 | const uint32_t bits = fp32_to_bits(base); |
243 | const uint32_t exp_bits = (bits >> 13) & UINT32_C(0x00007C00); |
244 | const uint32_t mantissa_bits = bits & UINT32_C(0x00000FFF); |
245 | const uint32_t nonsign = exp_bits + mantissa_bits; |
246 | return (sign >> 16) | (shl1_w > UINT32_C(0xFF000000) ? UINT16_C(0x7E00) : nonsign); |
247 | } |
248 | |
249 | /* |
250 | * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to |
251 | * a 32-bit floating-point number in IEEE single-precision format, in bit representation. |
252 | * |
253 | * @note The implementation doesn't use any floating-point operations. |
254 | */ |
255 | static inline uint32_t fp16_alt_to_fp32_bits(uint16_t h) { |
256 | /* |
257 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: |
258 | * +---+-----+------------+-------------------+ |
259 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| |
260 | * +---+-----+------------+-------------------+ |
261 | * Bits 31 26-30 16-25 0-15 |
262 | * |
263 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. |
264 | */ |
265 | const uint32_t w = (uint32_t) h << 16; |
266 | /* |
267 | * Extract the sign of the input number into the high bit of the 32-bit word: |
268 | * |
269 | * +---+----------------------------------+ |
270 | * | S |0000000 00000000 00000000 00000000| |
271 | * +---+----------------------------------+ |
272 | * Bits 31 0-31 |
273 | */ |
274 | const uint32_t sign = w & UINT32_C(0x80000000); |
275 | /* |
276 | * Extract mantissa and biased exponent of the input number into the bits 0-30 of the 32-bit word: |
277 | * |
278 | * +---+-----+------------+-------------------+ |
279 | * | 0 |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| |
280 | * +---+-----+------------+-------------------+ |
281 | * Bits 30 27-31 17-26 0-16 |
282 | */ |
283 | const uint32_t nonsign = w & UINT32_C(0x7FFFFFFF); |
284 | /* |
285 | * Renorm shift is the number of bits to shift mantissa left to make the half-precision number normalized. |
286 | * If the initial number is normalized, some of its high 6 bits (sign == 0 and 5-bit exponent) equals one. |
287 | * In this case renorm_shift == 0. If the number is denormalize, renorm_shift > 0. Note that if we shift |
288 | * denormalized nonsign by renorm_shift, the unit bit of mantissa will shift into exponent, turning the |
289 | * biased exponent into 1, and making mantissa normalized (i.e. without leading 1). |
290 | */ |
291 | #ifdef _MSC_VER |
292 | unsigned long nonsign_bsr; |
293 | _BitScanReverse(&nonsign_bsr, (unsigned long) nonsign); |
294 | uint32_t renorm_shift = (uint32_t) nonsign_bsr ^ 31; |
295 | #else |
296 | uint32_t renorm_shift = __builtin_clz(nonsign); |
297 | #endif |
298 | renorm_shift = renorm_shift > 5 ? renorm_shift - 5 : 0; |
299 | /* |
300 | * Iff nonsign is 0, it overflows into 0xFFFFFFFF, turning bit 31 into 1. Otherwise, bit 31 remains 0. |
301 | * The signed shift right by 31 broadcasts bit 31 into all bits of the zero_mask. Thus |
302 | * zero_mask == |
303 | * 0xFFFFFFFF if the half-precision number was zero (+0.0h or -0.0h) |
304 | * 0x00000000 otherwise |
305 | */ |
306 | const int32_t zero_mask = (int32_t) (nonsign - 1) >> 31; |
307 | /* |
308 | * 1. Shift nonsign left by renorm_shift to normalize it (if the input was denormal) |
309 | * 2. Shift nonsign right by 3 so the exponent (5 bits originally) becomes an 8-bit field and 10-bit mantissa |
310 | * shifts into the 10 high bits of the 23-bit mantissa of IEEE single-precision number. |
311 | * 3. Add 0x70 to the exponent (starting at bit 23) to compensate the different in exponent bias |
312 | * (0x7F for single-precision number less 0xF for half-precision number). |
313 | * 4. Subtract renorm_shift from the exponent (starting at bit 23) to account for renormalization. As renorm_shift |
314 | * is less than 0x70, this can be combined with step 3. |
315 | * 5. Binary ANDNOT with zero_mask to turn the mantissa and exponent into zero if the input was zero. |
316 | * 6. Combine with the sign of the input number. |
317 | */ |
318 | return sign | (((nonsign << renorm_shift >> 3) + ((0x70 - renorm_shift) << 23)) & ~zero_mask); |
319 | } |
320 | |
321 | /* |
322 | * Convert a 16-bit floating-point number in ARM alternative half-precision format, in bit representation, to |
323 | * a 32-bit floating-point number in IEEE single-precision format. |
324 | * |
325 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) |
326 | * floating-point operations and bitcasts between integer and floating-point variables. |
327 | */ |
328 | static inline float fp16_alt_to_fp32_value(uint16_t h) { |
329 | /* |
330 | * Extend the half-precision floating-point number to 32 bits and shift to the upper part of the 32-bit word: |
331 | * +---+-----+------------+-------------------+ |
332 | * | S |EEEEE|MM MMMM MMMM|0000 0000 0000 0000| |
333 | * +---+-----+------------+-------------------+ |
334 | * Bits 31 26-30 16-25 0-15 |
335 | * |
336 | * S - sign bit, E - bits of the biased exponent, M - bits of the mantissa, 0 - zero bits. |
337 | */ |
338 | const uint32_t w = (uint32_t) h << 16; |
339 | /* |
340 | * Extract the sign of the input number into the high bit of the 32-bit word: |
341 | * |
342 | * +---+----------------------------------+ |
343 | * | S |0000000 00000000 00000000 00000000| |
344 | * +---+----------------------------------+ |
345 | * Bits 31 0-31 |
346 | */ |
347 | const uint32_t sign = w & UINT32_C(0x80000000); |
348 | /* |
349 | * Extract mantissa and biased exponent of the input number into the high bits of the 32-bit word: |
350 | * |
351 | * +-----+------------+---------------------+ |
352 | * |EEEEE|MM MMMM MMMM|0 0000 0000 0000 0000| |
353 | * +-----+------------+---------------------+ |
354 | * Bits 27-31 17-26 0-16 |
355 | */ |
356 | const uint32_t two_w = w + w; |
357 | |
358 | /* |
359 | * Shift mantissa and exponent into bits 23-28 and bits 13-22 so they become mantissa and exponent |
360 | * of a single-precision floating-point number: |
361 | * |
362 | * S|Exponent | Mantissa |
363 | * +-+---+-----+------------+----------------+ |
364 | * |0|000|EEEEE|MM MMMM MMMM|0 0000 0000 0000| |
365 | * +-+---+-----+------------+----------------+ |
366 | * Bits | 23-31 | 0-22 |
367 | * |
368 | * Next, the exponent is adjusted for the difference in exponent bias between single-precision and half-precision |
369 | * formats (0x7F - 0xF = 0x70). This operation never overflows or generates non-finite values, as the largest |
370 | * half-precision exponent is 0x1F and after the adjustment is can not exceed 0x8F < 0xFE (largest single-precision |
371 | * exponent for non-finite values). |
372 | * |
373 | * Note that this operation does not handle denormal inputs (where biased exponent == 0). However, they also do not |
374 | * operate on denormal inputs, and do not produce denormal results. |
375 | */ |
376 | const float exp_offset = UINT32_C(0x70) << 23; |
377 | const float normalized_value = fp32_from_bits((two_w >> 4) + exp_offset); |
378 | |
379 | /* |
380 | * Convert denormalized half-precision inputs into single-precision results (always normalized). |
381 | * Zero inputs are also handled here. |
382 | * |
383 | * In a denormalized number the biased exponent is zero, and mantissa has on-zero bits. |
384 | * First, we shift mantissa into bits 0-9 of the 32-bit word. |
385 | * |
386 | * zeros | mantissa |
387 | * +---------------------------+------------+ |
388 | * |0000 0000 0000 0000 0000 00|MM MMMM MMMM| |
389 | * +---------------------------+------------+ |
390 | * Bits 10-31 0-9 |
391 | * |
392 | * Now, remember that denormalized half-precision numbers are represented as: |
393 | * FP16 = mantissa * 2**(-24). |
394 | * The trick is to construct a normalized single-precision number with the same mantissa and thehalf-precision input |
395 | * and with an exponent which would scale the corresponding mantissa bits to 2**(-24). |
396 | * A normalized single-precision floating-point number is represented as: |
397 | * FP32 = (1 + mantissa * 2**(-23)) * 2**(exponent - 127) |
398 | * Therefore, when the biased exponent is 126, a unit change in the mantissa of the input denormalized half-precision |
399 | * number causes a change of the constructud single-precision number by 2**(-24), i.e. the same ammount. |
400 | * |
401 | * The last step is to adjust the bias of the constructed single-precision number. When the input half-precision number |
402 | * is zero, the constructed single-precision number has the value of |
403 | * FP32 = 1 * 2**(126 - 127) = 2**(-1) = 0.5 |
404 | * Therefore, we need to subtract 0.5 from the constructed single-precision number to get the numerical equivalent of |
405 | * the input half-precision number. |
406 | */ |
407 | const uint32_t magic_mask = UINT32_C(126) << 23; |
408 | const float magic_bias = 0.5f; |
409 | const float denormalized_value = fp32_from_bits((two_w >> 17) | magic_mask) - magic_bias; |
410 | |
411 | /* |
412 | * - Choose either results of conversion of input as a normalized number, or as a denormalized number, depending on the |
413 | * input exponent. The variable two_w contains input exponent in bits 27-31, therefore if its smaller than 2**27, the |
414 | * input is either a denormal number, or zero. |
415 | * - Combine the result of conversion of exponent and mantissa with the sign of the input number. |
416 | */ |
417 | const uint32_t denormalized_cutoff = UINT32_C(1) << 27; |
418 | const uint32_t result = sign | |
419 | (two_w < denormalized_cutoff ? fp32_to_bits(denormalized_value) : fp32_to_bits(normalized_value)); |
420 | return fp32_from_bits(result); |
421 | } |
422 | |
423 | /* |
424 | * Convert a 32-bit floating-point number in IEEE single-precision format to a 16-bit floating-point number in |
425 | * ARM alternative half-precision format, in bit representation. |
426 | * |
427 | * @note The implementation relies on IEEE-like (no assumption about rounding mode and no operations on denormals) |
428 | * floating-point operations and bitcasts between integer and floating-point variables. |
429 | */ |
430 | static inline uint16_t fp16_alt_from_fp32_value(float f) { |
431 | const uint32_t w = fp32_to_bits(f); |
432 | const uint32_t sign = w & UINT32_C(0x80000000); |
433 | const uint32_t shl1_w = w + w; |
434 | |
435 | const uint32_t shl1_max_fp16_fp32 = UINT32_C(0x8FFFC000); |
436 | const uint32_t shl1_base = shl1_w > shl1_max_fp16_fp32 ? shl1_max_fp16_fp32 : shl1_w; |
437 | uint32_t shl1_bias = shl1_base & UINT32_C(0xFF000000); |
438 | const uint32_t exp_difference = 23 - 10; |
439 | const uint32_t shl1_bias_min = (127 - 1 - exp_difference) << 24; |
440 | if (shl1_bias < shl1_bias_min) { |
441 | shl1_bias = shl1_bias_min; |
442 | } |
443 | |
444 | const float bias = fp32_from_bits((shl1_bias >> 1) + ((exp_difference + 2) << 23)); |
445 | const float base = fp32_from_bits((shl1_base >> 1) + (2 << 23)) + bias; |
446 | |
447 | const uint32_t exp_f = fp32_to_bits(base) >> 13; |
448 | return (sign >> 16) | ((exp_f & UINT32_C(0x00007C00)) + (fp32_to_bits(base) & UINT32_C(0x00000FFF))); |
449 | } |
450 | |
451 | #endif /* FP16_FP16_H */ |
452 | |