1 | /* |
2 | * Copyright (c) 2009-2015 by llvm/compiler-rt contributors |
3 | * |
4 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
5 | * of this software and associated documentation files (the "Software"), to deal |
6 | * in the Software without restriction, including without limitation the rights |
7 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
8 | * copies of the Software, and to permit persons to whom the Software is |
9 | * furnished to do so, subject to the following conditions: |
10 | * |
11 | * The above copyright notice and this permission notice shall be included in |
12 | * all copies or substantial portions of the Software. |
13 | * |
14 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
15 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
16 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
17 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
18 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
19 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
20 | * THE SOFTWARE. |
21 | * \file builtin_fp16.h |
22 | * \brief Functions for conversion between fp32 and fp16, adopted from compiler-rt. |
23 | */ |
24 | #ifndef COMPILER_RT_BUILTIN_FP16_H_ |
25 | #define COMPILER_RT_BUILTIN_FP16_H_ |
26 | |
27 | #ifdef _MSC_VER |
28 | #pragma warning(disable : 4305 4805) |
29 | #endif |
30 | |
31 | #include <cstdint> |
32 | |
/*!
 * \brief Count the leading zero bits of a 32-bit value.
 *
 * Performs a binary search for the highest set bit by testing progressively
 * narrower upper parts of the value (16, 8, 4, 2 and finally 1 bit).
 *
 * \param x The value to inspect.
 * \return The number of zero bits above the highest set bit of x; 32 when x is 0.
 */
static inline uint32_t __clz(uint32_t x) {
  uint32_t zeros = 32;
  for (uint32_t width = 16; width != 0; width >>= 1) {
    const uint32_t upper = x >> width;
    if (upper != 0) {
      // The highest set bit is in the upper part: drop the low `width` bits
      // and account for them as non-leading positions.
      zeros -= width;
      x = upper;
    }
  }
  // x has been reduced to 0 or 1; a remaining 1 occupies one more position.
  return zeros - x;
}
62 | |
/*!
 * \brief Convert a floating-point value to a format with fewer significand bits.
 *
 * The conversion is done entirely on the bit pattern of the value:
 *  - values whose exponent is representable as a normal number in the
 *    destination are shifted, re-biased and rounded to nearest, ties to even;
 *  - NaN inputs produce a quiet NaN carrying the truncated payload;
 *  - values too large for the destination become infinity;
 *  - values too small become a denormal or zero (rounded the same way).
 * The sign bit of the input is copied to the result.
 *
 * Every intermediate that is later compared or shifted is cast back to
 * SRC_REP_T so that the arithmetic stays modular in the representation type
 * even when SRC_REP_T is narrower than int and would otherwise be promoted to
 * signed int.
 *
 * NOTE(review): the underflow path always inserts the implicit leading bit, so
 * it treats the input as a normal source number. That holds whenever source
 * denormals are shifted out entirely (e.g. float/double to half); confirm
 * before using format pairs whose exponent biases are nearly equal.
 *
 * \tparam SRC_T Type of the value to convert; its size gives the source width.
 * \tparam SRC_REP_T Unsigned integer type used to read the bits of SRC_T
 *         (presumably the same size as SRC_T).
 * \tparam SRC_SIG_BITS Number of stored significand bits in the source format.
 * \tparam DST_T Type of the result; its size gives the destination width.
 * \tparam DST_REP_T Unsigned integer type used to build the bits of DST_T.
 * \tparam DST_SIG_BITS Number of stored significand bits in the destination.
 * \param a The value to convert.
 * \return The converted value, as a DST_T holding the computed bit pattern.
 */
template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS, typename DST_T, typename DST_REP_T,
          int DST_SIG_BITS>
static inline DST_T __truncXfYf2__(SRC_T a) {
  // Various constants whose values follow from the type parameters.
  // Any reasonable optimizer will fold and propagate all of these.
  const int srcBits = sizeof(SRC_T) * 8;
  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
  const int srcInfExp = (1 << srcExpBits) - 1;
  const int srcExpBias = srcInfExp >> 1;

  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
  const SRC_REP_T srcSignificandMask = srcMinNormal - 1;
  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
  const SRC_REP_T srcAbsMask = srcSignMask - 1;
  // Bits that are dropped when narrowing the significand, and their midpoint.
  const SRC_REP_T roundMask = (SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS)) - 1;
  const SRC_REP_T halfway = SRC_REP_T(1) << (SRC_SIG_BITS - DST_SIG_BITS - 1);
  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
  const SRC_REP_T srcNaNCode = srcQNaN - 1;

  const int dstBits = sizeof(DST_T) * 8;
  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
  const int dstInfExp = (1 << dstExpBits) - 1;
  const int dstExpBias = dstInfExp >> 1;

  // Source-biased exponents bounding the destination's normal range:
  // [underflowExponent, overflowExponent).
  const int underflowExponent = srcExpBias + 1 - dstExpBias;
  const int overflowExponent = srcExpBias + dstInfExp - dstExpBias;
  const SRC_REP_T underflow = (SRC_REP_T)underflowExponent << SRC_SIG_BITS;
  const SRC_REP_T overflow = (SRC_REP_T)overflowExponent << SRC_SIG_BITS;

  const DST_REP_T dstQNaN = DST_REP_T(1) << (DST_SIG_BITS - 1);
  const DST_REP_T dstNaNCode = dstQNaN - 1;

  // Break a into a sign and representation of the absolute value
  union SrcExchangeType {
    SRC_T f;
    SRC_REP_T i;
  };
  SrcExchangeType src_rep;
  src_rep.f = a;
  const SRC_REP_T aRep = src_rep.i;
  const SRC_REP_T aAbs = aRep & srcAbsMask;
  const SRC_REP_T sign = aRep & srcSignMask;
  DST_REP_T absResult;

  // The comparison relies on unsigned wrap-around: it is true exactly when
  // underflow <= aAbs < overflow. If sizeof(SRC_REP_T) < sizeof(int) the
  // subtractions are promoted to (signed) int, which would break the
  // wrap-around, so both sides are explicitly cast back to SRC_REP_T.
  if ((SRC_REP_T)(aAbs - underflow) < (SRC_REP_T)(aAbs - overflow)) {
    // The exponent of a is within the range of normal numbers in the
    // destination format. We can convert by simply right-shifting with
    // rounding and adjusting the exponent.
    // The narrowing to DST_REP_T is intentional: together with the re-bias
    // below the arithmetic is modular and the final value fits.
    absResult = (DST_REP_T)(aAbs >> (SRC_SIG_BITS - DST_SIG_BITS));
    absResult -= (DST_REP_T)(srcExpBias - dstExpBias) << DST_SIG_BITS;

    const SRC_REP_T roundBits = aAbs & roundMask;
    if (roundBits > halfway) {
      // Round to nearest. A carry out of the significand correctly bumps
      // the exponent (possibly up to infinity).
      absResult++;
    } else if (roundBits == halfway) {
      // Ties to even
      absResult += absResult & 1;
    }
  } else if (aAbs > srcInfinity) {
    // a is NaN.
    // Conjure the result by beginning with infinity, setting the qNaN
    // bit and inserting the (truncated) trailing NaN field.
    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
    absResult |= dstQNaN;
    absResult |= (DST_REP_T)(((aAbs & srcNaNCode) >> (SRC_SIG_BITS - DST_SIG_BITS)) & dstNaNCode);
  } else if (aAbs >= overflow) {
    // a overflows to infinity.
    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
  } else {
    // a underflows on conversion to the destination type or is an exact
    // zero. The result may be a denormal or zero. Extract the exponent
    // to get the shift amount for the denormalization.
    const int aExp = (int)(aAbs >> SRC_SIG_BITS);
    const int shift = srcExpBias - dstExpBias - aExp + 1;

    const SRC_REP_T significand = (SRC_REP_T)((aRep & srcSignificandMask) | srcMinNormal);

    // Right shift by the denormalization amount with sticky.
    if (shift > SRC_SIG_BITS) {
      // Every significand bit is shifted out: the result is zero.
      absResult = 0;
    } else {
      // sticky is 1 when any bit is lost by the right shift below. The left
      // shift is truncated to SRC_REP_T so that bits pushed past the top of
      // the representation do not count (they would survive in a promoted
      // int when SRC_REP_T is narrower than int).
      const SRC_REP_T sticky = (SRC_REP_T)(significand << (srcBits - shift)) != 0 ? 1 : 0;
      const SRC_REP_T denormalizedSignificand = (SRC_REP_T)((significand >> shift) | sticky);
      absResult = (DST_REP_T)(denormalizedSignificand >> (SRC_SIG_BITS - DST_SIG_BITS));
      const SRC_REP_T roundBits = denormalizedSignificand & roundMask;
      if (roundBits > halfway) {
        // Round to nearest
        absResult++;
      } else if (roundBits == halfway) {
        // Ties to even
        absResult += absResult & 1;
      }
    }
  }

  // Apply the signbit to (DST_T)abs(a): move it from the top bit of the
  // source representation to the top bit of the destination.
  const DST_REP_T result = (DST_REP_T)(absResult | (sign >> (srcBits - dstBits)));
  union DstExchangeType {
    DST_T f;
    DST_REP_T i;
  };
  DstExchangeType dst_rep;
  dst_rep.i = result;
  return dst_rep.f;
}
166 | |
/*!
 * \brief Convert a floating-point value to a format with more significand bits.
 *
 * The conversion is done entirely on the bit pattern of the value:
 *  - normal numbers have their exponent/significand shifted into place and the
 *    exponent re-biased;
 *  - infinity and NaN map to the destination infinity exponent, with the quiet
 *    bit and payload of a NaN moved to the top of the wider significand;
 *  - denormal inputs are renormalized into a normal destination number;
 *  - zero stays zero.
 * The sign bit of the input is copied to the result.
 *
 * The renormalization shift for denormals is computed directly in SRC_REP_T,
 * so it does not depend on a 32-bit leading-zero count and is not truncated
 * when SRC_REP_T is wider than 32 bits.
 *
 * \tparam SRC_T Type of the value to convert; its size gives the source width.
 * \tparam SRC_REP_T Unsigned integer type used to read the bits of SRC_T
 *         (presumably the same size as SRC_T).
 * \tparam SRC_SIG_BITS Number of stored significand bits in the source format.
 * \tparam DST_T Type of the result; its size gives the destination width.
 * \tparam DST_REP_T Unsigned integer type used to build the bits of DST_T.
 * \tparam DST_SIG_BITS Number of stored significand bits in the destination.
 * \param a The value to convert.
 * \return The converted value, as a DST_T holding the computed bit pattern.
 */
template <typename SRC_T, typename SRC_REP_T, int SRC_SIG_BITS, typename DST_T, typename DST_REP_T,
          int DST_SIG_BITS>
static inline DST_T __extendXfYf2__(SRC_T a) {
  // Various constants whose values follow from the type parameters.
  // Any reasonable optimizer will fold and propagate all of these.
  const int srcBits = sizeof(SRC_T) * 8;
  const int srcExpBits = srcBits - SRC_SIG_BITS - 1;
  const int srcInfExp = (1 << srcExpBits) - 1;
  const int srcExpBias = srcInfExp >> 1;

  const SRC_REP_T srcMinNormal = SRC_REP_T(1) << SRC_SIG_BITS;
  const SRC_REP_T srcInfinity = (SRC_REP_T)srcInfExp << SRC_SIG_BITS;
  const SRC_REP_T srcSignMask = SRC_REP_T(1) << (SRC_SIG_BITS + srcExpBits);
  const SRC_REP_T srcAbsMask = srcSignMask - 1;
  const SRC_REP_T srcQNaN = SRC_REP_T(1) << (SRC_SIG_BITS - 1);
  const SRC_REP_T srcNaNCode = srcQNaN - 1;

  const int dstBits = sizeof(DST_T) * 8;
  const int dstExpBits = dstBits - DST_SIG_BITS - 1;
  const int dstInfExp = (1 << dstExpBits) - 1;
  const int dstExpBias = dstInfExp >> 1;

  const DST_REP_T dstMinNormal = DST_REP_T(1) << DST_SIG_BITS;

  // Break a into a sign and representation of the absolute value
  union SrcExchangeType {
    SRC_T f;
    SRC_REP_T i;
  };
  SrcExchangeType src_rep;
  src_rep.f = a;
  const SRC_REP_T aRep = src_rep.i;
  const SRC_REP_T aAbs = aRep & srcAbsMask;
  const SRC_REP_T sign = aRep & srcSignMask;
  DST_REP_T absResult;

  // The comparison relies on unsigned wrap-around: it is true exactly when
  // srcMinNormal <= aAbs < srcInfinity.
  // If sizeof(SRC_REP_T) < sizeof(int), the subtraction result is promoted
  // to (signed) int. To avoid that, explicitly cast to SRC_REP_T.
  if ((SRC_REP_T)(aAbs - srcMinNormal) < (SRC_REP_T)(srcInfinity - srcMinNormal)) {
    // a is a normal number.
    // Extend to the destination type by shifting the significand and
    // exponent into the proper position and rebiasing the exponent.
    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS);
    absResult += (DST_REP_T)(dstExpBias - srcExpBias) << DST_SIG_BITS;
  } else if (aAbs >= srcInfinity) {
    // a is NaN or infinity.
    // Conjure the result by beginning with infinity, then setting the qNaN
    // bit (if needed) and right-aligning the rest of the trailing NaN
    // payload field.
    absResult = (DST_REP_T)dstInfExp << DST_SIG_BITS;
    absResult |= (DST_REP_T)(aAbs & srcQNaN) << (DST_SIG_BITS - SRC_SIG_BITS);
    absResult |= (DST_REP_T)(aAbs & srcNaNCode) << (DST_SIG_BITS - SRC_SIG_BITS);
  } else if (aAbs) {
    // a is denormal.
    // renormalize the significand and clear the leading bit, then insert
    // the correct adjusted exponent in the destination type.
    //
    // scale is the left shift that moves the highest set bit of aAbs into the
    // implicit-bit position (srcMinNormal). aAbs is non-zero and below
    // srcMinNormal here, so the loop runs between 1 and SRC_SIG_BITS times.
    int scale = 0;
    for (SRC_REP_T lead = aAbs; lead < srcMinNormal; lead = (SRC_REP_T)(lead << 1)) {
      ++scale;
    }
    absResult = (DST_REP_T)aAbs << (DST_SIG_BITS - SRC_SIG_BITS + scale);
    // The leading bit now sits at dstMinNormal; it is implicit in a normal
    // number, so clear it.
    absResult ^= dstMinNormal;
    const int resultExponent = dstExpBias - srcExpBias - scale + 1;
    absResult |= (DST_REP_T)resultExponent << DST_SIG_BITS;
  } else {
    // a is zero.
    absResult = 0;
  }

  // Apply the signbit to (DST_T)abs(a): move it from the top bit of the
  // source representation to the top bit of the destination.
  const DST_REP_T result = absResult | (DST_REP_T)sign << (dstBits - srcBits);
  union DstExchangeType {
    DST_T f;
    DST_REP_T i;
  };
  DstExchangeType dst_rep;
  dst_rep.i = result;
  return dst_rep.f;
}
245 | |
246 | #endif // COMPILER_RT_BUILTIN_FP16_H_ |
247 | |