1 | /** |
2 | * Copyright (c) Glow Contributors. See CONTRIBUTORS file. |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | */ |
16 | #ifndef GLOW_LLVMIRCODEGEN_LIBJIT_LIBJIT_DEFS_H |
17 | #define GLOW_LLVMIRCODEGEN_LIBJIT_LIBJIT_DEFS_H |
18 | |
19 | #include <assert.h> |
20 | #include <cmath> |
21 | #include <cstdlib> |
22 | #include <math.h> |
23 | #include <stdint.h> |
24 | #include <string.h> |
25 | |
26 | #include "libjit_dim_t.h" |
27 | |
/// Marks libjit helpers as internal-linkage and forces inlining so they fold
/// directly into the generated kernels with no call overhead.
#define LIBJIT_ALWAYS_INLINE static inline __attribute__((always_inline))

/// MSVC has no ssize_t; alias it to the equivalent signed size type.
#if defined(_MSC_VER)
#include <BaseTsd.h>
typedef SSIZE_T ssize_t;
#endif

/// SIMD vector types holding 4 and 8 packed floats (16/32 bytes). Clang and
/// GCC spell the vector extension differently but yield equivalent types.
#if defined(__clang__)
using float4 = float __attribute__((ext_vector_type(4)));
using float8 = float __attribute__((ext_vector_type(8)));
#elif defined(__GNUC__) || defined(__GNUG__)
using float4 = float __attribute__((vector_size(16)));
using float8 = float __attribute__((vector_size(32)));
#endif

/// Loads a simd float8 value from \p ptr.
/// NOTE: this dereferences \p ptr as a float8 *, i.e. a vector-aligned
/// access; use LoaduFloat8 (below) when the pointer may be unaligned.
#define LoadFloat8(PTR) *((const float8 *)(PTR))

/// Stores the simd float8 value to \p ptr (vector-aligned access; see
/// StoreuFloat8 below for the unaligned variant).
#define StoreFloat8(PTR, VAL) *((float8 *)(PTR)) = (VAL);

/// Accumulate (+=) the simd float8 value to \p ptr (vector-aligned access;
/// see AdduFloat8 below for the unaligned variant).
#define AddFloat8(PTR, VAL) *((float8 *)(PTR)) += (VAL);

/// Broadcast the input value to a float8 (same scalar in all eight lanes).
#if defined(__clang__)
#define BroadcastFloat8(VAL) ((float8)(VAL))
#elif defined(__GNUC__) || defined(__GNUG__)
/// GCC has no scalar-to-vector cast; a scalar combined with a zero vector
/// via a binary operator is broadcast across the lanes instead.
#define BroadcastFloat8(VAL) ((VAL) - (float8){0})
#endif

/// NOTE: both macros evaluate their arguments more than once; do not pass
/// expressions with side effects.
#define MIN(a, b) (((a) < (b)) ? (a) : (b))
#define MAX(a, b) (((a) > (b)) ? (a) : (b))
/// Indexes \p tensor at the position selected by \p indices (length
/// \p numIndices) within the shape \p dims (length \p numDims).
/// Relies on get_element_ptr, which is declared elsewhere in libjit.
#define AT(tensor, dims, numDims, indices, numIndices)                         \
  tensor[get_element_ptr(tensor, dims, numDims, indices, numIndices)]
63 | |
64 | /// Perform an unaligned load of a float8 from a float pointer. |
65 | inline float8 LoaduFloat8(const float *p) { |
66 | float8 res; |
67 | memcpy(&res, p, sizeof(float8)); |
68 | return res; |
69 | } |
70 | |
71 | /// Perform an unaligned store to a float pointer. |
72 | inline void StoreuFloat8(float *p, float8 v) { memcpy(p, &v, sizeof(float8)); } |
73 | |
74 | /// Perform an unaligned addition to a float pointer. |
75 | inline void AdduFloat8(float *p, float8 v) { |
76 | StoreuFloat8(p, LoaduFloat8(p) + v); |
77 | } |
78 | |
79 | /// \returns the index of the element at x,y,z,w,q,r. |
80 | inline dim_t libjit_getXYZWQR(const dim_t *dims, dim_t x, dim_t y, dim_t z, |
81 | dim_t w, dim_t q, dim_t r) { |
82 | return (x * dims[1] * dims[2] * dims[3] * dims[4] * dims[5]) + |
83 | (y * dims[2] * dims[3] * dims[4] * dims[5]) + |
84 | (z * dims[3] * dims[4] * dims[5]) + (w * dims[4] * dims[5]) + |
85 | (q * dims[5]) + r; |
86 | } |
87 | |
88 | /// \returns the index of the element at x,y,z,w,q. |
89 | inline dim_t libjit_getXYZWQ(const dim_t *dims, dim_t x, dim_t y, dim_t z, |
90 | dim_t w, dim_t q) { |
91 | return (x * dims[1] * dims[2] * dims[3] * dims[4]) + |
92 | (y * dims[2] * dims[3] * dims[4]) + (z * dims[3] * dims[4]) + |
93 | (w * dims[4]) + q; |
94 | } |
95 | |
96 | /// \returns the index of the element at x,y,z,w. |
97 | inline dim_t libjit_getXYZW(const dim_t *dims, dim_t x, dim_t y, dim_t z, |
98 | dim_t w) { |
99 | return (x * dims[1] * dims[2] * dims[3]) + (y * dims[2] * dims[3]) + |
100 | (z * dims[3]) + w; |
101 | } |
102 | |
103 | /// \returns the index of the element at x,y,z. |
104 | inline dim_t libjit_getXYZ(const dim_t *dims, dim_t x, dim_t y, dim_t z) { |
105 | return (x * dims[1] * dims[2]) + (y * dims[2]) + z; |
106 | } |
107 | |
108 | /// \returns the index of the element at x,y. |
109 | inline dim_t libjit_getXY(const dim_t *dims, dim_t x, dim_t y) { |
110 | return (x * dims[1]) + y; |
111 | } |
112 | |
113 | /// Computes the function Sigmoid(x) for float \p input. |
114 | /// When the LIBJIT compile option "-ffast-math" is enabled the intermediate |
115 | /// computation expf(x) for Sigmoid operator is not handled properly for very |
116 | /// large positive values which results in NaN values for the Sigmoid output. |
117 | /// Therefore when the "-ffast-math" is enabled we compute the Sigmoid such that |
118 | /// we avoid computing large values for the "expf" function. |
119 | LIBJIT_ALWAYS_INLINE |
float libjit_sigmoid_f(float input) {
#ifdef FFAST_MATH
  // Evaluate expf only on the non-positive argument -|input| so a fast-math
  // expf never sees huge positive values. The positive-half sigmoid is then
  // reflected for negative inputs: signbit contributes the leading 1 and
  // copysign flips the sign of the term, yielding 1 - sigmoid(|x|).
  float posHalf = 1 / (1 + expf(-std::abs(input)));
  return (float)(std::signbit(input)) + std::copysignf(posHalf, input);
#else
  // sigmoid(x) = 1 / (1 + e^(-x)).
  float denom = expf(-input) + 1;
  return 1 / denom;
#endif // FFAST_MATH
}
129 | |
130 | /// Computes the function Tanh(x) for float \p input. |
131 | /// When the LIBJIT compile option "-ffast-math" is enabled the intermediate |
132 | /// computation expf(x) for Tanh operator is not handled properly for very |
133 | /// large positive values which results in NaN values for the Tanh output. |
134 | /// Therefore when the "-ffast-math" is enabled we compute the Tanh such that |
135 | /// we avoid computing large values for the "expf" function. |
136 | LIBJIT_ALWAYS_INLINE |
float libjit_tanh_f(float input) {
#ifdef FFAST_MATH
  // Feed expf the non-positive argument -2*|input| so a fast-math expf
  // cannot blow up; tanh is odd, so the sign is restored with copysign.
  float posHalf = -1 + 2 / (expf(-2 * std::abs(input)) + 1);
  return std::copysignf(posHalf, input);
#else
  // tanh(x) = 1 - 2 / (e^(2x) + 1).
  float denom = expf(input * 2) + 1;
  return 1 - 2 / denom;
#endif // FFAST_MATH
}
145 | |
146 | /// \returns the clipped value of the input to INT8 range [-128, 127]. |
147 | LIBJIT_ALWAYS_INLINE |
int8_t libjit_clip_i8(int32_t val) {
  // Clamp to the INT8 range [-128, 127] before narrowing.
  const int32_t lowBounded = (val > -128) ? val : -128;
  return (int8_t)((lowBounded < 127) ? lowBounded : 127);
}
149 | |
150 | /// \returns the clipped value of the input to INT16 range [-32768, 32767]. |
151 | LIBJIT_ALWAYS_INLINE |
int16_t libjit_clip_i16(int32_t val) {
  // Clamp to the INT16 range [-32768, 32767] before narrowing.
  const int32_t lowBounded = (val > -32768) ? val : -32768;
  return (int16_t)((lowBounded < 32767) ? lowBounded : 32767);
}
155 | |
156 | /// Scales a 32-bit or 64-bit integer to a 32-bit integer using the integer |
157 | /// shift-mult-shift method. |
158 | template <typename SrcTy = int32_t, typename DestTy = int32_t> |
159 | LIBJIT_ALWAYS_INLINE DestTy libjit_scale(SrcTy input, int32_t pre, int32_t post, |
160 | int32_t scale, int32_t offset) { |
161 | // The operation x >> post is rounded down to negative infinity. To get to |
162 | // round-nearest we add (1 << (post - 1)) to the value prior to shifting. |
163 | // Rounding is performed only when shifting right (pos > 0). |
164 | SrcTy rtn = (post > 0) ? (1 << (post - 1)) : 0; |
165 | |
166 | // NOTICE: If your tests are failing because of signed integer overflow then |
167 | // this is a bug in the test and not in the program. You should make sure that |
168 | // the inputs to the operations do not overflow. The semantics of the |
169 | // quantization process is such that the result for values that fall out of |
170 | // range is undefined. The conversion procedure will only tolerate a few bits |
171 | // of overflow and the result will be clipped. |
172 | return ((((input >> pre) * scale) + rtn) >> post) + offset; |
173 | } |
174 | |
175 | /// Applies an activation function to a FLOAT input value \p input based on |
176 | /// the activation type \p actType and the activation arguments \p actArgs. |
177 | /// NOTE: The type of the activation must be in sync with the FusedActivation |
178 | /// enumeration in glow\include\glow\Graph\Nodes.h. |
179 | LIBJIT_ALWAYS_INLINE |
180 | float libjit_activation_f(float input, int32_t actType, const float *actArgs) { |
181 | if (actType == 0) { |
182 | // No activation. |
183 | return input; |
184 | } else if (actType == 1) { |
185 | // Relu. |
186 | return MAX(input, 0); |
187 | } else if (actType == 2) { |
188 | // Clip. |
189 | return MIN(MAX(input, actArgs[0]), actArgs[1]); |
190 | } else if (actType == 3) { |
191 | // Tanh. |
192 | return libjit_tanh_f(input); |
193 | } else if (actType == 4) { |
194 | // Sigmoid. |
195 | return libjit_sigmoid_f(input); |
196 | } else { |
197 | // LeakyRelu. |
198 | return (input >= 0) ? input : actArgs[0] * input; |
199 | } |
200 | } |
201 | |
202 | /// Applies an activation function to a QUANTIZED input value \p input based on |
203 | /// the activation type \p actType and the activation arguments \p actArgs. |
204 | /// NOTE: The type of the activation must be in sync with the FusedActivation |
205 | /// enumeration in glow\include\glow\Graph\Nodes.h. |
206 | LIBJIT_ALWAYS_INLINE |
207 | int32_t libjit_activation_i32(int32_t input, int32_t offset, int32_t actType, |
208 | const int32_t *actArgs) { |
209 | if (actType == 0) { |
210 | // No activation. |
211 | return input; |
212 | } else if (actType == 1) { |
213 | // Relu. |
214 | return MAX(input, offset); |
215 | } else if (actType == 2) { |
216 | // Clip. |
217 | return MIN(MAX(input, actArgs[0]), actArgs[1]); |
218 | } else if (actType == 3) { |
219 | // Tanh. |
220 | assert(false && "Fused Tanh for quantized type not supported!" ); |
221 | return input; |
222 | } else if (actType == 4) { |
223 | // Sigmoid. |
224 | assert(false && "Fused Sigmoid for quantized type not supported!" ); |
225 | return input; |
226 | } else { |
227 | // LeakyRelu. |
228 | return (input >= offset) |
229 | ? input |
230 | : libjit_scale<int32_t>(input - offset, actArgs[0], actArgs[1], |
231 | actArgs[2], offset); |
232 | } |
233 | } |
234 | |
235 | /// Divides the 32-bit integer \p input with \p divider. The division is done |
236 | /// with rounding for better precision. Input can be both positive or negative. |
237 | /// Divider is assumed strictly positive. |
238 | LIBJIT_ALWAYS_INLINE |
int32_t libjit_div_round_i32(int32_t input, int32_t divider) {
  // Bias the numerator by half of the (strictly positive) divider, toward
  // the sign of the input, so truncating division rounds to nearest.
  const int32_t half = divider / 2;
  if (input > 0) {
    return (input + half) / divider;
  }
  return (input - half) / divider;
}
245 | |
/// Portable aligned allocation with the posix_memalign calling convention:
/// libjit_aligned_malloc(p, a, s) stores an s-byte, a-aligned allocation into
/// *p and evaluates to 0 on success or an error code on failure.
#ifdef _WIN32
/// On Windows, emulate posix_memalign on top of _aligned_malloc: assign the
/// result into *p, then yield 0 on success or errno on failure.
/// NOTE(review): presumably _aligned_malloc sets errno on failure — confirm
/// against the MSVC CRT documentation.
#define libjit_aligned_malloc(p, a, s)                                         \
  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
/// Memory from _aligned_malloc must be released with _aligned_free.
#define libjit_aligned_free(p) _aligned_free(p)
#else
#define libjit_aligned_malloc(p, a, s) posix_memalign(p, a, s)
#define libjit_aligned_free(p) free(p)
#endif
254 | |
255 | /// This function computes the minimum filter index based on the the minimum |
256 | /// input index \p inp_min. |
257 | LIBJIT_ALWAYS_INLINE ssize_t libjit_conv_flt_min(ssize_t inp_min) { |
258 | return MAX(0, -inp_min); |
259 | } |
260 | |
261 | /// This function computes the maximum filter index based on the the input size |
262 | /// \p inp_size, the filter size \p flt_size and the minimum input index |
263 | /// \p inp_min. |
264 | LIBJIT_ALWAYS_INLINE ssize_t libjit_conv_flt_max(ssize_t inp_size, |
265 | ssize_t flt_size, |
266 | ssize_t inp_min) { |
267 | return MIN(flt_size, inp_size - inp_min); |
268 | } |
269 | |
270 | /// This function computes the effective filter length given the minimum filter |
271 | /// index \p flt_min and the maximum filter index \p flt_max. |
272 | LIBJIT_ALWAYS_INLINE ssize_t libjit_conv_flt_len(ssize_t flt_min, |
273 | ssize_t flt_max) { |
274 | return MAX(0, flt_max - flt_min); |
275 | } |
276 | |
277 | #endif // GLOW_LLVMIRCODEGEN_LIBJIT_LIBJIT_DEFS_H |
278 | |