1/**
2 * Copyright (c) Glow Contributors. See CONTRIBUTORS file.
3 *
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
7 *
8 * http://www.apache.org/licenses/LICENSE-2.0
9 *
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
15 */
16#ifndef GLOW_LLVMIRCODEGEN_LIBJIT_LIBJIT_DEFS_H
17#define GLOW_LLVMIRCODEGEN_LIBJIT_LIBJIT_DEFS_H
18
19#include <assert.h>
20#include <cmath>
21#include <cstdlib>
22#include <math.h>
23#include <stdint.h>
24#include <string.h>
25
26#include "libjit_dim_t.h"
27
/// Declares a function as `static inline` and tags it with the always_inline
/// attribute so that the compiler inlines it at every call site.
#define LIBJIT_ALWAYS_INLINE static inline __attribute__((always_inline))

/// When _MSC_VER is defined, ssize_t is provided as an alias of the SSIZE_T
/// type declared by <BaseTsd.h>.
#ifdef _MSC_VER
#include <BaseTsd.h>
using ssize_t = SSIZE_T;
#endif
34
/// SIMD vector types holding 4 and 8 packed floats (16 and 32 bytes). Clang
/// uses its ext_vector_type extension, GCC its vector_size extension.
#if defined(__clang__)
using float4 = float __attribute__((ext_vector_type(4)));
using float8 = float __attribute__((ext_vector_type(8)));
#elif defined(__GNUC__) || defined(__GNUG__)
using float4 = float __attribute__((vector_size(16)));
using float8 = float __attribute__((vector_size(32)));
#endif

/// Loads a simd float8 value from \p PTR.
/// The access dereferences a float8 pointer, so \p PTR is presumably expected
/// to be aligned for float8; use LoaduFloat8 for unaligned data.
/// The expansion is fully parenthesized so that the loaded value can be used
/// as an operand of any operator (e.g. LoadFloat8(p)[i]).
#define LoadFloat8(PTR) (*((const float8 *)(PTR)))

/// Stores the simd float8 value \p VAL to \p PTR.
/// NOTE: The expansion is a complete statement that already ends with a
/// semicolon (kept for compatibility with existing callers). Do not use it as
/// the unbraced body of an `if` that has an `else` branch.
#define StoreFloat8(PTR, VAL) *((float8 *)(PTR)) = (VAL);

/// Accumulate (+=) the simd float8 value \p VAL to \p PTR.
/// NOTE: Like StoreFloat8, the expansion already ends with a semicolon.
#define AddFloat8(PTR, VAL) *((float8 *)(PTR)) += (VAL);

/// Broadcast the input value \p VAL to all the lanes of a float8.
#if defined(__clang__)
// A scalar cast to an ext_vector_type replicates the scalar in every lane.
#define BroadcastFloat8(VAL) ((float8)(VAL))
#elif defined(__GNUC__) || defined(__GNUG__)
// GCC has no scalar-to-vector cast; a scalar operand of a binary vector
// operation is widened to all lanes, so subtracting a zero vector broadcasts.
#define BroadcastFloat8(VAL) ((VAL) - (float8){0})
#endif
58
/// \returns \p a when it compares less than \p b, otherwise \p b.
/// NOTE: This is a function-like macro: the selected argument is evaluated
/// twice, so do not pass expressions with side effects.
#define MIN(a, b) ((a) < (b) ? (a) : (b))

/// \returns \p a when it compares greater than \p b, otherwise \p b.
/// NOTE: The selected argument is evaluated twice (see MIN).
#define MAX(a, b) ((a) > (b) ? (a) : (b))

/// Accesses the element of \p tensor located at the offset returned by
/// get_element_ptr for the given \p dims and \p indices.
#define AT(tensor, dims, numDims, indices, numIndices)                         \
  tensor[get_element_ptr(tensor, dims, numDims, indices, numIndices)]
63
64/// Perform an unaligned load of a float8 from a float pointer.
65inline float8 LoaduFloat8(const float *p) {
66 float8 res;
67 memcpy(&res, p, sizeof(float8));
68 return res;
69}
70
71/// Perform an unaligned store to a float pointer.
72inline void StoreuFloat8(float *p, float8 v) { memcpy(p, &v, sizeof(float8)); }
73
74/// Perform an unaligned addition to a float pointer.
75inline void AdduFloat8(float *p, float8 v) {
76 StoreuFloat8(p, LoaduFloat8(p) + v);
77}
78
79/// \returns the index of the element at x,y,z,w,q,r.
80inline dim_t libjit_getXYZWQR(const dim_t *dims, dim_t x, dim_t y, dim_t z,
81 dim_t w, dim_t q, dim_t r) {
82 return (x * dims[1] * dims[2] * dims[3] * dims[4] * dims[5]) +
83 (y * dims[2] * dims[3] * dims[4] * dims[5]) +
84 (z * dims[3] * dims[4] * dims[5]) + (w * dims[4] * dims[5]) +
85 (q * dims[5]) + r;
86}
87
88/// \returns the index of the element at x,y,z,w,q.
89inline dim_t libjit_getXYZWQ(const dim_t *dims, dim_t x, dim_t y, dim_t z,
90 dim_t w, dim_t q) {
91 return (x * dims[1] * dims[2] * dims[3] * dims[4]) +
92 (y * dims[2] * dims[3] * dims[4]) + (z * dims[3] * dims[4]) +
93 (w * dims[4]) + q;
94}
95
96/// \returns the index of the element at x,y,z,w.
97inline dim_t libjit_getXYZW(const dim_t *dims, dim_t x, dim_t y, dim_t z,
98 dim_t w) {
99 return (x * dims[1] * dims[2] * dims[3]) + (y * dims[2] * dims[3]) +
100 (z * dims[3]) + w;
101}
102
103/// \returns the index of the element at x,y,z.
104inline dim_t libjit_getXYZ(const dim_t *dims, dim_t x, dim_t y, dim_t z) {
105 return (x * dims[1] * dims[2]) + (y * dims[2]) + z;
106}
107
108/// \returns the index of the element at x,y.
109inline dim_t libjit_getXY(const dim_t *dims, dim_t x, dim_t y) {
110 return (x * dims[1]) + y;
111}
112
113/// Computes the function Sigmoid(x) for float \p input.
114/// When the LIBJIT compile option "-ffast-math" is enabled the intermediate
115/// computation expf(x) for Sigmoid operator is not handled properly for very
116/// large positive values which results in NaN values for the Sigmoid output.
117/// Therefore when the "-ffast-math" is enabled we compute the Sigmoid such that
118/// we avoid computing large values for the "expf" function.
119LIBJIT_ALWAYS_INLINE
120float libjit_sigmoid_f(float input) {
121#ifdef FFAST_MATH
122 float sigmoidVal = 1 / (1 + expf(-std::abs(input)));
123 return (float)(std::signbit(input)) + std::copysignf(sigmoidVal, input);
124#else
125 float e = expf(-input);
126 return 1 / (e + 1);
127#endif // FFAST_MATH
128}
129
130/// Computes the function Tanh(x) for float \p input.
131/// When the LIBJIT compile option "-ffast-math" is enabled the intermediate
132/// computation expf(x) for Tanh operator is not handled properly for very
133/// large positive values which results in NaN values for the Tanh output.
134/// Therefore when the "-ffast-math" is enabled we compute the Tanh such that
135/// we avoid computing large values for the "expf" function.
136LIBJIT_ALWAYS_INLINE
137float libjit_tanh_f(float input) {
138#ifdef FFAST_MATH
139 float tanhVal = -1 + 2 / (expf(-2 * std::abs(input)) + 1);
140 return std::copysignf(tanhVal, input);
141#else
142 return 1 - 2 / (expf(input * 2) + 1);
143#endif // FFAST_MATH
144}
145
146/// \returns the clipped value of the input to INT8 range [-128, 127].
147LIBJIT_ALWAYS_INLINE
148int8_t libjit_clip_i8(int32_t val) { return (int8_t)MIN(MAX(val, -128), 127); }
149
150/// \returns the clipped value of the input to INT16 range [-32768, 32767].
151LIBJIT_ALWAYS_INLINE
152int16_t libjit_clip_i16(int32_t val) {
153 return (int16_t)MIN(MAX(val, -32768), 32767);
154}
155
156/// Scales a 32-bit or 64-bit integer to a 32-bit integer using the integer
157/// shift-mult-shift method.
158template <typename SrcTy = int32_t, typename DestTy = int32_t>
159LIBJIT_ALWAYS_INLINE DestTy libjit_scale(SrcTy input, int32_t pre, int32_t post,
160 int32_t scale, int32_t offset) {
161 // The operation x >> post is rounded down to negative infinity. To get to
162 // round-nearest we add (1 << (post - 1)) to the value prior to shifting.
163 // Rounding is performed only when shifting right (pos > 0).
164 SrcTy rtn = (post > 0) ? (1 << (post - 1)) : 0;
165
166 // NOTICE: If your tests are failing because of signed integer overflow then
167 // this is a bug in the test and not in the program. You should make sure that
168 // the inputs to the operations do not overflow. The semantics of the
169 // quantization process is such that the result for values that fall out of
170 // range is undefined. The conversion procedure will only tolerate a few bits
171 // of overflow and the result will be clipped.
172 return ((((input >> pre) * scale) + rtn) >> post) + offset;
173}
174
175/// Applies an activation function to a FLOAT input value \p input based on
176/// the activation type \p actType and the activation arguments \p actArgs.
177/// NOTE: The type of the activation must be in sync with the FusedActivation
178/// enumeration in glow\include\glow\Graph\Nodes.h.
179LIBJIT_ALWAYS_INLINE
180float libjit_activation_f(float input, int32_t actType, const float *actArgs) {
181 if (actType == 0) {
182 // No activation.
183 return input;
184 } else if (actType == 1) {
185 // Relu.
186 return MAX(input, 0);
187 } else if (actType == 2) {
188 // Clip.
189 return MIN(MAX(input, actArgs[0]), actArgs[1]);
190 } else if (actType == 3) {
191 // Tanh.
192 return libjit_tanh_f(input);
193 } else if (actType == 4) {
194 // Sigmoid.
195 return libjit_sigmoid_f(input);
196 } else {
197 // LeakyRelu.
198 return (input >= 0) ? input : actArgs[0] * input;
199 }
200}
201
202/// Applies an activation function to a QUANTIZED input value \p input based on
203/// the activation type \p actType and the activation arguments \p actArgs.
204/// NOTE: The type of the activation must be in sync with the FusedActivation
205/// enumeration in glow\include\glow\Graph\Nodes.h.
206LIBJIT_ALWAYS_INLINE
207int32_t libjit_activation_i32(int32_t input, int32_t offset, int32_t actType,
208 const int32_t *actArgs) {
209 if (actType == 0) {
210 // No activation.
211 return input;
212 } else if (actType == 1) {
213 // Relu.
214 return MAX(input, offset);
215 } else if (actType == 2) {
216 // Clip.
217 return MIN(MAX(input, actArgs[0]), actArgs[1]);
218 } else if (actType == 3) {
219 // Tanh.
220 assert(false && "Fused Tanh for quantized type not supported!");
221 return input;
222 } else if (actType == 4) {
223 // Sigmoid.
224 assert(false && "Fused Sigmoid for quantized type not supported!");
225 return input;
226 } else {
227 // LeakyRelu.
228 return (input >= offset)
229 ? input
230 : libjit_scale<int32_t>(input - offset, actArgs[0], actArgs[1],
231 actArgs[2], offset);
232 }
233}
234
235/// Divides the 32-bit integer \p input with \p divider. The division is done
236/// with rounding for better precision. Input can be both positive or negative.
237/// Divider is assumed strictly positive.
238LIBJIT_ALWAYS_INLINE
239int32_t libjit_div_round_i32(int32_t input, int32_t divider) {
240 // Division rounding term which is added for positive input and subtracted
241 // for negative input.
242 int32_t rnd = (divider >> 1);
243 return (input > 0) ? ((input + rnd) / divider) : ((input - rnd) / divider);
244}
245
/// libjit_aligned_malloc(p, a, s) allocates \p s bytes aligned to \p a bytes,
/// stores the address in *\p p and evaluates to 0 on success or to an error
/// code on failure. libjit_aligned_free(p) releases such an allocation.
#ifndef _WIN32
// posix_memalign already has the (pointer-to-result, alignment, size)
// signature and returns 0 or an error code.
#define libjit_aligned_malloc(p, a, s) posix_memalign(p, a, s)
#define libjit_aligned_free(p) free(p)
#else
// _aligned_malloc returns the pointer, so the result is stored through p and
// the status is derived from it: 0 when non-null, errno otherwise.
#define libjit_aligned_malloc(p, a, s)                                         \
  (((*(p)) = _aligned_malloc((s), (a))), *(p) ? 0 : errno)
#define libjit_aligned_free(p) _aligned_free(p)
#endif
254
255/// This function computes the minimum filter index based on the the minimum
256/// input index \p inp_min.
257LIBJIT_ALWAYS_INLINE ssize_t libjit_conv_flt_min(ssize_t inp_min) {
258 return MAX(0, -inp_min);
259}
260
261/// This function computes the maximum filter index based on the the input size
262/// \p inp_size, the filter size \p flt_size and the minimum input index
263/// \p inp_min.
264LIBJIT_ALWAYS_INLINE ssize_t libjit_conv_flt_max(ssize_t inp_size,
265 ssize_t flt_size,
266 ssize_t inp_min) {
267 return MIN(flt_size, inp_size - inp_min);
268}
269
270/// This function computes the effective filter length given the minimum filter
271/// index \p flt_min and the maximum filter index \p flt_max.
272LIBJIT_ALWAYS_INLINE ssize_t libjit_conv_flt_len(ssize_t flt_min,
273 ssize_t flt_max) {
274 return MAX(0, flt_max - flt_min);
275}
276
277#endif // GLOW_LLVMIRCODEGEN_LIBJIT_LIBJIT_DEFS_H
278