1 | #pragma once |
2 | |
3 | #include <stdint.h> |
4 | #include <stddef.h> |
5 | |
6 | /* SSE-specific headers */ |
7 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) |
8 | #include <xmmintrin.h> |
9 | #endif |
10 | |
11 | /* MSVC-specific headers */ |
12 | #if defined(_MSC_VER) |
13 | #include <intrin.h> |
14 | #endif |
15 | |
16 | |
/*
 * Saved value of the floating-point control register for the target
 * architecture. Exactly one member is compiled in, selected by the same
 * preprocessor conditions that the accessor functions in this header use.
 */
struct fpu_state {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
    /* x86 / x86-64 with SSE: contents of the MXCSR register */
    uint32_t mxcsr;
#elif (defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)) || (defined(_MSC_VER) && defined(_M_ARM))
    /* 32-bit ARM with hardware floating point: contents of FPSCR */
    uint32_t fpscr;
#elif (defined(__GNUC__) && defined(__aarch64__)) || (defined(_MSC_VER) && defined(_M_ARM64))
    /* 64-bit ARM: contents of FPCR */
    uint64_t fpcr;
#else
    /* No supported control register on this target: placeholder member only */
    char unused;
#endif
};
28 | |
29 | static inline struct fpu_state get_fpu_state() { |
30 | struct fpu_state state = { 0 }; |
31 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) |
32 | state.mxcsr = (uint32_t) _mm_getcsr(); |
33 | #elif defined(_MSC_VER) && defined(_M_ARM) |
34 | state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0); |
35 | #elif defined(_MSC_VER) && defined(_M_ARM64) |
36 | state.fpcr = (uint64_t) _ReadStatusReg(0x5A20); |
37 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) |
38 | __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr)); |
39 | #elif defined(__GNUC__) && defined(__aarch64__) |
40 | __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr)); |
41 | #endif |
42 | return state; |
43 | } |
44 | |
45 | static inline void set_fpu_state(const struct fpu_state state) { |
46 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1) |
47 | _mm_setcsr((unsigned int) state.mxcsr); |
48 | #elif defined(_MSC_VER) && defined(_M_ARM) |
49 | _MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0); |
50 | #elif defined(_MSC_VER) && defined(_M_ARM64) |
51 | _WriteStatusReg(0x5A20, (__int64) state.fpcr); |
52 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0) |
53 | __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr)); |
54 | #elif defined(__GNUC__) && defined(__aarch64__) |
55 | __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr)); |
56 | #endif |
57 | } |
58 | |
/*
 * Turns off denormal (subnormal) floating-point handling for the calling
 * thread by OR-ing the relevant flush-to-zero bits into the floating-point
 * control register. All other bits of the register are left as they were.
 *
 * On targets where no branch below applies the function does nothing.
 * Use get_fpu_state()/set_fpu_state() around the call to restore the
 * previous mode afterwards.
 */
static inline void disable_fpu_denormals(void) {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
    /* MXCSR: 0x8000 is flush-to-zero (bit 15), 0x0040 is denormals-are-zero (bit 6) */
    _mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(_MSC_VER) && defined(_M_ARM)
    /* FPSCR: 0x1000000 is bit 24 (flush-to-zero) */
    int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
    fpscr |= 0x1000000;
    _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
    /* FPCR: 0x1000000 is bit 24 (flush-to-zero), 0x80000 is bit 19 (half-precision flush-to-zero) */
    __int64 fpcr = _ReadStatusReg(0x5A20);
    fpcr |= 0x1080000;
    _WriteStatusReg(0x5A20, fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
    uint32_t fpscr;
    #if defined(__thumb__) && !defined(__thumb2__)
        /*
         * Thumb-1 ORRS cannot take this immediate, so the mask is passed in
         * a low register. The output is written by VMRS before the mask is
         * read by ORRS, so it must be marked early-clobber ("&"); otherwise
         * the compiler may place both operands in the same register and the
         * mask would be overwritten before use.
         */
        __asm__ __volatile__(
            "VMRS %[fpscr], fpscr\n"
            "ORRS %[fpscr], %[bitmask]\n"
            "VMSR fpscr, %[fpscr]\n"
            : [fpscr] "=&l" (fpscr)
            : [bitmask] "l" (0x1000000)
            : "cc");
    #else
        /* ARM / Thumb-2: the mask fits as an immediate, no input operands needed */
        __asm__ __volatile__(
            "VMRS %[fpscr], fpscr\n"
            "ORR %[fpscr], #0x1000000\n"
            "VMSR fpscr, %[fpscr]\n"
            : [fpscr] "=r" (fpscr));
    #endif
#elif defined(__GNUC__) && defined(__aarch64__)
    /* Read FPCR, set bit 24 and bit 19 (two ORRs, one immediate each), write it back */
    uint64_t fpcr;
    __asm__ __volatile__(
        "MRS %[fpcr], fpcr\n"
        "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
        "ORR %w[fpcr], %w[fpcr], 0x80000\n"
        "MSR fpcr, %[fpcr]\n"
        : [fpcr] "=r" (fpcr));
#endif
}
97 | |
/*
 * Steps an index one position backwards, wrapping from 0 to n - 1.
 *
 * i: current index.
 * n: value used as the wrap-around point when i is 0.
 *
 * Returns n - 1 when i is 0, otherwise i - 1. The index is not otherwise
 * reduced modulo n.
 */
static inline size_t modulo_decrement(size_t i, size_t n) {
    /* A zero index wraps around to n before the decrement */
    const size_t start = (i == 0) ? n : i;
    return start - 1;
}
106 | |
/*
 * Integer division rounded towards positive infinity.
 *
 * dividend: value to divide.
 * divisor:  value to divide by; must be non-zero.
 *
 * Returns the smallest q such that q * divisor >= dividend. Computed from
 * the quotient and remainder, so no intermediate sum can overflow.
 */
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
    const size_t remainder = dividend % divisor;
    const size_t quotient = dividend / divisor;
    /* Any leftover remainder needs one extra unit */
    return (remainder != 0) ? quotient + 1 : quotient;
}
114 | |
/*
 * Windows headers define min and max as macros; remove the min macro here
 * so that the function below can be defined under that name.
 */
#if defined(min)
#undef min
#endif

/* Returns the smaller of the two size_t values a and b. */
static inline size_t min(size_t a, size_t b) {
    if (a < b) {
        return a;
    }
    return b;
}
123 | |