1#pragma once
2
3#include <stdint.h>
4#include <stddef.h>
5
6/* SSE-specific headers */
7#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
8 #include <xmmintrin.h>
9#endif
10
11/* MSVC-specific headers */
12#if defined(_MSC_VER)
13 #include <intrin.h>
14#endif
15
16
/*
 * Saved copy of the floating-point control register for the target platform.
 * Exactly one member exists, selected at compile time; it is filled in by
 * get_fpu_state() and consumed by set_fpu_state().
 */
struct fpu_state {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	/* x86 with SSE: value obtained from _mm_getcsr() */
	uint32_t mxcsr;
#elif (defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)) || (defined(_MSC_VER) && defined(_M_ARM))
	/* 32-bit ARM with floating-point support: FPSCR value */
	uint32_t fpscr;
#elif (defined(__GNUC__) && defined(__aarch64__)) || (defined(_MSC_VER) && defined(_M_ARM64))
	/* 64-bit ARM: FPCR value */
	uint64_t fpcr;
#else
	/* Placeholder member for platforms matched by none of the cases above */
	char unused;
#endif
};
28
29static inline struct fpu_state get_fpu_state() {
30 struct fpu_state state = { 0 };
31#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
32 state.mxcsr = (uint32_t) _mm_getcsr();
33#elif defined(_MSC_VER) && defined(_M_ARM)
34 state.fpscr = (uint32_t) _MoveFromCoprocessor(10, 7, 1, 0, 0);
35#elif defined(_MSC_VER) && defined(_M_ARM64)
36 state.fpcr = (uint64_t) _ReadStatusReg(0x5A20);
37#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
38 __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr));
39#elif defined(__GNUC__) && defined(__aarch64__)
40 __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr));
41#endif
42 return state;
43}
44
45static inline void set_fpu_state(const struct fpu_state state) {
46#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
47 _mm_setcsr((unsigned int) state.mxcsr);
48#elif defined(_MSC_VER) && defined(_M_ARM)
49 _MoveToCoprocessor((int) state.fpscr, 10, 7, 1, 0, 0);
50#elif defined(_MSC_VER) && defined(_M_ARM64)
51 _WriteStatusReg(0x5A20, (__int64) state.fpcr);
52#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
53 __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr));
54#elif defined(__GNUC__) && defined(__aarch64__)
55 __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr));
56#endif
57}
58
/*
 * Turn off denormal (subnormal) number handling by OR-ing the relevant
 * control bits into the platform's floating-point control register. All
 * other bits of the register are preserved (read-modify-write). On
 * platforms matched by none of the branches below, this is a no-op.
 *
 * Use get_fpu_state() beforehand and set_fpu_state() afterwards to restore
 * the previous mode.
 */
static inline void disable_fpu_denormals(void) {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
	/* MXCSR: 0x8000 is flush-to-zero (FTZ), 0x0040 is denormals-are-zero (DAZ) */
	_mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(_MSC_VER) && defined(_M_ARM)
	/* FPSCR: 0x1000000 (bit 24) is the flush-to-zero (FZ) bit */
	int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
	fpscr |= 0x1000000;
	_MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
	/* FPCR: 0x1000000 (bit 24, FZ) and 0x80000 (bit 19, FZ16) */
	__int64 fpcr = _ReadStatusReg(0x5A20);
	fpcr |= 0x1080000;
	_WriteStatusReg(0x5A20, fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && (__ARM_FP != 0)
	uint32_t fpscr;
	#if defined(__thumb__) && !defined(__thumb2__)
		/*
		 * The bitmask is supplied in a low register rather than as an
		 * immediate. The output is marked early-clobber ("=&l") because VMRS
		 * writes it before ORRS reads the bitmask input; without the '&' the
		 * compiler may assign both operands to the same register and the
		 * mask would be overwritten.
		 */
		__asm__ __volatile__(
				"VMRS %[fpscr], fpscr\n"
				"ORRS %[fpscr], %[bitmask]\n"
				"VMSR fpscr, %[fpscr]\n"
			: [fpscr] "=&l" (fpscr)
			: [bitmask] "l" (0x1000000)
			: "cc");
	#else
		__asm__ __volatile__(
				"VMRS %[fpscr], fpscr\n"
				"ORR %[fpscr], #0x1000000\n"
				"VMSR fpscr, %[fpscr]\n"
			: [fpscr] "=r" (fpscr));
	#endif
#elif defined(__GNUC__) && defined(__aarch64__)
	/* Same bits as the MSVC ARM64 branch (0x1000000 | 0x80000), set in two steps */
	uint64_t fpcr;
	__asm__ __volatile__(
			"MRS %[fpcr], fpcr\n"
			"ORR %w[fpcr], %w[fpcr], 0x1000000\n"
			"ORR %w[fpcr], %w[fpcr], 0x80000\n"
			"MSR fpcr, %[fpcr]\n"
		: [fpcr] "=r" (fpcr));
#endif
}
97
/*
 * Decrement an index with wrap-around.
 *
 * Returns i - 1 when i is non-zero, and n - 1 when i is zero.
 */
static inline size_t modulo_decrement(size_t i, size_t n) {
	/* An index of zero wraps to n before the decrement is applied */
	const size_t start = (i != 0) ? i : n;
	return start - 1;
}
106
/*
 * Integer division rounded towards positive infinity.
 *
 * Returns dividend / divisor, plus one when the division leaves a remainder.
 * divisor must be non-zero (it is used as the right operand of / and %).
 */
static inline size_t divide_round_up(size_t dividend, size_t divisor) {
	const size_t quotient = dividend / divisor;
	const size_t remainder = dividend % divisor;
	return (remainder != 0) ? quotient + 1 : quotient;
}
114
/* Windows headers may define min and max as macros; undefine min here so the function below can use the name */
116#ifdef min
117 #undef min
118#endif
119
/* Return the smaller of two size_t values. */
static inline size_t min(size_t a, size_t b) {
	if (a < b) {
		return a;
	}
	return b;
}
123