1 | /* Copyright 2019 Google LLC. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "ruy/denormal.h" |
17 | |
18 | // NOTE: this is simply a copy of pthreadpool/src/threadpool-utils.h that's not |
19 | // exposed by the pthreadpool library |
20 | // (https://github.com/Maratyszcza/pthreadpool), but with an additional C++ |
21 | // helper class to suppress floating-point denormal values. |
22 | |
23 | /* SSE-specific headers */ |
24 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ |
25 | (defined(_M_IX86_FP) && _M_IX86_FP >= 1) |
26 | #include <xmmintrin.h> |
27 | #endif |
28 | |
29 | /* MSVC-specific headers */ |
30 | #if defined(_MSC_VER) |
31 | #include <intrin.h> |
32 | #endif |
33 | |
34 | namespace ruy { |
35 | namespace { |
36 | inline struct fpu_state get_fpu_state() { |
37 | struct fpu_state state = {}; |
38 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ |
39 | (defined(_M_IX86_FP) && _M_IX86_FP >= 1) |
40 | state.mxcsr = static_cast<std::uint32_t>(_mm_getcsr()); |
41 | #elif defined(_MSC_VER) && defined(_M_ARM) |
42 | state.fpscr = |
43 | static_cast<std::uint32_t>(_MoveFromCoprocessor(10, 7, 1, 0, 0)); |
44 | #elif defined(_MSC_VER) && defined(_M_ARM64) |
45 | state.fpcr = static_cast<std::uint64_t>(_ReadStatusReg(0x5A20)); |
46 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ |
47 | (__ARM_FP != 0) |
48 | __asm__ __volatile__("VMRS %[fpscr], fpscr" : [fpscr] "=r" (state.fpscr)); |
49 | #elif defined(__GNUC__) && defined(__aarch64__) |
50 | __asm__ __volatile__("MRS %[fpcr], fpcr" : [fpcr] "=r" (state.fpcr)); |
51 | #endif |
52 | return state; |
53 | } |
54 | |
55 | inline void set_fpu_state(const struct fpu_state state) { |
56 | #if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \ |
57 | (defined(_M_IX86_FP) && _M_IX86_FP >= 1) |
58 | _mm_setcsr(static_cast<unsigned int>(state.mxcsr)); |
59 | #elif defined(_MSC_VER) && defined(_M_ARM) |
60 | _MoveToCoprocessor(static_cast<int>(state.fpscr), 10, 7, 1, 0, 0); |
61 | #elif defined(_MSC_VER) && defined(_M_ARM64) |
62 | _WriteStatusReg(0x5A20, static_cast<__int64>(state.fpcr)); |
63 | #elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \ |
64 | (__ARM_FP != 0) |
65 | __asm__ __volatile__("VMSR fpscr, %[fpscr]" : : [fpscr] "r" (state.fpscr)); |
66 | #elif defined(__GNUC__) && defined(__aarch64__) |
67 | __asm__ __volatile__("MSR fpcr, %[fpcr]" : : [fpcr] "r" (state.fpcr)); |
68 | #else |
69 | (void)state; |
70 | #endif |
71 | } |
72 | |
// Sets the "flush denormals to zero" control bits in the calling thread's
// floating-point control register, leaving every other bit as it was.
//
// Bits that are OR-ed in, per platform:
//   * x86 MXCSR:      0x8040    = FTZ (bit 15) | DAZ (bit 6)
//   * ARM FPSCR:      0x1000000 = FZ (bit 24)
//   * AArch64 FPCR:   0x1080000 = FZ (bit 24) | FZ16 (bit 19)
//
// The previous register value is not returned; callers that need to undo the
// change should capture it with get_fpu_state() first. On platforms that match
// none of the branches below this function does nothing.
inline void disable_fpu_denormals() {
#if defined(__SSE__) || defined(__x86_64__) || defined(_M_X64) || \
    (defined(_M_IX86_FP) && _M_IX86_FP >= 1)
  _mm_setcsr(_mm_getcsr() | 0x8040);
#elif defined(_MSC_VER) && defined(_M_ARM)
  int fpscr = _MoveFromCoprocessor(10, 7, 1, 0, 0);
  fpscr |= 0x1000000;
  _MoveToCoprocessor(fpscr, 10, 7, 1, 0, 0);
#elif defined(_MSC_VER) && defined(_M_ARM64)
  __int64 fpcr = _ReadStatusReg(0x5A20);
  fpcr |= 0x1080000;
  _WriteStatusReg(0x5A20, fpcr);
#elif defined(__GNUC__) && defined(__arm__) && defined(__ARM_FP) && \
    (__ARM_FP != 0)
  std::uint32_t fpscr;
#if defined(__thumb__) && !defined(__thumb2__)
  // Thumb-1 has no ORR-with-immediate, so the mask is passed in a register.
  // The output operand is written (by VMRS) before the input operand is read
  // (by ORRS), so it must be marked early-clobber ("&"); otherwise the
  // compiler is allowed to place both operands in the same register, which
  // would destroy the mask before it is used.
  __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORRS %[fpscr], %[bitmask]\n"
      "VMSR fpscr, %[fpscr]\n"
      : [fpscr] "=&l"(fpscr)
      : [bitmask] "l"(0x1000000)
      : "cc");
#else
  __asm__ __volatile__(
      "VMRS %[fpscr], fpscr\n"
      "ORR %[fpscr], #0x1000000\n"
      "VMSR fpscr, %[fpscr]\n"
      : [fpscr] "=r"(fpscr));
#endif
#elif defined(__GNUC__) && defined(__aarch64__)
  std::uint64_t fpcr;
  // The two masks are applied with separate ORR instructions, on the 32-bit
  // view (%w) of the register.
  __asm__ __volatile__(
      "MRS %[fpcr], fpcr\n"
      "ORR %w[fpcr], %w[fpcr], 0x1000000\n"
      "ORR %w[fpcr], %w[fpcr], 0x80000\n"
      "MSR fpcr, %[fpcr]\n"
      : [fpcr] "=r"(fpcr));
#endif
}
113 | } // namespace |
114 | |
115 | ScopedSuppressDenormals::ScopedSuppressDenormals() { |
116 | restore_ = get_fpu_state(); |
117 | disable_fpu_denormals(); |
118 | } |
119 | |
120 | ScopedSuppressDenormals::~ScopedSuppressDenormals() { set_fpu_state(restore_); } |
121 | } // namespace ruy |
122 | |