1 | /* Copyright 2015 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/tsl/platform/denormal.h" |
17 | |
18 | #include "tensorflow/core/platform/cpu_info.h" |
19 | #include "tensorflow/tsl/platform/platform.h" |
20 | |
// GCC 4.8 and older has a known bug that prevents the use of intrinsics
// unless the target architecture is enabled through compiler flags; see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202
// Such builds are flagged here so the SSE3 code paths can be compiled out.
// Clang, and builds that already define __SSE3__, are never flagged.
#if !defined(__SSE3__) && !defined(__clang__) && defined(__GNUC__)
#if (__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 9))
#define GCC_WITHOUT_INTRINSICS
#endif
#endif

// SSE3 intrinsics are used only when targeting a non-mobile x86 platform with
// a compiler that was not flagged as a known bad gcc version above.
#if defined(PLATFORM_IS_X86) && !defined(IS_MOBILE_PLATFORM)
#ifndef GCC_WITHOUT_INTRINSICS
#define X86_DENORM_USE_INTRINSICS
#endif
#endif
35 | |
36 | #ifdef X86_DENORM_USE_INTRINSICS |
37 | #include <pmmintrin.h> |
38 | #endif |
39 | |
// On ARM the floating-point control register is only accessed when the
// compiler reports hardware floating-point support through __ARM_FP.
#if defined(PLATFORM_IS_ARM)
#if defined(__ARM_FP) && (__ARM_FP > 0)
#define ARM_DENORM_AVAILABLE
// Mask for the flush-to-zero bit (bit 24) of the ARM floating-point control
// register.
#define ARM_FPCR_FZ (1 << 24)
#endif  // __ARM_FP
#endif  // PLATFORM_IS_ARM
47 | |
48 | namespace tsl { |
49 | namespace port { |
50 | |
51 | bool DenormalState::operator==(const DenormalState& other) const { |
52 | return flush_to_zero() == other.flush_to_zero() && |
53 | denormals_are_zero() == other.denormals_are_zero(); |
54 | } |
55 | |
56 | bool DenormalState::operator!=(const DenormalState& other) const { |
57 | return !(this->operator==(other)); |
58 | } |
59 | |
60 | #ifdef ARM_DENORM_AVAILABLE |
// Writes `fpcr` to the ARM floating-point control register.
//
// The ARM ACLE specifies __arm_rsr/__arm_wsr for reading and writing the
// status registers, but GCC does not implement them, so inline assembly is
// used instead.
static inline void ArmSetFloatingPointControlRegister(uint32_t fpcr) {
#if defined(PLATFORM_IS_ARM64)
  // On ARM64 the value is widened to 64 bits before being passed to MSR.
  const uint64_t fpcr64 = static_cast<uint64_t>(fpcr);
  __asm__ __volatile__("msr fpcr, %[fpcr]" : : [fpcr] "r"(fpcr64));
#else
  // 32-bit ARM writes the value to FPSCR via VMSR.
  __asm__ __volatile__("vmsr fpscr, %[fpcr]" : : [fpcr] "r"(fpcr));
#endif
}
73 | |
// Reads the ARM floating-point control register and returns it as a 32-bit
// value.
static inline uint32_t ArmGetFloatingPointControlRegister() {
#if defined(PLATFORM_IS_ARM64)
  // MRS is given a 64-bit destination; the result is then narrowed to the
  // 32-bit return type.
  uint64_t fpcr;
  __asm__ __volatile__("mrs %[fpcr], fpcr" : [fpcr] "=r"(fpcr));
  return static_cast<uint32_t>(fpcr);
#else
  // 32-bit ARM reads FPSCR via VMRS.
  uint32_t fpcr;
  __asm__ __volatile__("vmrs %[fpcr], fpscr" : [fpcr] "=r"(fpcr));
  return fpcr;
#endif
}
85 | #endif // ARM_DENORM_AVAILABLE |
86 | |
87 | bool SetDenormalState(const DenormalState& state) { |
88 | // For now, we flush denormals only on SSE 3 and ARM. Other architectures |
89 | // can be added as needed. |
90 | |
91 | #ifdef X86_DENORM_USE_INTRINSICS |
92 | if (TestCPUFeature(SSE3)) { |
93 | // Restore flags |
94 | _MM_SET_FLUSH_ZERO_MODE(state.flush_to_zero() ? _MM_FLUSH_ZERO_ON |
95 | : _MM_FLUSH_ZERO_OFF); |
96 | _MM_SET_DENORMALS_ZERO_MODE(state.denormals_are_zero() |
97 | ? _MM_DENORMALS_ZERO_ON |
98 | : _MM_DENORMALS_ZERO_OFF); |
99 | return true; |
100 | } |
101 | #endif |
102 | |
103 | #ifdef ARM_DENORM_AVAILABLE |
104 | // ARM only has one setting controlling both denormal inputs and outputs. |
105 | if (state.flush_to_zero() == state.denormals_are_zero()) { |
106 | uint32_t fpcr = ArmGetFloatingPointControlRegister(); |
107 | if (state.flush_to_zero()) { |
108 | fpcr |= ARM_FPCR_FZ; |
109 | } else { |
110 | fpcr &= ~ARM_FPCR_FZ; |
111 | } |
112 | ArmSetFloatingPointControlRegister(fpcr); |
113 | return true; |
114 | } |
115 | #endif |
116 | |
117 | // Setting denormal handling to the provided state is not supported. |
118 | return false; |
119 | } |
120 | |
121 | DenormalState GetDenormalState() { |
122 | #ifdef X86_DENORM_USE_INTRINSICS |
123 | if (TestCPUFeature(SSE3)) { |
124 | // Save existing flags |
125 | bool flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON; |
126 | bool denormals_zero_mode = |
127 | _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON; |
128 | return DenormalState(flush_zero_mode, denormals_zero_mode); |
129 | } |
130 | #endif |
131 | |
132 | #ifdef ARM_DENORM_AVAILABLE |
133 | uint32_t fpcr = ArmGetFloatingPointControlRegister(); |
134 | if ((fpcr & ARM_FPCR_FZ) != 0) { |
135 | return DenormalState(true, true); |
136 | } |
137 | #endif |
138 | |
139 | return DenormalState(false, false); |
140 | } |
141 | |
142 | ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState() |
143 | : denormal_state_(GetDenormalState()) {} |
144 | |
145 | ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() { |
146 | SetDenormalState(denormal_state_); |
147 | } |
148 | |
149 | ScopedFlushDenormal::ScopedFlushDenormal() { |
150 | SetDenormalState( |
151 | DenormalState(/*flush_to_zero=*/true, /*denormals_are_zero=*/true)); |
152 | } |
153 | |
154 | ScopedDontFlushDenormal::ScopedDontFlushDenormal() { |
155 | SetDenormalState( |
156 | DenormalState(/*flush_to_zero=*/false, /*denormals_are_zero=*/false)); |
157 | } |
158 | |
159 | } // namespace port |
160 | } // namespace tsl |
161 | |