1/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
2
3Licensed under the Apache License, Version 2.0 (the "License");
4you may not use this file except in compliance with the License.
5You may obtain a copy of the License at
6
7 http://www.apache.org/licenses/LICENSE-2.0
8
9Unless required by applicable law or agreed to in writing, software
10distributed under the License is distributed on an "AS IS" BASIS,
11WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12See the License for the specific language governing permissions and
13limitations under the License.
14==============================================================================*/
15
16#include "tensorflow/tsl/platform/denormal.h"
17
18#include "tensorflow/core/platform/cpu_info.h"
19#include "tensorflow/tsl/platform/platform.h"
20
// gcc 4.8 and older cannot use the intrinsics unless the target architecture
// is enabled through compiler flags (known bug, see
// https://gcc.gnu.org/bugzilla/show_bug.cgi?id=57202). Flag that situation:
// SSE3 is not enabled, the compiler is not clang, and it is a gcc release
// before 4.9.
#if !defined(__SSE3__) && !defined(__clang__) && defined(__GNUC__)
#if (__GNUC__ < 4) || ((__GNUC__ == 4) && (__GNUC_MINOR__ < 9))
#define GCC_WITHOUT_INTRINSICS
#endif
#endif
// SSE3 instructions are attempted only when all of the following hold: the
// target is an x86 platform, it is not a mobile platform, and the compiler was
// not flagged as a known bad gcc version (GCC_WITHOUT_INTRINSICS).
#if defined(PLATFORM_IS_X86)
#if !(defined(IS_MOBILE_PLATFORM) || defined(GCC_WITHOUT_INTRINSICS))
#define X86_DENORM_USE_INTRINSICS
#endif
#endif
35
36#ifdef X86_DENORM_USE_INTRINSICS
37#include <pmmintrin.h>
38#endif
39
// On ARM, the floating-point control register is only accessed when hardware
// floating-point support is available, i.e. __ARM_FP is defined and greater
// than zero.
#if defined(PLATFORM_IS_ARM) && defined(__ARM_FP)
#if __ARM_FP > 0
#define ARM_DENORM_AVAILABLE
// Mask for the flush-to-zero bit (bit 24) of the ARM floating-point control
// register.
#define ARM_FPCR_FZ (1 << 24)
#endif
#endif
47
48namespace tsl {
49namespace port {
50
51bool DenormalState::operator==(const DenormalState& other) const {
52 return flush_to_zero() == other.flush_to_zero() &&
53 denormals_are_zero() == other.denormals_are_zero();
54}
55
56bool DenormalState::operator!=(const DenormalState& other) const {
57 return !(this->operator==(other));
58}
59
60#ifdef ARM_DENORM_AVAILABLE
// The ARM ACLE specifies __arm_rsr/__arm_wsr for reading and writing the
// status registers, but GCC does not implement them, so inline assembly is
// used instead.
//
// Writes `fpcr` to the floating-point control register: `fpcr` (via `msr`)
// when PLATFORM_IS_ARM64 is defined, `fpscr` (via `vmsr`) otherwise.
static inline void ArmSetFloatingPointControlRegister(uint32_t fpcr) {
#ifdef PLATFORM_IS_ARM64
  // The value is widened to 64 bits before being passed as the asm operand.
  const uint64_t widened = static_cast<uint64_t>(fpcr);
  __asm__ __volatile__("msr fpcr, %[fpcr]" : : [fpcr] "r"(widened));
#else
  __asm__ __volatile__("vmsr fpscr, %[fpcr]" : : [fpcr] "r"(fpcr));
#endif
}
73
// Reads the floating-point control register with inline assembly and returns
// it as a 32-bit value: `fpcr` (via `mrs`) when PLATFORM_IS_ARM64 is defined,
// `fpscr` (via `vmrs`) otherwise.
static inline uint32_t ArmGetFloatingPointControlRegister() {
#ifdef PLATFORM_IS_ARM64
  // The register is read into a 64-bit operand and then narrowed to 32 bits.
  uint64_t wide_value;
  __asm__ __volatile__("mrs %[fpcr], fpcr" : [fpcr] "=r"(wide_value));
  return static_cast<uint32_t>(wide_value);
#else
  uint32_t value;
  __asm__ __volatile__("vmrs %[fpcr], fpscr" : [fpcr] "=r"(value));
  return value;
#endif
}
85#endif // ARM_DENORM_AVAILABLE
86
87bool SetDenormalState(const DenormalState& state) {
88 // For now, we flush denormals only on SSE 3 and ARM. Other architectures
89 // can be added as needed.
90
91#ifdef X86_DENORM_USE_INTRINSICS
92 if (TestCPUFeature(SSE3)) {
93 // Restore flags
94 _MM_SET_FLUSH_ZERO_MODE(state.flush_to_zero() ? _MM_FLUSH_ZERO_ON
95 : _MM_FLUSH_ZERO_OFF);
96 _MM_SET_DENORMALS_ZERO_MODE(state.denormals_are_zero()
97 ? _MM_DENORMALS_ZERO_ON
98 : _MM_DENORMALS_ZERO_OFF);
99 return true;
100 }
101#endif
102
103#ifdef ARM_DENORM_AVAILABLE
104 // ARM only has one setting controlling both denormal inputs and outputs.
105 if (state.flush_to_zero() == state.denormals_are_zero()) {
106 uint32_t fpcr = ArmGetFloatingPointControlRegister();
107 if (state.flush_to_zero()) {
108 fpcr |= ARM_FPCR_FZ;
109 } else {
110 fpcr &= ~ARM_FPCR_FZ;
111 }
112 ArmSetFloatingPointControlRegister(fpcr);
113 return true;
114 }
115#endif
116
117 // Setting denormal handling to the provided state is not supported.
118 return false;
119}
120
121DenormalState GetDenormalState() {
122#ifdef X86_DENORM_USE_INTRINSICS
123 if (TestCPUFeature(SSE3)) {
124 // Save existing flags
125 bool flush_zero_mode = _MM_GET_FLUSH_ZERO_MODE() == _MM_FLUSH_ZERO_ON;
126 bool denormals_zero_mode =
127 _MM_GET_DENORMALS_ZERO_MODE() == _MM_DENORMALS_ZERO_ON;
128 return DenormalState(flush_zero_mode, denormals_zero_mode);
129 }
130#endif
131
132#ifdef ARM_DENORM_AVAILABLE
133 uint32_t fpcr = ArmGetFloatingPointControlRegister();
134 if ((fpcr & ARM_FPCR_FZ) != 0) {
135 return DenormalState(true, true);
136 }
137#endif
138
139 return DenormalState(false, false);
140}
141
142ScopedRestoreFlushDenormalState::ScopedRestoreFlushDenormalState()
143 : denormal_state_(GetDenormalState()) {}
144
145ScopedRestoreFlushDenormalState::~ScopedRestoreFlushDenormalState() {
146 SetDenormalState(denormal_state_);
147}
148
149ScopedFlushDenormal::ScopedFlushDenormal() {
150 SetDenormalState(
151 DenormalState(/*flush_to_zero=*/true, /*denormals_are_zero=*/true));
152}
153
154ScopedDontFlushDenormal::ScopedDontFlushDenormal() {
155 SetDenormalState(
156 DenormalState(/*flush_to_zero=*/false, /*denormals_are_zero=*/false));
157}
158
159} // namespace port
160} // namespace tsl
161