/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/

#ifndef TENSORFLOW_CORE_KERNELS_QUANTIZE_AND_DEQUANTIZE_OP_H_
#define TENSORFLOW_CORE_KERNELS_QUANTIZE_AND_DEQUANTIZE_OP_H_

#include <cstring>
#include <limits>
#include <vector>

#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor"
#include "tensorflow/core/framework/op_kernel.h"
#include "tensorflow/core/framework/tensor.h"
#include "tensorflow/core/framework/tensor_types.h"
#include "tensorflow/core/kernels/cwise_ops.h"
#include "tensorflow/core/platform/types.h"

namespace tensorflow {

enum QuantizerRoundMode {
  // Round half up: if the fraction of y is exactly 0.5, ties are rounded
  // towards positive infinity, i.e. round(y) = floor(y + 0.5).
  // E.g., -5.5 gets rounded to -5, -5.4 goes to -5,
  // 5.4 goes to 5, and 5.5 goes to 6.
  ROUND_HALF_UP,
  // Round half to even: if the fraction of y is exactly 0.5, then round(y) is
  // the nearest even integer to y.
  // E.g., 23.5 gets rounded to 24, 24.5 gets rounded to 24, while -23.5
  // becomes -24, and -24.5 gets rounded to -24.
  ROUND_HALF_TO_EVEN,
};

namespace functor {

// TODO(pauldonnelly): 'signed_input' should really be called 'signed_output'.

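// Each functor below quantizes 'input' onto a num_bits-wide integer grid and
// immediately dequantizes it back into 'output', so the output is a tensor of
// the same type and shape that carries the rounding and clamping error of the
// simulated quantization. The "PerChannel" variants operate on rank-3 tensors
// laid out as [outer, channels, inner] and use one (min, max) range per
// channel (dimension 1).
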
template <typename Device, typename T>
struct QuantizeAndDequantizeOneScaleFunctor {
  void operator()(const Device& d, typename TTypes<T>::ConstVec input,
                  bool signed_input, int num_bits, bool range_given,
                  Tensor* input_min_tensor, Tensor* input_max_tensor,
                  QuantizerRoundMode round_mode, bool narrow_range,
                  typename TTypes<T>::Vec output);
};
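
// Illustrative sketch (not part of this header) of how a kernel might invoke
// the one-scale functor; 'ctx', 'input', 'output' and the attribute variables
// are assumed to be the usual OpKernelContext*, input Tensor, output Tensor*
// and kernel attributes:
//
//   functor::QuantizeAndDequantizeOneScaleFunctor<Device, T> f;
//   f(ctx->eigen_device<Device>(), input.flat<T>(), signed_input, num_bits,
//     range_given, &input_min_tensor, &input_max_tensor, round_mode,
//     narrow_range, output->flat<T>());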

template <typename Device, typename T>
struct QuantizeAndDequantizePerChannelFunctor {
  void operator()(const Device& d, typename TTypes<T, 3>::ConstTensor input,
                  bool signed_input, int num_bits, bool range_given,
                  Tensor* input_min_tensor, Tensor* input_max_tensor,
                  QuantizerRoundMode round_mode, bool narrow_range,
                  typename TTypes<T, 3>::Tensor output);
};

template <typename Device, typename T>
struct QuantizeAndDequantizeOneScaleGradientFunctor {
  void operator()(const Device& d, typename TTypes<T>::ConstFlat gradient,
                  typename TTypes<T>::ConstFlat input,
                  typename TTypes<T>::ConstScalar input_min,
                  typename TTypes<T>::ConstScalar input_max,
                  typename TTypes<T>::Flat input_backprop,
                  typename TTypes<T>::Scalar input_min_backprop,
                  typename TTypes<T>::Scalar input_max_backprop);
};

template <typename Device, typename T>
struct QuantizeAndDequantizePerChannelGradientFunctor {
  void operator()(const Device& d, typename TTypes<T, 3>::ConstTensor gradient,
                  typename TTypes<T, 3>::ConstTensor input,
                  const Tensor* input_min_tensor,
                  const Tensor* input_max_tensor,
                  typename TTypes<T, 3>::Tensor input_backprop,
                  typename TTypes<T>::Flat input_min_backprop,
                  typename TTypes<T>::Flat input_max_backprop);
};

// The implementation below runs on both CPU and GPU.
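// ClampScaleAndRound clamps 'input' to [min_range, max_range], maps it onto
// the integer grid with 'scale', rounds with 'round_func', and maps back with
// 'inverse_scale', i.e.
//   output = round(clamp(input, min_range, max_range) * scale) * inverse_scale.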
template <typename Device, typename T, typename Func,
          typename Vec = typename TTypes<T>::Vec,
          typename ConstVec = typename TTypes<T>::ConstVec>
void ClampScaleAndRound(const Device& d, ConstVec input, T min_range,
                        T max_range, T scale, T inverse_scale, Func round_func,
                        Vec output) {
  output.device(d) = (input.cwiseMin(max_range).cwiseMax(min_range) * scale)
                         .unaryExpr(round_func) *
                     inverse_scale;
}

// The implementation below runs on both CPU and GPU.
template <typename Device, typename T, typename Vec = typename TTypes<T>::Vec,
          typename ConstVec = typename TTypes<T>::ConstVec>
void ClampScaleAndRound(const Device& d, ConstVec input, T min_range,
                        T max_range, T scale, T inverse_scale,
                        QuantizerRoundMode round_mode, Vec output) {
  switch (round_mode) {
    case ROUND_HALF_TO_EVEN:
      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
                         Eigen::internal::scalar_round_half_to_even_op<T>(),
                         output);
      break;
    case ROUND_HALF_UP:
      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
                         Eigen::internal::scalar_round_up_op<T>(), output);
      break;
  }
}

// The implementation below runs on both CPU and GPU.
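// ScaleAndRound is the no-clamp variant, used when the quantization range was
// derived from the input itself (so every element already lies within it):
//   output = round(input * scale) * inverse_scale.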
template <typename Device, typename T, typename Func,
          typename Vec = typename TTypes<T>::Vec,
          typename ConstVec = typename TTypes<T>::ConstVec>
void ScaleAndRound(const Device& d, ConstVec input, T scale, T inverse_scale,
                   Func round_func, Vec output) {
  output.device(d) = (input * scale).unaryExpr(round_func) * inverse_scale;
}

// The implementation below runs on both CPU and GPU.
template <typename Device, typename T, typename Vec = typename TTypes<T>::Vec,
          typename ConstVec = typename TTypes<T>::ConstVec>
void ScaleAndRound(const Device& d, ConstVec input, T scale, T inverse_scale,
                   QuantizerRoundMode round_mode, Vec output) {
  switch (round_mode) {
    case ROUND_HALF_TO_EVEN:
      ScaleAndRound(d, input, scale, inverse_scale,
                    Eigen::internal::scalar_round_half_to_even_op<T>(), output);
      break;
    case ROUND_HALF_UP:
      ScaleAndRound(d, input, scale, inverse_scale,
                    Eigen::internal::scalar_round_up_op<T>(), output);
      break;
  }
}

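// Computes the integer grid [min_quantized, max_quantized] implied by
// signed_input / num_bits / narrow_range, derives 'scale' and 'inverse_scale',
// and widens whichever of *min_range / *max_range is not the limiting side so
// that the pair maps exactly onto the grid while keeping 0 unchanged.
//
// Illustrative example: signed_input = true, num_bits = 8,
// narrow_range = false and an initial range of [-1, 1] give the grid
// [-128, 127]; the max side limits the scale, so scale = 127,
// inverse_scale = 1/127, max_range stays 1, and min_range is widened to
// -128/127 (about -1.0079).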
template <typename T>
void ComputeQuantizationRange(bool signed_input, int num_bits,
                              QuantizerRoundMode round_mode, bool narrow_range,
                              T* min_range, T* max_range, T* scale,
                              T* inverse_scale) {
  // Calculate the range for the simulated integer quantization:
  // e.g. [-127, 127] for signed = true, narrow_range = true, num_bits = 8,
  // or [-128, 127] for signed = true, narrow_range = false, num_bits = 8,
  // or [0, 255] for signed = false, num_bits = 8.
  const int64_t min_quantized =
      signed_input ? narrow_range ? -(1ULL << (num_bits - 1)) + 1
                                  : -(1ULL << (num_bits - 1))
                   : 0;
  const int64_t max_quantized =
      signed_input ? (1ULL << (num_bits - 1)) - 1 : (1ULL << num_bits) - 1;
  // Determine the maximum scaling factor that would scale
  // [min_range, max_range] to not exceed [min_quantized, max_quantized],
  // while keeping 0 unchanged.
  const T scale_from_min_side = (min_quantized * *min_range > 0)
                                    ? min_quantized / *min_range
                                    : std::numeric_limits<T>::max();
  const T scale_from_max_side = (max_quantized * *max_range > 0)
                                    ? max_quantized / *max_range
                                    : std::numeric_limits<T>::max();

  // Note: Avoids changing the side of the range that determines scale.
  if (scale_from_min_side < scale_from_max_side) {
    *scale = scale_from_min_side;
    *inverse_scale = *min_range / min_quantized;
    *max_range = max_quantized * *inverse_scale;
  } else {
    *scale = scale_from_max_side;
    *inverse_scale = *max_range / max_quantized;
    *min_range = min_quantized * *inverse_scale;
  }
}

// The implementation below runs on both CPU and GPU.
template <typename Device, typename T>
struct QuantizeAndDequantizeOneScaleImpl {
  static void Compute(const Device& d, typename TTypes<T>::ConstVec input,
                      bool signed_input, int num_bits, bool range_given,
                      Tensor* input_min_tensor, Tensor* input_max_tensor,
                      QuantizerRoundMode round_mode, bool narrow_range,
                      typename TTypes<T>::Vec output) {
    T min_range;
    T max_range;
    auto input_min = input_min_tensor->scalar<T>();
    auto input_max = input_max_tensor->scalar<T>();
    if (!range_given) {
      input_min.device(d) = input.minimum();
      input_max.device(d) = input.maximum();
      d.memcpyDeviceToHost(&min_range, input_min.data(), sizeof(T));
      d.memcpyDeviceToHost(&max_range, input_max.data(), sizeof(T));
    } else {
      // Copy the range values from their respective tensors on the host.
      min_range = input_min_tensor->scalar<T>()();
      max_range = input_max_tensor->scalar<T>()();
    }

    T scale, inverse_scale;
    ComputeQuantizationRange(signed_input, num_bits, round_mode, narrow_range,
                             &min_range, &max_range, &scale, &inverse_scale);

    if (range_given) {
      // Note: The clamping here is to avoid overflow in the quantized type.
      // The semantics of the op do not guarantee clamping to the specified
      // min_range and max_range, because ComputeQuantizationRange may have
      // adjusted either of them.
      ClampScaleAndRound(d, input, min_range, max_range, scale, inverse_scale,
                         round_mode, output);
    } else {
      ScaleAndRound(d, input, scale, inverse_scale, round_mode, output);
    }
  }
};

// The implementation below runs on both CPU and GPU.
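// The rank-3 'input' is laid out as [outer, channels, inner]; dimension 1 is
// the channel dimension, and each channel is quantized with its own
// (min, max) range.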
template <typename Device, typename T>
struct QuantizeAndDequantizePerChannelImpl {
  static void Compute(const Device& d, typename TTypes<T, 3>::ConstTensor input,
                      bool signed_input, int num_bits, bool range_given,
                      Tensor* input_min_tensor, Tensor* input_max_tensor,
                      QuantizerRoundMode round_mode, bool narrow_range,
                      typename TTypes<T, 3>::Tensor output) {
    using Index = typename tensorflow::TTypes<T>::ConstTensor::Index;
    int num_channels = input.dimension(1);
    auto input_min = input_min_tensor->vec<T>();
    auto input_max = input_max_tensor->vec<T>();
    std::vector<T> min_range(num_channels);
    std::vector<T> max_range(num_channels);

    if (!range_given) {
      Eigen::IndexList<Eigen::type2index<0>, Eigen::type2index<2> > reduce_dims;
      input_min.device(d) = input.minimum(reduce_dims);
      input_max.device(d) = input.maximum(reduce_dims);
      d.memcpyDeviceToHost(min_range.data(), input_min.data(),
                           num_channels * sizeof(T));
      d.memcpyDeviceToHost(max_range.data(), input_max.data(),
                           num_channels * sizeof(T));
    } else {
      // Copy the range values from their respective tensors on the host.
      std::memcpy(min_range.data(), input_min_tensor->vec<T>().data(),
                  num_channels * sizeof(T));
      std::memcpy(max_range.data(), input_max_tensor->vec<T>().data(),
                  num_channels * sizeof(T));
    }

    for (Index i = 0; i < num_channels; ++i) {
      const auto input_chip = input.template chip<1>(i);
      auto output_chip = output.template chip<1>(i);

      T scale, inverse_scale;
      ComputeQuantizationRange(signed_input, num_bits, round_mode,
                               narrow_range, &min_range[i], &max_range[i],
                               &scale, &inverse_scale);
      if (range_given) {
        ClampScaleAndRound(d, input_chip, min_range[i], max_range[i], scale,
                           inverse_scale, round_mode, output_chip);
      } else {
        ScaleAndRound(d, input_chip, scale, inverse_scale, round_mode,
                      output_chip);
      }
    }
  }
};

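// Gradient of quantize-and-dequantize: the straight-through estimator
// restricted to the quantization range. The incoming gradient is passed
// through unchanged wherever the input lies inside [input_min, input_max] and
// zeroed elsewhere; the range inputs themselves receive zero gradient.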
template <typename Device, typename T>
struct QuantizeAndDequantizeOneScaleGradientImpl {
  static void Compute(const Device& d, typename TTypes<T>::ConstFlat gradient,
                      typename TTypes<T>::ConstFlat input,
                      typename TTypes<T>::ConstScalar input_min,
                      typename TTypes<T>::ConstScalar input_max,
                      typename TTypes<T>::Flat input_backprop,
                      typename TTypes<T>::Scalar input_min_backprop,
                      typename TTypes<T>::Scalar input_max_backprop) {
    const T min_val = input_min();
    const T max_val = input_max();
    const auto in_range =
        (input >= min_val && input <= max_val)
            .select(input.constant(1.0f), input.constant(0.0f));
    input_backprop.device(d) = gradient * in_range;
    input_min_backprop.device(d) = input_min_backprop.constant(0.0f);
    input_max_backprop.device(d) = input_max_backprop.constant(0.0f);
  }
};

template <typename Device, typename T>
struct QuantizeAndDequantizePerChannelGradientImpl {
  static void Compute(const Device& d,
                      typename TTypes<T, 3>::ConstTensor gradient,
                      typename TTypes<T, 3>::ConstTensor input,
                      const Tensor* input_min_tensor,
                      const Tensor* input_max_tensor,
                      typename TTypes<T, 3>::Tensor input_backprop,
                      typename TTypes<T>::Flat input_min_backprop,
                      typename TTypes<T>::Flat input_max_backprop) {
    using Index = typename tensorflow::TTypes<T>::ConstTensor::Index;
    auto input_min = input_min_tensor->vec<T>();
    auto input_max = input_max_tensor->vec<T>();
    int num_channels = input.dimension(1);
    for (Index i = 0; i < num_channels; ++i) {
      const auto gradient_chip = gradient.template chip<1>(i);
      const auto input_chip = input.template chip<1>(i);
      const T min_val = input_min(i);
      const T max_val = input_max(i);
      const auto in_range =
          (input_chip >= min_val && input_chip <= max_val)
              .select(input_chip.constant(1.0f), input_chip.constant(0.0f));
      input_backprop.template chip<1>(i).device(d) = gradient_chip * in_range;
    }
    input_min_backprop.device(d) = input_min_backprop.constant(0.0f);
    input_max_backprop.device(d) = input_max_backprop.constant(0.0f);
  }
};

}  // end of namespace functor
}  // end of namespace tensorflow

#endif  // TENSORFLOW_CORE_KERNELS_QUANTIZE_AND_DEQUANTIZE_OP_H_