1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #include "tensorflow/lite/kernels/internal/reference/sub.h" |
16 | |
17 | #include <stddef.h> |
18 | #include <stdint.h> |
19 | |
20 | #include <algorithm> |
21 | #include <limits> |
22 | |
23 | #include "tensorflow/lite/c/builtin_op_data.h" |
24 | #include "tensorflow/lite/c/common.h" |
25 | #include "tensorflow/lite/kernels/internal/compatibility.h" |
26 | #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" |
27 | #include "tensorflow/lite/kernels/internal/optimized/integer_ops/sub.h" |
28 | #include "tensorflow/lite/kernels/internal/optimized/neon_check.h" |
29 | #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" |
30 | #include "tensorflow/lite/kernels/internal/quantization_util.h" |
31 | #include "tensorflow/lite/kernels/internal/reference/add.h" |
32 | #include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" |
33 | #include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" |
34 | #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" |
35 | #include "tensorflow/lite/kernels/internal/tensor.h" |
36 | #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" |
37 | #include "tensorflow/lite/kernels/internal/types.h" |
38 | #include "tensorflow/lite/kernels/kernel_util.h" |
39 | |
40 | namespace tflite { |
41 | namespace ops { |
42 | namespace builtin { |
43 | namespace sub { |
44 | |
// This file has three implementations of Sub: reference, generic optimized
// (portable, NEON-free), and NEON-optimized.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};

// Indices of this op's tensors within the node's input/output arrays.
constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

// Per-node state computed once in Prepare() and reused by every Eval() call.
struct OpData {
  // True when input1 and input2 have different shapes, so the output must be
  // computed with broadcasting.
  bool requires_broadcast;

  // These fields are used in both the general 8-bit -> 8bit quantized path,
  // and the special 16-bit -> 16bit quantized path
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8bit quantized path
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;

  // This parameter is used to indicate whether
  // parameter scale is power of two.
  // It is used in 16-bit -> 16-bit quantization.
  bool pot_scale_int16;
};
81 | |
82 | void* Init(TfLiteContext* context, const char* buffer, size_t length) { |
83 | auto* data = new OpData; |
84 | data->requires_broadcast = false; |
85 | return data; |
86 | } |
87 | |
88 | void Free(TfLiteContext* context, void* buffer) { |
89 | delete reinterpret_cast<OpData*>(buffer); |
90 | } |
91 | |
// Prepares the general quantized Sub path (uint8/int8, and int16 with
// arbitrary scales): validates zero points, derives the fixed-point
// multipliers/shifts for both inputs and the output, and computes the
// activation clamp range. Results are stored in `op_params`.
TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
                                 const TfLiteTensor* input_1,
                                 const TfLiteTensor* input_2,
                                 TfLiteTensor* output, TfLiteSubParams* params,
                                 OpData* op_params) {
  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
                              output->type == kTfLiteInt8 ||
                              output->type == kTfLiteInt16);
  const auto& input1_quantization_params = input_1->params;
  const auto& input2_quantization_params = input_2->params;
  const auto& output_quantization_params = output->params;
  // Valid zero-point range for the output's integer representation.
  int32_t integer_type_min = 0;
  int32_t integer_type_max = 0;
  if (output->type == kTfLiteUInt8) {
    integer_type_min = std::numeric_limits<uint8_t>::min();
    integer_type_max = std::numeric_limits<uint8_t>::max();
  } else if (output->type == kTfLiteInt16) {
    integer_type_min = std::numeric_limits<int16_t>::min();
    integer_type_max = std::numeric_limits<int16_t>::max();
  } else {
    // output->type == kTfLiteInt8
    integer_type_min = std::numeric_limits<int8_t>::min();
    integer_type_max = std::numeric_limits<int8_t>::max();
  }

  // Every zero point must be representable in the integer type above.
  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point <= integer_type_max);

  // Inputs are shifted to zero-centered values; the output offset is added
  // back after rescaling.
  op_params->input1_offset = -input1_quantization_params.zero_point;
  op_params->input2_offset = -input2_quantization_params.zero_point;
  op_params->output_offset = output_quantization_params.zero_point;

  // The shift is set to 15 in case of 16-bit and 20 in case of 8-bit,
  // accordingly. In case of 16-bit we have 65535 << 15 which is less than 1 <<
  // 31, therefore the addition will still fit in a 32 bit accumulator.
  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
  // Normalize both input multipliers by twice the larger input scale so each
  // real multiplier is < 1, as required by
  // QuantizeMultiplierSmallerThanOneExp.
  const double twice_max_input_scale =
      2 * std::max(input1_quantization_params.scale,
                   input2_quantization_params.scale);
  const double real_input1_multiplier =
      input1_quantization_params.scale / twice_max_input_scale;
  const double real_input2_multiplier =
      input2_quantization_params.scale / twice_max_input_scale;
  const double real_output_multiplier =
      twice_max_input_scale /
      ((1 << op_params->left_shift) * output_quantization_params.scale);

  tflite::QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
                                              &op_params->input1_multiplier,
                                              &op_params->input1_shift);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
                                              &op_params->input2_multiplier,
                                              &op_params->input2_shift);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
                                              &op_params->output_multiplier,
                                              &op_params->output_shift);

  // Clamp range in the output's quantized domain for the fused activation.
  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &op_params->output_activation_min,
      &op_params->output_activation_max));

  return kTfLiteOk;
}
165 | |
// Prepares the special int16 Sub path that requires symmetric quantization
// (all zero points 0) with power-of-two scales. Stores the per-input shifts
// and the activation range in `data`.
TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
                                  const TfLiteTensor* input1,
                                  const TfLiteTensor* input2,
                                  TfLiteTensor* output, TfLiteSubParams* params,
                                  OpData* data) {
  // 16bit -> 16bit special quantized path, supporting only a rather
  // narrow case of quantization parameters: zero_points must all be 0
  // ("symmetric quantization") and scales must be power-of-two (which
  // we abbreviate as "POT" below). The intended use case for this path
  // is in LSTM cells, where, due to the constraints of implementing
  // some of the math in these LSTM cells in fixed-point arithmetic,
  // we need to have such symmetric, power-of-two quantization
  // (Fixed-point formats are inherently symmetric, power-of-two).
  TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

  // Each scale must be an exact power of two; CheckedLog2 returns its
  // (rounded) log2 and whether the scale was truly POT.
  int input1_scale_log2_rounded;
  bool input1_scale_is_pot =
      CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
  TF_LITE_ENSURE(context, input1_scale_is_pot);

  int input2_scale_log2_rounded;
  bool input2_scale_is_pot =
      CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
  TF_LITE_ENSURE(context, input2_scale_is_pot);

  int output_scale_log2_rounded;
  bool output_scale_is_pot =
      CheckedLog2(output->params.scale, &output_scale_log2_rounded);
  TF_LITE_ENSURE(context, output_scale_is_pot);

  data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
  data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

  // Shifting of one input is supported. The graph quantization should ensure
  // that the other input matches the output.
  TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
  // Only right shifts (downscaling) are supported.
  TF_LITE_ENSURE(context, data->input1_shift <= 0);
  TF_LITE_ENSURE(context, data->input2_shift <= 0);

  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &data->output_activation_min,
      &data->output_activation_max));
  return kTfLiteOk;
}
212 | |
// Validates tensor counts/types, decides between broadcasting and same-shape
// execution, selects the quantized preparation path (general rescaling vs.
// the int16 power-of-two special case), and resizes the output tensor.
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  OpData* data = reinterpret_cast<OpData*>(node->user_data);
  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  // Both inputs must share a type; the output inherits it.
  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  data->requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (data->requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  // 8bit -> 8bit general quantized path, with general rescalings
  // as well as, 16bit -> 16bit with general rescalings

  // There are two implementations of SUB operator in case of
  // 16bit input depending on whether the scale parameter is
  // the power of 2 or not. Currently only implementation for
  // general case is used, but we need to use another implementation
  // for older versions.
  bool general_scale_int16 = false;

  bool input1_scale_is_pot = false;
  bool input2_scale_is_pot = false;
  bool output_scale_is_pot = false;

  int input1_scale_log2_rounded{0};
  int input2_scale_log2_rounded{0};
  int output_scale_log2_rounded{0};

  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
      output->type == kTfLiteInt16) {
    // int16 requires symmetric quantization regardless of the path chosen.
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    // Missing params or an unset pot_scale_int16 flag means the model was
    // built for the general-rescaling implementation.
    general_scale_int16 = !params || !params->pot_scale_int16;

    if (!general_scale_int16) {
      // Do preparation in the case of the scale parameter is power of 2.
      input1_scale_is_pot =
          CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);

      input2_scale_is_pot =
          CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);

      output_scale_is_pot =
          CheckedLog2(output->params.scale, &output_scale_log2_rounded);

      // Fall back to the general path if any scale is not a power of two.
      general_scale_int16 =
          !input1_scale_is_pot || !input2_scale_is_pot || !output_scale_is_pot;
    }
  }

  data->pot_scale_int16 = !general_scale_int16;

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      general_scale_int16) {
    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                   output, params, data));
  } else if (output->type == kTfLiteInt16) {
    // LSTM-special case with scale parameter of POT
    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
                                                    output, params, data));
  }

  return context->ResizeTensor(context, output, output_size);
}
299 | |
300 | template <KernelType kernel_type, typename data_type> |
301 | void EvalSubImpl(TfLiteContext* context, TfLiteNode* node, |
302 | TfLiteSubParams* params, const OpData* data, |
303 | const TfLiteTensor* input1, const TfLiteTensor* input2, |
304 | bool requires_broadcast, TfLiteTensor* output) { |
305 | data_type output_activation_min, output_activation_max; |
306 | CalculateActivationRange(params->activation, &output_activation_min, |
307 | &output_activation_max); |
308 | tflite::ArithmeticParams op_params; |
309 | SetActivationParams(output_activation_min, output_activation_max, &op_params); |
310 | |
311 | switch (kernel_type) { |
312 | case kReference: |
313 | if (requires_broadcast) { |
314 | reference_ops::BroadcastSubSlow( |
315 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
316 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
317 | GetTensorShape(output), GetTensorData<data_type>(output)); |
318 | } else { |
319 | reference_ops::SubWithActivation( |
320 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
321 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
322 | GetTensorShape(output), GetTensorData<data_type>(output)); |
323 | } |
324 | break; |
325 | case kGenericOptimized: |
326 | case kNeonOptimized: |
327 | if (requires_broadcast) { |
328 | optimized_ops::BroadcastSubSlow( |
329 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
330 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
331 | GetTensorShape(output), GetTensorData<data_type>(output)); |
332 | } else { |
333 | optimized_ops::SubWithActivation( |
334 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
335 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
336 | GetTensorShape(output), GetTensorData<data_type>(output)); |
337 | } |
338 | break; |
339 | } |
340 | } |
341 | |
342 | template <KernelType kernel_type> |
343 | void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params, |
344 | const OpData* data, const TfLiteTensor* input1, |
345 | const TfLiteTensor* input2, TfLiteTensor* output) { |
346 | const bool requires_broadcast = data->requires_broadcast; |
347 | switch (output->type) { |
348 | case kTfLiteInt32: |
349 | EvalSubImpl<kernel_type, int32_t>(context, node, params, data, input1, |
350 | input2, requires_broadcast, output); |
351 | break; |
352 | case kTfLiteFloat32: |
353 | EvalSubImpl<kernel_type, float>(context, node, params, data, input1, |
354 | input2, requires_broadcast, output); |
355 | break; |
356 | case kTfLiteInt64: |
357 | EvalSubImpl<kernel_type, int64_t>(context, node, params, data, input1, |
358 | input2, requires_broadcast, output); |
359 | break; |
360 | |
361 | default: |
362 | TF_LITE_KERNEL_LOG(context, "output type %s is not supported." , |
363 | TfLiteTypeGetName(output->type)); |
364 | } |
365 | } |
366 | |
// Quantized Sub for uint8/int8/int16. Copies the precomputed fixed-point
// parameters from `data` into an ArithmeticParams and dispatches to one of
// four paths: int8 general, int16 general, uint8 general, or the int16
// power-of-two (LSTM) special case.
template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteSubParams* params, const OpData* data,
                   const TfLiteTensor* input1, const TfLiteTensor* input2,
                   TfLiteTensor* output) {
  tflite::ArithmeticParams op_params;
  op_params.left_shift = data->left_shift;
  op_params.input1_offset = data->input1_offset;
  op_params.input1_multiplier = data->input1_multiplier;
  op_params.input1_shift = data->input1_shift;
  op_params.input2_offset = data->input2_offset;
  op_params.input2_multiplier = data->input2_multiplier;
  op_params.input2_shift = data->input2_shift;
  op_params.output_offset = data->output_offset;
  op_params.output_multiplier = data->output_multiplier;
  op_params.output_shift = data->output_shift;
  SetActivationParams(data->output_activation_min, data->output_activation_max,
                      &op_params);

  // Also fills op_params' broadcast metadata when shapes differ.
  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
      GetTensorShape(input1), GetTensorShape(input2), &op_params);

// Expands to one namespaced sub-op invocation over the three tensors.
#define TF_LITE_SUB(type, opname, data_type)                             \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
  if (output->type == kTfLiteInt8) {
    // int8: reference implementation only.
    if (need_broadcast) {
      TF_LITE_SUB(reference_ops, BroadcastQuantSubSlow, int8_t);
    } else {
      TF_LITE_SUB(reference_ops, Sub, int8_t);
    }
  } else if (!data->pot_scale_int16) {
    // int16 with general (non-power-of-two) rescaling.
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastQuantSubSlow, int16_t);
      } else {
        TF_LITE_SUB(reference_ops, Sub, int16_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_SUB(optimized_integer_ops, BroadcastSubDispatch, int16_t);
      } else {
        TF_LITE_SUB(optimized_integer_ops, Sub, int16_t);
      }
    }
  } else if (output->type == kTfLiteUInt8) {
    if (need_broadcast) {
      TF_LITE_SUB(reference_ops, BroadcastQuantSubSlow, uint8_t);
    } else {
      TF_LITE_SUB(reference_ops, Sub, uint8_t);
    }
  } else {
    // int16 power-of-two scale special case (see PrepareInt16SubOpPOT).
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(reference_ops, Sub16, int16_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_SUB(optimized_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(optimized_ops, Sub16, int16_t);
      }
    }
  }
#undef TF_LITE_SUB
}
437 | |
438 | template <KernelType kernel_type> |
439 | TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { |
440 | auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data); |
441 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
442 | |
443 | const TfLiteTensor* input1; |
444 | TF_LITE_ENSURE_OK(context, |
445 | GetInputSafe(context, node, kInputTensor1, &input1)); |
446 | const TfLiteTensor* input2; |
447 | TF_LITE_ENSURE_OK(context, |
448 | GetInputSafe(context, node, kInputTensor2, &input2)); |
449 | TfLiteTensor* output; |
450 | TF_LITE_ENSURE_OK(context, |
451 | GetOutputSafe(context, node, kOutputTensor, &output)); |
452 | |
453 | if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32 || |
454 | output->type == kTfLiteInt64) { |
455 | EvalSub<kernel_type>(context, node, params, data, input1, input2, output); |
456 | } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 || |
457 | output->type == kTfLiteInt16) { |
458 | EvalQuantized<kernel_type>(context, node, params, data, input1, input2, |
459 | output); |
460 | } else { |
461 | TF_LITE_KERNEL_LOG( |
462 | context, |
463 | "output type %d is not supported, requires float|uint8|int32 types." , |
464 | output->type); |
465 | return kTfLiteError; |
466 | } |
467 | |
468 | return kTfLiteOk; |
469 | } |
470 | |
471 | } // namespace sub |
472 | |
// Registration for the portable reference Sub kernel.
TfLiteRegistration* Register_SUB_REF() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kReference>};
  return &r;
}
478 | |
// Registration for the NEON-free optimized Sub kernel.
TfLiteRegistration* Register_SUB_GENERIC_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kGenericOptimized>};
  return &r;
}
484 | |
// Registration for the NEON-optimized Sub kernel.
TfLiteRegistration* Register_SUB_NEON_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kNeonOptimized>};
  return &r;
}
490 | |
// Default Sub registration: picks the NEON kernel when the build target
// enables NEON, otherwise the generic optimized kernel.
TfLiteRegistration* Register_SUB() {
#ifdef USE_NEON
  return Register_SUB_NEON_OPT();
#else
  return Register_SUB_GENERIC_OPT();
#endif
}
498 | |
499 | } // namespace builtin |
500 | } // namespace ops |
501 | } // namespace tflite |
502 | |