/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/add.h"

#include <stddef.h>
#include <stdint.h>

#include <algorithm>

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/optimized/cpu_check.h"
#include "tensorflow/lite/kernels/internal/optimized/neon_check.h"
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/add.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h"
#include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h"
#include "tensorflow/lite/kernels/internal/reference/reference_ops.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/types.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/op_macros.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace add {
// This file has three implementations of Add.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};

constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;
struct OpData {
  // These fields are used in both the general 8-bit -> 8-bit quantized path
  // and the special 16-bit -> 16-bit quantized path.
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8-bit quantized path.
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;

  // Indicates whether the scale parameter is a power of two.
  // Used only in the 16-bit -> 16-bit quantized path.
  bool pot_scale_int16;
};

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
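  // This is a builtin op, so the custom-options 'buffer' is not used.
  // The OpData allocated here carries the quantization parameters computed
  // in Prepare() over to Eval().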
  auto* data = new OpData;
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
  delete reinterpret_cast<OpData*>(buffer);
}

TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  const bool requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  // 8-bit -> 8-bit general quantized path with general rescalings, as well
  // as int16 -> int16 with general rescalings.

  // There are two implementations of the ADD operator for 16-bit
  // input/output, depending on whether the scale parameter is a power of 2.
  // Currently only the general-case implementation is used, but the
  // power-of-two implementation is still needed for older model versions.
  bool general_scale_int16 = false;

  bool input1_scale_is_pot = false;
  bool input2_scale_is_pot = false;
  bool output_scale_is_pot = false;

  int input1_scale_log2_rounded{0};
  int input2_scale_log2_rounded{0};
  int output_scale_log2_rounded{0};

  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
      output->type == kTfLiteInt16) {
    // For int16, quantization is symmetric, so the zero points must be zero.
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    general_scale_int16 = !params || !params->pot_scale_int16;

    if (!general_scale_int16) {
      // Do preparation for the case where the scale parameter is a power
      // of 2.

      input1_scale_is_pot =
          CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);

      input2_scale_is_pot =
          CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);

      output_scale_is_pot =
          CheckedLog2(output->params.scale, &output_scale_log2_rounded);

      general_scale_int16 =
          !input1_scale_is_pot || !input2_scale_is_pot || !output_scale_is_pot;
    }
  }

  data->pot_scale_int16 = !general_scale_int16;

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      general_scale_int16) {
    // 8-bit -> 8-bit general quantized path with general rescalings, as
    // well as 16-bit -> 16-bit with general rescalings.
    data->input1_offset = -input1->params.zero_point;
    data->input2_offset = -input2->params.zero_point;
    data->output_offset = output->params.zero_point;

    // The left shift is 15 for the 16-bit case and 20 for the 8-bit case.
    // For 16-bit, 65535 << 15 is less than 1 << 31, so the addition still
    // fits in a 32-bit accumulator.
    data->left_shift = general_scale_int16 ? 15 : 20;
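    // Both inputs are first rescaled to the common scale
    // twice_max_input_scale, summed in that domain, and then rescaled to
    // the output scale. Each real multiplier below is smaller than one, so
    // QuantizeMultiplierSmallerThanOneExp can encode it as a 32-bit
    // fixed-point multiplier plus a right shift.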
    const double twice_max_input_scale =
        2 * std::max(input1->params.scale, input2->params.scale);
    const double real_input1_multiplier =
        input1->params.scale / twice_max_input_scale;
    const double real_input2_multiplier =
        input2->params.scale / twice_max_input_scale;
    const double real_output_multiplier =
        twice_max_input_scale /
        ((1 << data->left_shift) * output->params.scale);

    QuantizeMultiplierSmallerThanOneExp(
        real_input1_multiplier, &data->input1_multiplier, &data->input1_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_input2_multiplier, &data->input2_multiplier, &data->input2_shift);

    QuantizeMultiplierSmallerThanOneExp(
        real_output_multiplier, &data->output_multiplier, &data->output_shift);

    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
        context, params->activation, output, &data->output_activation_min,
        &data->output_activation_max));
  } else if (output->type == kTfLiteInt16) {
    // 16-bit -> 16-bit special quantized path, supporting only a rather
    // narrow case of quantization parameters: zero points must all be 0
    // ("symmetric quantization") and scales must be powers of two (which
    // we abbreviate as "POT" below). The intended use case for this path
    // is LSTM cells, where the fixed-point arithmetic used for some of
    // the internal math requires such symmetric, power-of-two quantization
    // (fixed-point formats are inherently symmetric and power-of-two).
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    TF_LITE_ENSURE(context, input1_scale_is_pot);
    TF_LITE_ENSURE(context, input2_scale_is_pot);
    TF_LITE_ENSURE(context, output_scale_is_pot);

    data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
    data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

    // Shifting of at most one input is supported: graph quantization should
    // ensure that the other input's scale matches the output's. The shifts
    // must also be non-positive, i.e. each input scale is at most the
    // output scale, so rescaling an input is a right shift.
    TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
    TF_LITE_ENSURE(context, data->input1_shift <= 0);
    TF_LITE_ENSURE(context, data->input2_shift <= 0);

    TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
        context, params->activation, output, &data->output_activation_min,
        &data->output_activation_max));
  }

  return context->ResizeTensor(context, output, output_size);
}

template <KernelType kernel_type>
void EvalAdd(TfLiteContext* context, TfLiteNode* node, TfLiteAddParams* params,
             const OpData* data, const TfLiteTensor* input1,
             const TfLiteTensor* input2, TfLiteTensor* output) {
  tflite::ArithmeticParams op_params;
  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
      GetTensorShape(input1), GetTensorShape(input2), &op_params);
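// TF_LITE_ADD computes the activation range for 'data_type', stores it in
// op_params, and dispatches to the 'opname' kernel in namespace 'type'.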
#define TF_LITE_ADD(type, opname, data_type)                             \
  data_type output_activation_min, output_activation_max;                \
  CalculateActivationRange(params->activation, &output_activation_min,   \
                           &output_activation_max);                      \
  SetActivationParams(output_activation_min, output_activation_max,      \
                      &op_params);                                       \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
  if (output->type == kTfLiteInt32) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(reference_ops, Add, int32_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int32_t);
      } else {
        TF_LITE_ADD(optimized_ops, Add, int32_t);
      }
    }
  } else if (output->type == kTfLiteInt64) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int64_t);
      } else {
        TF_LITE_ADD(reference_ops, Add, int64_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAdd4DSlow, int64_t);
      } else {
        TF_LITE_ADD(optimized_ops, Add, int64_t);
      }
    }
  } else if (output->type == kTfLiteFloat32) {
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, float);
      } else {
        TF_LITE_ADD(reference_ops, Add, float);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_ADD(optimized_ops, BroadcastAddDispatch, float);
      } else {
        TF_LITE_ADD(optimized_ops, Add, float);
      }
    }
  }
#undef TF_LITE_ADD
}

template <KernelType kernel_type>
TfLiteStatus EvalAddQuantized(TfLiteContext* context, TfLiteNode* node,
                              TfLiteAddParams* params, const OpData* data,
                              const TfLiteTensor* input1,
                              const TfLiteTensor* input2,
                              TfLiteTensor* output) {
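  // The general rescaling path handles uint8/int8 outputs, and int16 when
  // the scales are not powers of two; the POT int16 case is handled
  // separately below.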
  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      !data->pot_scale_int16) {
    tflite::ArithmeticParams op_params;
    op_params.left_shift = data->left_shift;
    op_params.input1_offset = data->input1_offset;
    op_params.input1_multiplier = data->input1_multiplier;
    op_params.input1_shift = data->input1_shift;
    op_params.input2_offset = data->input2_offset;
    op_params.input2_multiplier = data->input2_multiplier;
    op_params.input2_shift = data->input2_shift;
    op_params.output_offset = data->output_offset;
    op_params.output_multiplier = data->output_multiplier;
    op_params.output_shift = data->output_shift;
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
    bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
        GetTensorShape(input1), GetTensorShape(input2), &op_params);
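// Unlike the macro in EvalAdd, this one relies on the activation range and
// rescaling parameters that Prepare() already stored in OpData/op_params.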
#define TF_LITE_ADD(type, opname, dtype)                             \
  type::opname(op_params, GetTensorShape(input1),                    \
               GetTensorData<dtype>(input1), GetTensorShape(input2), \
               GetTensorData<dtype>(input2), GetTensorShape(output), \
               GetTensorData<dtype>(output));
    if (output->type == kTfLiteInt8) {
      if (kernel_type == kReference) {
        if (need_broadcast) {
          TF_LITE_ADD(reference_integer_ops, BroadcastAdd4DSlow, int8_t);
        } else {
          TF_LITE_ADD(reference_integer_ops, Add, int8_t);
        }
      } else {
        if (need_broadcast) {
          TF_LITE_ADD(optimized_integer_ops, BroadcastAddDispatch, int8_t);
        } else {
          TF_LITE_ADD(optimized_integer_ops, Add, int8_t);
        }
      }
    } else if (output->type == kTfLiteInt16) {
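      // There is no optimized broadcast kernel for the general int16 path,
      // so the reference BroadcastAdd4DSlow is used for both kernel types.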
      if (need_broadcast) {
        TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, int16_t);
      } else {
        if (kernel_type == kReference) {
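          // The trailing 'false' argument asks the reference int16 Add for
          // general (non-power-of-two) rescaling behavior.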
          reference_ops::Add(
              op_params, GetTensorShape(input1), GetTensorData<int16_t>(input1),
              GetTensorShape(input2), GetTensorData<int16_t>(input2),
              GetTensorShape(output), GetTensorData<int16_t>(output), false);
        } else {
          TF_LITE_ADD(optimized_integer_ops, Add, int16_t);
        }
      }
    } else {
      if (kernel_type == kReference) {
        if (need_broadcast) {
          TF_LITE_ADD(reference_ops, BroadcastAdd4DSlow, uint8_t);
        } else {
          TF_LITE_ADD(reference_ops, Add, uint8_t);
        }
      } else {
        if (need_broadcast) {
          TF_LITE_ADD(optimized_ops, BroadcastAddDispatch, uint8_t);
        } else {
          TF_LITE_ADD(optimized_ops, Add, uint8_t);
        }
      }
    }
#undef TF_LITE_ADD
  } else if (output->type == kTfLiteInt16) {
    tflite::ArithmeticParams op_params;
    op_params.input1_shift = data->input1_shift;
    op_params.input2_shift = data->input2_shift;
    SetActivationParams(data->output_activation_min,
                        data->output_activation_max, &op_params);
#define TF_LITE_ADD(type, opname)                                      \
  type::opname(op_params, GetTensorShape(input1),                      \
               GetTensorData<int16_t>(input1), GetTensorShape(input2), \
               GetTensorData<int16_t>(input2), GetTensorShape(output), \
               GetTensorData<int16_t>(output))
    // The POT int16 kernels do not handle broadcasting; both kernel types
    // dispatch to the element-wise Add.
    if (kernel_type == kReference) {
      TF_LITE_ADD(reference_ops, Add);
    } else {
      TF_LITE_ADD(optimized_ops, Add);
    }
#undef TF_LITE_ADD
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteAddParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32 ||
      output->type == kTfLiteInt64) {
    EvalAdd<kernel_type>(context, node, params, data, input1, input2, output);
  } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
             output->type == kTfLiteInt16) {
    TF_LITE_ENSURE_OK(context,
                      EvalAddQuantized<kernel_type>(context, node, params,
                                                    data, input1, input2,
                                                    output));
  } else {
    TF_LITE_UNSUPPORTED_TYPE(context, output->type, "Add");
  }

  return kTfLiteOk;
}

}  // namespace add

TfLiteRegistration* Register_ADD_REF() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kReference>};
  return &r;
}

TfLiteRegistration* Register_ADD_GENERIC_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_ADD_NEON_OPT() {
  static TfLiteRegistration r = {add::Init, add::Free, add::Prepare,
                                 add::Eval<add::kNeonOptimized>};
  return &r;
}

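// Picks the NEON kernel when NEON support was compiled in; otherwise falls
// back to the portable optimized kernel.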
TfLiteRegistration* Register_ADD() {
#ifdef USE_NEON
  return Register_ADD_NEON_OPT();
#else
  return Register_ADD_GENERIC_OPT();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite
