1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #include "tensorflow/lite/kernels/internal/reference/sub.h" |
16 | |
17 | #include <stddef.h> |
18 | #include <stdint.h> |
19 | |
20 | #include <algorithm> |
21 | #include <limits> |
22 | |
23 | #include "tensorflow/lite/c/builtin_op_data.h" |
24 | #include "tensorflow/lite/c/common.h" |
25 | #include "tensorflow/lite/kernels/internal/compatibility.h" |
26 | #include "tensorflow/lite/kernels/internal/optimized/cpu_check.h" |
27 | #include "tensorflow/lite/kernels/internal/optimized/integer_ops/sub.h" |
28 | #include "tensorflow/lite/kernels/internal/optimized/neon_check.h" |
29 | #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" |
30 | #include "tensorflow/lite/kernels/internal/quantization_util.h" |
31 | #include "tensorflow/lite/kernels/internal/reference/add.h" |
32 | #include "tensorflow/lite/kernels/internal/reference/integer_ops/add.h" |
33 | #include "tensorflow/lite/kernels/internal/reference/process_broadcast_shapes.h" |
34 | #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" |
35 | #include "tensorflow/lite/kernels/internal/tensor.h" |
36 | #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" |
37 | #include "tensorflow/lite/kernels/internal/types.h" |
38 | #include "tensorflow/lite/kernels/kernel_util.h" |
39 | |
40 | namespace tflite { |
41 | namespace ops { |
42 | namespace builtin { |
43 | namespace sub { |
44 | |
// This file has three implementations of Sub: reference, generic optimized
// (portable, NEON-free), and NEON-optimized.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  kNeonOptimized,
};

// Indices of this op's tensors within the node's input/output arrays.
constexpr int kInputTensor1 = 0;
constexpr int kInputTensor2 = 1;
constexpr int kOutputTensor = 0;

// Per-node state computed once in Prepare() and reused by every Eval() call.
struct OpData {
  // True when input1 and input2 have different shapes, so the output must be
  // computed with broadcasting.
  bool requires_broadcast;

  // These fields are used in both the general 8-bit -> 8bit quantized path,
  // and the special 16-bit -> 16bit quantized path
  int input1_shift;
  int input2_shift;
  int32 output_activation_min;
  int32 output_activation_max;

  // These fields are used only in the general 8-bit -> 8bit quantized path
  int32 input1_multiplier;
  int32 input2_multiplier;
  int32 output_multiplier;
  int output_shift;
  int left_shift;
  int32 input1_offset;
  int32 input2_offset;
  int32 output_offset;

  // This parameter is used to indicate whether
  // parameter scale is power of two.
  // It is used in 16-bit -> 16-bit quantization.
  bool pot_scale_int16;
};
81 | |
82 | void* Init(TfLiteContext* context, const char* buffer, size_t length) { |
83 | auto* data = new OpData; |
84 | data->requires_broadcast = false; |
85 | return data; |
86 | } |
87 | |
88 | void Free(TfLiteContext* context, void* buffer) { |
89 | delete reinterpret_cast<OpData*>(buffer); |
90 | } |
91 | |
// Prepares the general quantized Sub path (uint8/int8, and int16 with
// arbitrary scales): validates zero points, derives the fixed-point
// multipliers/shifts for both inputs and the output, and computes the
// activation clamp range. Results are stored in `op_params`.
TfLiteStatus PrepareGeneralSubOp(TfLiteContext* context,
                                 const TfLiteTensor* input_1,
                                 const TfLiteTensor* input_2,
                                 TfLiteTensor* output, TfLiteSubParams* params,
                                 OpData* op_params) {
  TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 ||
                              output->type == kTfLiteInt8 ||
                              output->type == kTfLiteInt16);
  const auto& input1_quantization_params = input_1->params;
  const auto& input2_quantization_params = input_2->params;
  const auto& output_quantization_params = output->params;
  // Valid zero-point range for the output's integer representation.
  int32_t integer_type_min = 0;
  int32_t integer_type_max = 0;
  if (output->type == kTfLiteUInt8) {
    integer_type_min = std::numeric_limits<uint8_t>::min();
    integer_type_max = std::numeric_limits<uint8_t>::max();
  } else if (output->type == kTfLiteInt16) {
    integer_type_min = std::numeric_limits<int16_t>::min();
    integer_type_max = std::numeric_limits<int16_t>::max();
  } else {
    // output->type == kTfLiteInt8
    integer_type_min = std::numeric_limits<int8_t>::min();
    integer_type_max = std::numeric_limits<int8_t>::max();
  }

  // Every zero point must be representable in the integer type above.
  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input1_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 input2_quantization_params.zero_point <= integer_type_max);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point >= integer_type_min);
  TF_LITE_ENSURE(context,
                 output_quantization_params.zero_point <= integer_type_max);

  // Inputs are shifted to zero-centered values; the output offset is added
  // back after rescaling.
  op_params->input1_offset = -input1_quantization_params.zero_point;
  op_params->input2_offset = -input2_quantization_params.zero_point;
  op_params->output_offset = output_quantization_params.zero_point;

  // The shift is set to 15 in case of 16-bit and 20 in case of 8-bit,
  // accordingly. In case of 16-bit we have 65535 << 15 which is less than 1 <<
  // 31, therefore the addition will still fit in a 32 bit accumulator.
  op_params->left_shift = output->type == kTfLiteInt16 ? 15 : 20;
  // Normalize both input multipliers by twice the larger input scale so each
  // real multiplier is < 1, as required by
  // QuantizeMultiplierSmallerThanOneExp.
  const double twice_max_input_scale =
      2 * std::max(input1_quantization_params.scale,
                   input2_quantization_params.scale);
  const double real_input1_multiplier =
      input1_quantization_params.scale / twice_max_input_scale;
  const double real_input2_multiplier =
      input2_quantization_params.scale / twice_max_input_scale;
  const double real_output_multiplier =
      twice_max_input_scale /
      ((1 << op_params->left_shift) * output_quantization_params.scale);

  tflite::QuantizeMultiplierSmallerThanOneExp(real_input1_multiplier,
                                              &op_params->input1_multiplier,
                                              &op_params->input1_shift);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_input2_multiplier,
                                              &op_params->input2_multiplier,
                                              &op_params->input2_shift);
  tflite::QuantizeMultiplierSmallerThanOneExp(real_output_multiplier,
                                              &op_params->output_multiplier,
                                              &op_params->output_shift);

  // Clamp range in the output's quantized domain for the fused activation.
  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &op_params->output_activation_min,
      &op_params->output_activation_max));

  return kTfLiteOk;
}
165 | |
// Prepares the special int16 Sub path that requires symmetric quantization
// (all zero points 0) with power-of-two scales. Stores the per-input shifts
// and the activation range in `data`.
TfLiteStatus PrepareInt16SubOpPOT(TfLiteContext* context,
                                  const TfLiteTensor* input1,
                                  const TfLiteTensor* input2,
                                  TfLiteTensor* output, TfLiteSubParams* params,
                                  OpData* data) {
  // 16bit -> 16bit special quantized path, supporting only a rather
  // narrow case of quantization parameters: zero_points must all be 0
  // ("symmetric quantization") and scales must be power-of-two (which
  // we abbreviate as "POT" below). The intended use case for this path
  // is in LSTM cells, where, due to the constraints of implementing
  // some of the math in these LSTM cells in fixed-point arithmetic,
  // we need to have such symmetric, power-of-two quantization
  // (Fixed-point formats are inherently symmetric, power-of-two).
  TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
  TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

  // Each scale must be an exact power of two; CheckedLog2 returns its
  // (rounded) log2 and whether the scale was truly POT.
  int input1_scale_log2_rounded;
  bool input1_scale_is_pot =
      CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);
  TF_LITE_ENSURE(context, input1_scale_is_pot);

  int input2_scale_log2_rounded;
  bool input2_scale_is_pot =
      CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);
  TF_LITE_ENSURE(context, input2_scale_is_pot);

  int output_scale_log2_rounded;
  bool output_scale_is_pot =
      CheckedLog2(output->params.scale, &output_scale_log2_rounded);
  TF_LITE_ENSURE(context, output_scale_is_pot);

  data->input1_shift = input1_scale_log2_rounded - output_scale_log2_rounded;
  data->input2_shift = input2_scale_log2_rounded - output_scale_log2_rounded;

  // Shifting of one input is supported. The graph quantization should ensure
  // that the other input matches the output.
  TF_LITE_ENSURE(context, data->input1_shift == 0 || data->input2_shift == 0);
  // Only right shifts (downscaling) are supported.
  TF_LITE_ENSURE(context, data->input1_shift <= 0);
  TF_LITE_ENSURE(context, data->input2_shift <= 0);

  TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized(
      context, params->activation, output, &data->output_activation_min,
      &data->output_activation_max));
  return kTfLiteOk;
}
212 | |
// Validates tensor counts/types, decides between broadcasting and same-shape
// execution, selects the quantized preparation path (general rescaling vs.
// the int16 power-of-two special case), and resizes the output tensor.
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  OpData* data = reinterpret_cast<OpData*>(node->user_data);
  auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data);

  TF_LITE_ENSURE_EQ(context, NumInputs(node), 2);
  TF_LITE_ENSURE_EQ(context, NumOutputs(node), 1);

  const TfLiteTensor* input1;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor1, &input1));
  const TfLiteTensor* input2;
  TF_LITE_ENSURE_OK(context,
                    GetInputSafe(context, node, kInputTensor2, &input2));
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context,
                    GetOutputSafe(context, node, kOutputTensor, &output));

  // Both inputs must share a type; the output inherits it.
  TF_LITE_ENSURE_TYPES_EQ(context, input1->type, input2->type);
  output->type = input2->type;

  data->requires_broadcast = !HaveSameShapes(input1, input2);

  TfLiteIntArray* output_size = nullptr;
  if (data->requires_broadcast) {
    TF_LITE_ENSURE_OK(context, CalculateShapeForBroadcast(
                                   context, input1, input2, &output_size));
  } else {
    output_size = TfLiteIntArrayCopy(input1->dims);
  }

  // 8bit -> 8bit general quantized path, with general rescalings
  // as well as, 16bit -> 16bit with general rescalings

  // There are two implementations of SUB operator in case of
  // 16bit input depending on whether the scale parameter is
  // the power of 2 or not. Currently only implementation for
  // general case is used, but we need to use another implementation
  // for older versions.
  bool general_scale_int16 = false;

  bool input1_scale_is_pot = false;
  bool input2_scale_is_pot = false;
  bool output_scale_is_pot = false;

  int input1_scale_log2_rounded{0};
  int input2_scale_log2_rounded{0};
  int output_scale_log2_rounded{0};

  if (input1->type == kTfLiteInt16 && input2->type == kTfLiteInt16 &&
      output->type == kTfLiteInt16) {
    // int16 requires symmetric quantization regardless of the path chosen.
    TF_LITE_ENSURE_EQ(context, input1->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, input2->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);

    // Missing params or an unset pot_scale_int16 flag means the model was
    // built for the general-rescaling implementation.
    general_scale_int16 = !params || !params->pot_scale_int16;

    if (!general_scale_int16) {
      // Do preparation in the case of the scale parameter is power of 2.
      input1_scale_is_pot =
          CheckedLog2(input1->params.scale, &input1_scale_log2_rounded);

      input2_scale_is_pot =
          CheckedLog2(input2->params.scale, &input2_scale_log2_rounded);

      output_scale_is_pot =
          CheckedLog2(output->params.scale, &output_scale_log2_rounded);

      // Fall back to the general path if any scale is not a power of two.
      general_scale_int16 =
          !input1_scale_is_pot || !input2_scale_is_pot || !output_scale_is_pot;
    }
  }

  data->pot_scale_int16 = !general_scale_int16;

  if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 ||
      general_scale_int16) {
    TF_LITE_ENSURE_OK(context, PrepareGeneralSubOp(context, input1, input2,
                                                   output, params, data));
  } else if (output->type == kTfLiteInt16) {
    // LSTM-special case with scale parameter of POT
    TF_LITE_ENSURE_OK(context, PrepareInt16SubOpPOT(context, input1, input2,
                                                    output, params, data));
  }

  return context->ResizeTensor(context, output, output_size);
}
299 | |
300 | template <KernelType kernel_type, typename data_type> |
301 | void EvalSubImpl(TfLiteContext* context, TfLiteNode* node, |
302 | TfLiteSubParams* params, const OpData* data, |
303 | const TfLiteTensor* input1, const TfLiteTensor* input2, |
304 | bool requires_broadcast, TfLiteTensor* output) { |
305 | data_type output_activation_min, output_activation_max; |
306 | CalculateActivationRange(params->activation, &output_activation_min, |
307 | &output_activation_max); |
308 | tflite::ArithmeticParams op_params; |
309 | SetActivationParams(output_activation_min, output_activation_max, &op_params); |
310 | |
311 | switch (kernel_type) { |
312 | case kReference: |
313 | if (requires_broadcast) { |
314 | reference_ops::BroadcastSubSlow( |
315 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
316 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
317 | GetTensorShape(output), GetTensorData<data_type>(output)); |
318 | } else { |
319 | reference_ops::SubWithActivation( |
320 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
321 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
322 | GetTensorShape(output), GetTensorData<data_type>(output)); |
323 | } |
324 | break; |
325 | case kGenericOptimized: |
326 | case kNeonOptimized: |
327 | if (requires_broadcast) { |
328 | optimized_ops::BroadcastSubSlow( |
329 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
330 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
331 | GetTensorShape(output), GetTensorData<data_type>(output)); |
332 | } else { |
333 | optimized_ops::SubWithActivation( |
334 | op_params, GetTensorShape(input1), GetTensorData<data_type>(input1), |
335 | GetTensorShape(input2), GetTensorData<data_type>(input2), |
336 | GetTensorShape(output), GetTensorData<data_type>(output)); |
337 | } |
338 | break; |
339 | } |
340 | } |
341 | |
342 | template <KernelType kernel_type> |
343 | void EvalSub(TfLiteContext* context, TfLiteNode* node, TfLiteSubParams* params, |
344 | const OpData* data, const TfLiteTensor* input1, |
345 | const TfLiteTensor* input2, TfLiteTensor* output) { |
346 | const bool requires_broadcast = data->requires_broadcast; |
347 | switch (output->type) { |
348 | case kTfLiteInt32: |
349 | EvalSubImpl<kernel_type, int32_t>(context, node, params, data, input1, |
350 | input2, requires_broadcast, output); |
351 | break; |
352 | case kTfLiteFloat32: |
353 | EvalSubImpl<kernel_type, float>(context, node, params, data, input1, |
354 | input2, requires_broadcast, output); |
355 | break; |
356 | case kTfLiteInt64: |
357 | EvalSubImpl<kernel_type, int64_t>(context, node, params, data, input1, |
358 | input2, requires_broadcast, output); |
359 | break; |
360 | |
361 | default: |
362 | TF_LITE_KERNEL_LOG(context, "output type %s is not supported." , |
363 | TfLiteTypeGetName(output->type)); |
364 | } |
365 | } |
366 | |
// Quantized Sub for uint8/int8/int16. Copies the precomputed fixed-point
// parameters from `data` into an ArithmeticParams and dispatches to one of
// four paths: int8 general, int16 general, uint8 general, or the int16
// power-of-two (LSTM) special case.
template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteSubParams* params, const OpData* data,
                   const TfLiteTensor* input1, const TfLiteTensor* input2,
                   TfLiteTensor* output) {
  tflite::ArithmeticParams op_params;
  op_params.left_shift = data->left_shift;
  op_params.input1_offset = data->input1_offset;
  op_params.input1_multiplier = data->input1_multiplier;
  op_params.input1_shift = data->input1_shift;
  op_params.input2_offset = data->input2_offset;
  op_params.input2_multiplier = data->input2_multiplier;
  op_params.input2_shift = data->input2_shift;
  op_params.output_offset = data->output_offset;
  op_params.output_multiplier = data->output_multiplier;
  op_params.output_shift = data->output_shift;
  SetActivationParams(data->output_activation_min, data->output_activation_max,
                      &op_params);

  // Also fills op_params' broadcast metadata when shapes differ.
  const bool need_broadcast = optimized_ops::ProcessBroadcastShapes(
      GetTensorShape(input1), GetTensorShape(input2), &op_params);

// Expands to one namespaced sub-op invocation over the three tensors.
#define TF_LITE_SUB(type, opname, data_type)                             \
  type::opname(op_params, GetTensorShape(input1),                        \
               GetTensorData<data_type>(input1), GetTensorShape(input2), \
               GetTensorData<data_type>(input2), GetTensorShape(output), \
               GetTensorData<data_type>(output))
  if (output->type == kTfLiteInt8) {
    // int8: reference implementation only.
    if (need_broadcast) {
      TF_LITE_SUB(reference_ops, BroadcastQuantSubSlow, int8_t);
    } else {
      TF_LITE_SUB(reference_ops, Sub, int8_t);
    }
  } else if (!data->pot_scale_int16) {
    // int16 with general (non-power-of-two) rescaling.
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastQuantSubSlow, int16_t);
      } else {
        TF_LITE_SUB(reference_ops, Sub, int16_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_SUB(optimized_integer_ops, BroadcastSubDispatch, int16_t);
      } else {
        TF_LITE_SUB(optimized_integer_ops, Sub, int16_t);
      }
    }
  } else if (output->type == kTfLiteUInt8) {
    if (need_broadcast) {
      TF_LITE_SUB(reference_ops, BroadcastQuantSubSlow, uint8_t);
    } else {
      TF_LITE_SUB(reference_ops, Sub, uint8_t);
    }
  } else {
    // int16 power-of-two scale special case (see PrepareInt16SubOpPOT).
    if (kernel_type == kReference) {
      if (need_broadcast) {
        TF_LITE_SUB(reference_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(reference_ops, Sub16, int16_t);
      }
    } else {
      if (need_broadcast) {
        TF_LITE_SUB(optimized_ops, BroadcastSub16POTSlow, int16_t);
      } else {
        TF_LITE_SUB(optimized_ops, Sub16, int16_t);
      }
    }
  }
#undef TF_LITE_SUB
}
437 | |
438 | template <KernelType kernel_type> |
439 | TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { |
440 | auto* params = reinterpret_cast<TfLiteSubParams*>(node->builtin_data); |
441 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
442 | |
443 | const TfLiteTensor* input1; |
444 | TF_LITE_ENSURE_OK(context, |
445 | GetInputSafe(context, node, kInputTensor1, &input1)); |
446 | const TfLiteTensor* input2; |
447 | TF_LITE_ENSURE_OK(context, |
448 | GetInputSafe(context, node, kInputTensor2, &input2)); |
449 | TfLiteTensor* output; |
450 | TF_LITE_ENSURE_OK(context, |
451 | GetOutputSafe(context, node, kOutputTensor, &output)); |
452 | |
453 | if (output->type == kTfLiteFloat32 || output->type == kTfLiteInt32 || |
454 | output->type == kTfLiteInt64) { |
455 | EvalSub<kernel_type>(context, node, params, data, input1, input2, output); |
456 | } else if (output->type == kTfLiteUInt8 || output->type == kTfLiteInt8 || |
457 | output->type == kTfLiteInt16) { |
458 | EvalQuantized<kernel_type>(context, node, params, data, input1, input2, |
459 | output); |
460 | } else { |
461 | TF_LITE_KERNEL_LOG( |
462 | context, |
463 | "output type %d is not supported, requires float|uint8|int32 types." , |
464 | output->type); |
465 | return kTfLiteError; |
466 | } |
467 | |
468 | return kTfLiteOk; |
469 | } |
470 | |
471 | } // namespace sub |
472 | |
// Registration for the portable reference Sub kernel.
TfLiteRegistration* Register_SUB_REF() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kReference>};
  return &r;
}
478 | |
// Registration for the NEON-free optimized Sub kernel.
TfLiteRegistration* Register_SUB_GENERIC_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kGenericOptimized>};
  return &r;
}
484 | |
// Registration for the NEON-optimized Sub kernel.
TfLiteRegistration* Register_SUB_NEON_OPT() {
  static TfLiteRegistration r = {sub::Init, sub::Free, sub::Prepare,
                                 sub::Eval<sub::kNeonOptimized>};
  return &r;
}
490 | |
// Default Sub registration: picks the NEON kernel when the build target
// enables NEON, otherwise the generic optimized kernel.
TfLiteRegistration* Register_SUB() {
#ifdef USE_NEON
  return Register_SUB_NEON_OPT();
#else
  return Register_SUB_GENERIC_OPT();
#endif
}
498 | |
499 | } // namespace builtin |
500 | } // namespace ops |
501 | } // namespace tflite |
502 | |