1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | |
16 | #include "tensorflow/lite/kernels/internal/optimized/integer_ops/fully_connected.h" |
17 | |
18 | #include <algorithm> |
19 | #include <cstddef> |
#include <cstdint>
#include <cstring>
#include <vector>
21 | |
#include "ruy/profiler/instrumentation.h"  // from @ruy
#include "tensorflow/lite/c/builtin_op_data.h"
23 | #include "tensorflow/lite/c/common.h" |
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#include "tensorflow/lite/kernels/cpu_backend_threadpool.h"
25 | #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" |
26 | #include "tensorflow/lite/kernels/internal/optimized/sparse_ops/fully_connected.h" |
27 | #include "tensorflow/lite/kernels/internal/quantization_util.h" |
28 | #include "tensorflow/lite/kernels/internal/reference/fully_connected.h" |
29 | #include "tensorflow/lite/kernels/internal/reference/integer_ops/fully_connected.h" |
30 | #include "tensorflow/lite/kernels/internal/reference/reference_ops.h" |
31 | #include "tensorflow/lite/kernels/internal/reference/sparse_ops/fully_connected.h" |
32 | #include "tensorflow/lite/kernels/internal/tensor.h" |
33 | #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" |
34 | #include "tensorflow/lite/kernels/internal/tensor_utils.h" |
35 | #include "tensorflow/lite/kernels/internal/types.h" |
36 | #include "tensorflow/lite/kernels/kernel_util.h" |
37 | |
38 | namespace tflite { |
39 | namespace ops { |
40 | namespace builtin { |
41 | namespace fully_connected { |
42 | |
43 | namespace { |
44 | bool SupportedSparsityFormat(const TfLiteSparsity& sparsity) { |
45 | if (sparsity.dim_metadata[0].format == kTfLiteDimDense && |
46 | sparsity.dim_metadata[1].format == kTfLiteDimSparseCSR) { |
47 | return true; |
48 | } |
49 | |
50 | return false; |
51 | } |
52 | |
53 | static const int kDimMetadataSizeRandomSparse = 2; |
54 | static const int kDimMetadataSizeBlockSparse = 3; |
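// The "ledger" produced below is a compact uint8 encoding of the CSR metadata
// in dim_metadata[1]: for every (block-)row it stores the number of non-zero
// blocks followed by their column indices. For example, with
// array_segments = [0, 2, 3] and array_indices = [1, 4, 0] the ledger is
// [2, 1, 4, 1, 0], i.e. indices->size + segments->size - 1 entries, which is
// exactly the size CreateLedgerTensor allocates.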
55 | |
56 | TfLiteStatus CreateLedgerTensor(const TfLiteSparsity* sparsity, |
57 | TfLiteContext* context, TfLiteTensor* ledger) { |
58 | TF_LITE_ENSURE(context, sparsity != nullptr); |
  ledger->name = "FC_ledger";
60 | ledger->type = kTfLiteUInt8; |
61 | ledger->allocation_type = kTfLiteArenaRwPersistent; |
62 | TfLiteIntArray* ledger_size = TfLiteIntArrayCreate(1); |
63 | ledger_size->data[0] = sparsity->dim_metadata[1].array_indices->size + |
64 | sparsity->dim_metadata[1].array_segments->size - 1; |
65 | return context->ResizeTensor(context, ledger, ledger_size); |
66 | } |
67 | |
68 | TfLiteStatus PopulateLedgerData(const TfLiteSparsity* sparsity, |
69 | TfLiteContext* context, uint8_t* ledger_data) { |
70 | TF_LITE_ENSURE(context, sparsity != nullptr); |
71 | const auto* array_segments = sparsity->dim_metadata[1].array_segments; |
72 | const auto* array_indices = sparsity->dim_metadata[1].array_indices; |
73 | int output_data_ptr = 0; |
74 | |
75 | for (int i = 0; i < array_segments->size - 1; i++) { |
76 | int row_start = array_segments->data[i]; |
77 | int row_end = array_segments->data[i + 1]; |
78 | if (row_end - row_start > UINT8_MAX) { |
79 | return kTfLiteError; |
80 | } |
81 | // Copy num of non-zero blocks in row i. |
82 | ledger_data[output_data_ptr] = static_cast<uint8_t>(row_end - row_start); |
83 | output_data_ptr++; |
84 | |
85 | for (int j = row_start; j < row_end; j++) { |
86 | if (array_indices->data[j] > UINT8_MAX) { |
87 | return kTfLiteError; |
88 | } |
89 | // Copy indices of non-zero blocks in row i. |
90 | ledger_data[output_data_ptr] = |
91 | static_cast<uint8_t>(array_indices->data[j]); |
92 | output_data_ptr++; |
93 | } |
94 | } |
95 | return kTfLiteOk; |
96 | } |
97 | |
98 | } // namespace |
99 | |
// This file has three implementations of FullyConnected.
101 | enum KernelType { |
102 | kReference, |
103 | kGenericOptimized, |
104 | kLegacyPie, // Legacy path used by the PIE team and related clients. |
105 | }; |
106 | |
107 | struct OpData { |
108 | // The scaling factor from input to output (aka the 'real multiplier') can |
109 | // be represented as a fixed point multiplier plus a left shift. |
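  // For example, a real multiplier of 0.375 is stored as
  // output_multiplier = round(0.75 * 2^31) = 1610612736 with
  // output_shift = -1, since 0.375 = 0.75 * 2^-1.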
110 | int32_t output_multiplier; |
111 | int output_shift; |
112 | // Per channel output multiplier and shift. |
113 | std::vector<int32_t> per_channel_output_multiplier; |
114 | std::vector<int> per_channel_output_shift; |
115 | // The range of the fused activation layer. For example for kNone and |
116 | // uint8_t these would be 0 and 255. |
117 | int32_t output_activation_min; |
118 | int32_t output_activation_max; |
119 | // The index of the temporary tensor where the quantized inputs are cached. |
120 | int scratch_tensor_index; |
121 | bool compute_row_sums = false; |
122 | // Only used for sparse hybrid fully connected kernels. |
123 | bool ledger_initialized; |
124 | }; |
125 | |
126 | constexpr int kInputTensor = 0; |
127 | constexpr int kWeightsTensor = 1; |
128 | constexpr int kBiasTensor = 2; |
129 | constexpr int kOutputTensor = 0; |
130 | constexpr int kShuffledInputWorkspaceTensor = 1; |
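// Supported type combinations, enforced by CheckTypes() below:
//   float:     float32 input/filter/output, optional float32 bias.
//   quantized: uint8/int8/int16 input and output, uint8/int8 filter,
//              optional int32/int64 bias.
//   hybrid:    float32 input/output, uint8/int8 filter, optional float32 bias.
//   shuffled:  uint8 input/filter, int16 output, optional int32/int64 bias.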
131 | |
132 | inline TfLiteStatus CheckTypes(TfLiteContext* context, |
133 | const TfLiteTensor* input, |
134 | const TfLiteTensor* filter, |
135 | const TfLiteTensor* bias, TfLiteTensor* output, |
136 | TfLiteFullyConnectedParams* params) { |
137 | const bool is_quantized = |
138 | ((filter->type == kTfLiteUInt8) || (filter->type == kTfLiteInt8)); |
139 | const bool is_hybrid = is_quantized && (input->type == kTfLiteFloat32); |
140 | const bool is_shuffled = |
141 | is_quantized && (params->weights_format == |
142 | kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8); |
143 | |
144 | // optional bias tensor. |
145 | const bool is_optional_bias_float = !bias || (bias->type == kTfLiteFloat32); |
146 | const bool is_optional_bias_int = |
147 | !bias || (bias->type == kTfLiteInt32) || (bias->type == kTfLiteInt64); |
148 | |
149 | if (is_quantized) { |
150 | if (is_shuffled) { |
151 | TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteUInt8); |
152 | TF_LITE_ENSURE_TYPES_EQ(context, filter->type, kTfLiteUInt8); |
153 | TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteInt16); |
154 | TF_LITE_ENSURE_EQ(context, is_optional_bias_int, true); |
155 | } else if (is_hybrid) { |
156 | TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); |
157 | TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); |
158 | TF_LITE_ENSURE_EQ(context, is_optional_bias_float, true); |
159 | } else { |
160 | TF_LITE_ENSURE(context, input->type == kTfLiteUInt8 || |
161 | input->type == kTfLiteInt8 || |
162 | input->type == kTfLiteInt16); |
163 | TF_LITE_ENSURE(context, output->type == kTfLiteUInt8 || |
164 | output->type == kTfLiteInt8 || |
165 | output->type == kTfLiteInt16); |
166 | TF_LITE_ENSURE_EQ(context, is_optional_bias_int, true); |
167 | } |
168 | } else { |
169 | // Only float32 is supported currently |
170 | TF_LITE_ENSURE_TYPES_EQ(context, input->type, kTfLiteFloat32); |
171 | TF_LITE_ENSURE_TYPES_EQ(context, output->type, kTfLiteFloat32); |
172 | TF_LITE_ENSURE_TYPES_EQ(context, filter->type, kTfLiteFloat32); |
173 | TF_LITE_ENSURE_EQ(context, is_optional_bias_float, true); |
174 | } |
175 | |
176 | return kTfLiteOk; |
177 | } |
178 | |
179 | void* Init(TfLiteContext* context, const char* buffer, size_t length) { |
180 | // This is a builtin op, so we don't use the contents in 'buffer', if any. |
181 | // Instead, we allocate a new object to carry information from Prepare() to |
182 | // Eval(). |
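  // Six temporary tensor slots are reserved up front: the hybrid path uses
  // the first five (quantized input, scaling factors, accumulator scratch,
  // input offsets, row sums) and the sparse hybrid path additionally uses a
  // sixth for the filter ledger; see PrepareImpl().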
183 | auto* op_data = new OpData(); |
184 | context->AddTensors(context, /*tensors_to_add=*/6, |
185 | &op_data->scratch_tensor_index); |
186 | return op_data; |
187 | } |
188 | |
189 | void Free(TfLiteContext* context, void* buffer) { |
190 | delete reinterpret_cast<OpData*>(buffer); |
191 | } |
192 | |
193 | TfLiteStatus PrepareImpl(TfLiteContext* context, TfLiteNode* node) { |
194 | auto* params = |
195 | reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data); |
196 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
197 | // Check we have all the inputs and outputs we need. |
198 | TF_LITE_ENSURE(context, node->inputs->size == 2 || node->inputs->size == 3); |
199 | // Shuffled formats need a workspace to store the shuffled input activations. |
200 | const int expected_outputs_count = |
201 | params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault ? 1 |
202 | : 2; |
203 | TF_LITE_ENSURE_EQ(context, node->outputs->size, expected_outputs_count); |
204 | |
205 | const TfLiteTensor* input; |
206 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); |
207 | const TfLiteTensor* filter; |
208 | TF_LITE_ENSURE_OK(context, |
209 | GetInputSafe(context, node, kWeightsTensor, &filter)); |
210 | const TfLiteTensor* bias = |
211 | (node->inputs->size == 3) |
212 | ? GetOptionalInputTensor(context, node, kBiasTensor) |
213 | : nullptr; |
214 | TfLiteTensor* output; |
215 | TF_LITE_ENSURE_OK(context, |
216 | GetOutputSafe(context, node, kOutputTensor, &output)); |
217 | |
  // Check that the data types of the input, weights, bias and output tensors
  // form a supported combination.
219 | TF_LITE_ENSURE_STATUS( |
220 | CheckTypes(context, input, filter, bias, output, params)); |
221 | |
  // Check that all the tensor parameters are consistent with one another and
  // with the input configuration.
224 | int input_size = 1; |
225 | for (int i = 0; i < input->dims->size; i++) { |
226 | input_size *= input->dims->data[i]; |
227 | } |
228 | |
229 | TF_LITE_ENSURE_EQ(context, NumDimensions(filter), 2); |
230 | |
231 | // When the second dimension size of the filter tensor is 0, we need to |
232 | // generate the output shape early to avoid dividing by 0. |
233 | if (filter->dims->data[1] == 0) { |
234 | TfLiteIntArray* output_size_array; |
235 | if (params->keep_num_dims) { |
236 | output_size_array = TfLiteIntArrayCopy(input->dims); |
237 | output_size_array->data[output_size_array->size - 1] = |
238 | filter->dims->data[0]; |
239 | } else { |
240 | output_size_array = TfLiteIntArrayCreate(2); |
241 | // If `keep_num_dims` is false, we need to flatten the output tensor to |
242 | // have rank 2. |
243 | int batch_size = 1; |
244 | for (int i = 0; i < input->dims->size - 1; ++i) |
245 | batch_size *= input->dims->data[i]; |
246 | output_size_array->data[0] = batch_size; |
247 | output_size_array->data[1] = filter->dims->data[0]; |
248 | } |
249 | TF_LITE_ENSURE_OK( |
250 | context, context->ResizeTensor(context, output, output_size_array)); |
251 | return kTfLiteOk; |
252 | } |
253 | |
254 | const int batch_size = input_size / filter->dims->data[1]; |
255 | const int num_units = filter->dims->data[0]; |
256 | |
257 | if (bias) { |
258 | TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); |
259 | } |
260 | |
261 | // Note that quantized inference requires that all tensors have their |
262 | // parameters set. This is usually done during quantized training. |
263 | if (input->type == kTfLiteUInt8 || input->type == kTfLiteInt8 || |
264 | input->type == kTfLiteInt16) { |
265 | // Populate scalar quantization parameters. |
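    // The real multiplier is input_scale * filter_scale / output_scale;
    // QuantizeMultiplier() decomposes it into a Q31 fixed-point significand
    // and a power-of-two exponent.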
266 | double real_multiplier = 0.0; |
267 | TF_LITE_ENSURE_STATUS(GetQuantizedConvolutionMultipler( |
268 | context, input, filter, bias, output, &real_multiplier)); |
269 | int exponent; |
270 | QuantizeMultiplier(real_multiplier, &data->output_multiplier, &exponent); |
271 | data->output_shift = exponent; |
272 | |
273 | // Populate per-channel quantization parameters, if per-channel |
274 | // quantization. |
275 | TF_LITE_ENSURE_EQ(context, input->quantization.type, |
276 | kTfLiteAffineQuantization); |
277 | TF_LITE_ENSURE_EQ(context, filter->quantization.type, |
278 | kTfLiteAffineQuantization); |
279 | const auto* affine_quantization = |
280 | reinterpret_cast<TfLiteAffineQuantization*>( |
281 | filter->quantization.params); |
282 | TF_LITE_ENSURE(context, affine_quantization); |
283 | TF_LITE_ENSURE(context, affine_quantization->scale); |
284 | const int per_channel_quantization_size = affine_quantization->scale->size; |
285 | const bool is_per_channel = per_channel_quantization_size > 1; |
286 | if (is_per_channel) { |
      // Currently only int8/int16 inputs are supported for per-channel
      // quantization.
288 | TF_LITE_ENSURE(context, |
289 | input->type == kTfLiteInt8 || input->type == kTfLiteInt16); |
290 | TF_LITE_ENSURE_EQ(context, filter->type, kTfLiteInt8); |
291 | TF_LITE_ENSURE_EQ(context, affine_quantization->scale->size, |
292 | per_channel_quantization_size); |
293 | TF_LITE_ENSURE_EQ( |
294 | context, per_channel_quantization_size, |
295 | filter->dims->data[affine_quantization->quantized_dimension]); |
296 | // Populate multiplier and shift using affine quantization. |
297 | const float input_scale = input->params.scale; |
298 | const float output_scale = output->params.scale; |
299 | const float* filter_scales = affine_quantization->scale->data; |
300 | data->per_channel_output_multiplier.resize(per_channel_quantization_size); |
301 | data->per_channel_output_shift.resize(per_channel_quantization_size); |
302 | int32_t* per_channel_multiplier = |
303 | data->per_channel_output_multiplier.data(); |
304 | int32_t* per_channel_shift = data->per_channel_output_shift.data(); |
305 | for (int i = 0; i < per_channel_quantization_size; ++i) { |
306 | const float scale = filter_scales[i]; |
307 | const double filter_scale = static_cast<double>(scale); |
308 | const double effective_output_scale = static_cast<double>(input_scale) * |
309 | filter_scale / |
310 | static_cast<double>(output_scale); |
311 | int32_t significand; |
312 | int channel_shift; |
313 | QuantizeMultiplier(effective_output_scale, &significand, |
314 | &channel_shift); |
315 | per_channel_multiplier[i] = significand; |
316 | per_channel_shift[i] = channel_shift; |
317 | } |
318 | } |
319 | |
320 | TF_LITE_ENSURE_STATUS(CalculateActivationRangeQuantized( |
321 | context, params->activation, output, &data->output_activation_min, |
322 | &data->output_activation_max)); |
323 | } |
324 | |
325 | if (input->type == kTfLiteInt16 && output->type == kTfLiteInt16) { |
326 | TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); |
327 | TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); |
328 | } |
329 | |
330 | // If we have to perform on-the-fly quantization (with quantized weights and |
331 | // float inputs) first we need to quantize the inputs. Allocate a temporary |
332 | // buffer to store the intermediate quantized values. |
333 | // Additionally, we allocate a temporary buffer to store the accumulated |
334 | // quantized values prior to multiplication by the scaling factor. |
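  // Temporaries used by the hybrid path: [0] quantized input,
  // [1] per-batch scaling factors, [2] int32 accumulator scratch,
  // [3] per-batch input zero points, [4] filter row sums, and, when the
  // weights are sparse, [5] the filter ledger.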
335 | const bool is_hybrid = |
336 | (input->type == kTfLiteFloat32 && |
337 | (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8)); |
338 | const bool is_sparse = filter->sparsity != nullptr; |
339 | if (is_hybrid) { |
340 | TfLiteIntArrayFree(node->temporaries); |
341 | data->compute_row_sums = true; |
342 | if (is_sparse) { |
343 | node->temporaries = TfLiteIntArrayCreate(6); |
344 | } else { |
345 | node->temporaries = TfLiteIntArrayCreate(5); |
346 | } |
347 | node->temporaries->data[0] = data->scratch_tensor_index; |
348 | |
349 | TfLiteTensor* input_quantized; |
350 | TF_LITE_ENSURE_OK(context, GetTemporarySafe(context, node, /*index=*/0, |
351 | &input_quantized)); |
352 | input_quantized->type = filter->type; |
353 | input_quantized->allocation_type = kTfLiteArenaRw; |
354 | |
355 | TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); |
356 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, |
357 | input_quantized_size)); |
358 | |
359 | node->temporaries->data[1] = data->scratch_tensor_index + 1; |
360 | TfLiteTensor* scaling_factors; |
361 | TF_LITE_ENSURE_OK(context, GetTemporarySafe(context, node, /*index=*/1, |
362 | &scaling_factors)); |
363 | scaling_factors->type = kTfLiteFloat32; |
364 | scaling_factors->allocation_type = kTfLiteArenaRw; |
365 | |
366 | int scaling_dims[1] = {batch_size}; |
367 | if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) { |
368 | TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); |
369 | scaling_factors_size->data[0] = batch_size; |
370 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, |
371 | scaling_factors_size)); |
372 | } |
373 | |
374 | node->temporaries->data[2] = data->scratch_tensor_index + 2; |
375 | TfLiteTensor* accum_scratch; |
376 | TF_LITE_ENSURE_OK( |
377 | context, GetTemporarySafe(context, node, /*index=*/2, &accum_scratch)); |
378 | accum_scratch->type = kTfLiteInt32; |
379 | accum_scratch->allocation_type = kTfLiteArenaRw; |
380 | int accum_scratch_dims[2] = {num_units, batch_size}; |
381 | if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2, |
382 | accum_scratch_dims)) { |
383 | TfLiteIntArray* accum_size = TfLiteIntArrayCreate(2); |
384 | accum_size->data[0] = num_units; |
385 | accum_size->data[1] = batch_size; |
386 | TF_LITE_ENSURE_OK( |
387 | context, context->ResizeTensor(context, accum_scratch, accum_size)); |
388 | } |
389 | |
390 | node->temporaries->data[3] = data->scratch_tensor_index + 3; |
391 | TfLiteTensor* input_offsets; |
392 | TF_LITE_ENSURE_OK( |
393 | context, GetTemporarySafe(context, node, /*index=*/3, &input_offsets)); |
394 | input_offsets->type = kTfLiteInt32; |
395 | input_offsets->allocation_type = kTfLiteArenaRw; |
396 | if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1, scaling_dims)) { |
397 | TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1); |
398 | input_offsets_size->data[0] = batch_size; |
399 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets, |
400 | input_offsets_size)); |
401 | } |
402 | node->temporaries->data[4] = data->scratch_tensor_index + 4; |
403 | TfLiteTensor* row_sums; |
404 | TF_LITE_ENSURE_OK(context, |
405 | GetTemporarySafe(context, node, /*index=*/4, &row_sums)); |
406 | row_sums->type = kTfLiteInt32; |
407 | row_sums->allocation_type = kTfLiteArenaRwPersistent; |
408 | int row_sums_dims[1] = {num_units}; |
409 | if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) { |
410 | TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1); |
411 | row_sums_size->data[0] = row_sums_dims[0]; |
412 | TF_LITE_ENSURE_OK( |
413 | context, context->ResizeTensor(context, row_sums, row_sums_size)); |
414 | } |
415 | |
416 | if (is_sparse) { |
417 | data->ledger_initialized = false; |
418 | node->temporaries->data[5] = data->scratch_tensor_index + 5; |
419 | TfLiteTensor* filter_ledger = |
420 | &context->tensors[node->temporaries->data[5]]; |
421 | auto status = |
422 | CreateLedgerTensor(filter->sparsity, context, filter_ledger); |
423 | if (status != kTfLiteOk) return status; |
424 | } |
425 | } |
426 | |
427 | // Resize output. |
428 | TfLiteIntArray* output_size_array = nullptr; |
429 | if (params->keep_num_dims) { |
    // When the number of dimensions is kept, the filter operates along the
    // last dimension. In other words, for an input tensor with shape
    // [batch_size, ..., n_inputs] and a filter of shape [n_units, n_inputs]
    // this Op produces an output of shape [batch_size, ..., n_units].
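    // For example, an input of shape [2, 3, 10] with a [4, 10] filter
    // produces a [2, 3, 4] output here, and a [6, 4] output in the flattened
    // case below.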
434 | TF_LITE_ENSURE_EQ(context, input->dims->data[input->dims->size - 1], |
435 | SizeOfDimension(filter, 1)); |
436 | output_size_array = TfLiteIntArrayCopy(input->dims); |
437 | output_size_array->data[output_size_array->size - 1] = num_units; |
438 | } else { |
439 | // Otherwise, the output is (potentially flattened to) a 2-D matrix. |
440 | output_size_array = TfLiteIntArrayCreate(2); |
441 | output_size_array->data[0] = batch_size; |
442 | output_size_array->data[1] = num_units; |
443 | } |
444 | TF_LITE_ENSURE_OK(context, |
445 | context->ResizeTensor(context, output, output_size_array)); |
446 | |
447 | return kTfLiteOk; |
448 | } |
449 | |
450 | template <KernelType kernel_type> |
451 | TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { |
452 | // Check for supported activation types. |
453 | auto* params = |
454 | reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data); |
455 | const TfLiteTensor* filter; |
456 | TF_LITE_ENSURE_OK(context, |
457 | GetInputSafe(context, node, kWeightsTensor, &filter)); |
458 | const TfLiteTensor* input; |
459 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); |
460 | const bool is_quantized = |
461 | ((filter->type == kTfLiteUInt8) || (filter->type == kTfLiteInt8)); |
462 | const bool is_hybrid = is_quantized && (input->type == kTfLiteFloat32); |
463 | const bool is_pie = kernel_type == kLegacyPie; |
464 | |
  // The Pie and hybrid paths support all kinds of fused activations; otherwise
  // only clipping activations are supported.
467 | if (!is_pie && !is_hybrid) { |
468 | TF_LITE_ENSURE(context, params->activation == kTfLiteActNone || |
469 | params->activation == kTfLiteActRelu || |
470 | params->activation == kTfLiteActReluN1To1 || |
471 | params->activation == kTfLiteActRelu6); |
472 | } |
473 | return PrepareImpl(context, node); |
474 | } |
475 | |
476 | TfLiteStatus EvalPie(TfLiteContext* context, TfLiteNode* node, |
477 | TfLiteFullyConnectedParams* params, OpData* data, |
478 | const TfLiteTensor* input, const TfLiteTensor* filter, |
479 | const TfLiteTensor* bias, TfLiteTensor* output) { |
480 | int total_input_size = 1; |
481 | for (int i = 0; i < input->dims->size; i++) { |
482 | total_input_size *= input->dims->data[i]; |
483 | } |
484 | |
485 | int input_size = filter->dims->data[1]; |
486 | const int batch_size = total_input_size / filter->dims->data[1]; |
487 | const int num_units = filter->dims->data[0]; |
488 | |
489 | // Output = bias if bias tensor exists. |
490 | if (bias) { |
491 | tensor_utils::VectorBatchVectorAssign(GetTensorData<float>(bias), num_units, |
492 | batch_size, |
493 | GetTensorData<float>(output)); |
494 | } else { |
495 | std::fill_n(GetTensorData<float>(output), batch_size * num_units, 0.0f); |
496 | } |
497 | |
498 | // Compute output += weight * input |
499 | tensor_utils::MatrixBatchVectorMultiplyAccumulate( |
500 | GetTensorData<float>(filter), num_units, input_size, |
501 | GetTensorData<float>(input), batch_size, GetTensorData<float>(output)); |
502 | |
503 | // Apply activation function |
504 | tensor_utils::ApplyActivationToVector( |
505 | GetTensorData<float>(output), batch_size * num_units, params->activation, |
506 | GetTensorData<float>(output)); |
507 | |
508 | return kTfLiteOk; |
509 | } |
510 | |
511 | TfLiteStatus EvalHybridDense( |
512 | TfLiteContext* context, TfLiteNode* node, |
513 | TfLiteFullyConnectedParams* params, OpData* data, const TfLiteTensor* input, |
514 | const TfLiteTensor* filter, const TfLiteTensor* bias, |
515 | TfLiteTensor* input_quantized, TfLiteTensor* scaling_factors, |
516 | TfLiteTensor* accum_scratch, TfLiteTensor* row_sums, |
517 | TfLiteTensor* input_offsets, TfLiteTensor* output) { |
518 | int total_input_size = 1; |
519 | for (int i = 0; i < input->dims->size; i++) { |
520 | total_input_size *= input->dims->data[i]; |
521 | } |
522 | |
523 | const int input_size = filter->dims->data[1]; |
524 | const int batch_size = total_input_size / filter->dims->data[1]; |
525 | const int num_units = filter->dims->data[0]; |
526 | |
527 | // Output = bias if bias tensor exists. |
528 | if (bias) { |
529 | tensor_utils::VectorBatchVectorAssign(GetTensorData<float>(bias), num_units, |
530 | batch_size, |
531 | GetTensorData<float>(output)); |
532 | } else { |
533 | std::fill_n(GetTensorData<float>(output), batch_size * num_units, 0.0f); |
534 | } |
535 | |
536 | // Save matrix multiplication computation for all zero input. |
537 | if (tensor_utils::IsZeroVector(GetTensorData<float>(input), |
538 | total_input_size)) { |
539 | tensor_utils::ApplyActivationToVector( |
540 | GetTensorData<float>(output), batch_size * num_units, |
541 | params->activation, GetTensorData<float>(output)); |
542 | return kTfLiteOk; |
543 | } |
544 | |
  // Quantize the float input to int8 and compute quantization params (scaling
  // factors).
546 | float* scaling_factors_ptr = GetTensorData<float>(scaling_factors); |
547 | int32_t* input_offset_ptr = nullptr; |
548 | int32_t* row_sums_ptr = nullptr; |
549 | if (params->asymmetric_quantize_inputs) { |
550 | input_offset_ptr = GetTensorData<int32_t>(input_offsets); |
551 | row_sums_ptr = GetTensorData<int32_t>(row_sums); |
552 | } |
553 | int8_t* quant_data = GetTensorData<int8_t>(input_quantized); |
554 | const int8_t* filter_data = GetTensorData<int8_t>(filter); |
555 | const float* input_ptr = GetTensorData<float>(input); |
556 | tensor_utils::BatchQuantizeFloats( |
557 | input_ptr, batch_size, input_size, quant_data, scaling_factors_ptr, |
558 | input_offset_ptr, params->asymmetric_quantize_inputs); |
559 | for (int b = 0; b < batch_size; ++b) { |
560 | // Incorporate scaling of the filter. |
561 | scaling_factors_ptr[b] *= filter->params.scale; |
562 | } |
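  // Each scaling factor now carries both the input and the filter scale, so
  // the int32 accumulators produced by the matmul below can be turned back
  // into floats with a single per-batch multiply.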
563 | |
564 | // Compute output += weight * quantized_input |
565 | int32_t* scratch = GetTensorData<int32_t>(accum_scratch); |
566 | tensor_utils::MatrixBatchVectorMultiplyAccumulate( |
567 | filter_data, num_units, input_size, quant_data, scaling_factors_ptr, |
568 | batch_size, GetTensorData<float>(output), /*per_channel_scale=*/nullptr, |
569 | input_offset_ptr, scratch, row_sums_ptr, &data->compute_row_sums, |
570 | CpuBackendContext::GetFromContext(context)); |
571 | |
572 | // Apply activation function to floats. |
573 | tensor_utils::ApplyActivationToVector( |
574 | GetTensorData<float>(output), batch_size * num_units, params->activation, |
575 | GetTensorData<float>(output)); |
576 | return kTfLiteOk; |
577 | } |
578 | |
579 | void EvalSparseHybridImpl(TfLiteContext* context, TfLiteNode* node, |
580 | TfLiteFullyConnectedParams* params, OpData* data, |
581 | const TfLiteTensor* input, const TfLiteTensor* filter, |
582 | const TfLiteTensor* bias, int thread_start, |
583 | int thread_end, TfLiteTensor* input_quantized, |
584 | TfLiteTensor* scaling_factors, |
585 | TfLiteTensor* accum_scratch, TfLiteTensor* row_sums, |
586 | TfLiteTensor* input_offsets, TfLiteTensor* output) { |
  ruy::profiler::ScopeLabel label("FullyConnected");
  ruy::profiler::ScopeLabel inner_label("Sparse Hybrid Kernel");
589 | const auto& input_shape = GetTensorShape(input); |
590 | const auto& output_shape = GetTensorShape(output); |
591 | const auto& filter_shape = GetTensorShape(filter); |
592 | const int input_dims_count = input_shape.DimensionsCount(); |
593 | const int output_dims_count = output_shape.DimensionsCount(); |
594 | const int filter_dims_count = filter_shape.DimensionsCount(); |
595 | const int batch_size = thread_end - thread_start; |
596 | const int input_depth = MatchingDim(filter_shape, filter_dims_count - 1, |
597 | input_shape, input_dims_count - 1); |
598 | const int output_depth = MatchingDim(filter_shape, filter_dims_count - 2, |
599 | output_shape, output_dims_count - 1); |
600 | const int per_thread_input_size = batch_size * input_depth; |
601 | |
602 | const float* per_thread_input = |
603 | GetTensorData<float>(input) + thread_start * input_depth; |
604 | float* per_thread_output = |
605 | GetTensorData<float>(output) + thread_start * output_depth; |
606 | |
607 | // Output = bias if bias tensor exists. |
608 | if (bias) { |
609 | tensor_utils::VectorBatchVectorAssign(GetTensorData<float>(bias), |
610 | output_depth, batch_size, |
611 | per_thread_output); |
612 | } else { |
613 | std::fill_n(per_thread_output, batch_size * output_depth, 0.0f); |
614 | } |
615 | |
616 | // Save matrix multiplication computation for all zero input. |
617 | if (tensor_utils::IsZeroVector(per_thread_input, per_thread_input_size)) { |
618 | tensor_utils::ApplyActivationToVector( |
619 | per_thread_output, batch_size * output_depth, params->activation, |
620 | per_thread_output); |
621 | return; |
622 | } |
623 | |
  // Quantize the float input to int8 and compute quantization params (scaling
  // factors).
625 | float* scaling_factors_ptr = |
626 | GetTensorData<float>(scaling_factors) + thread_start; |
627 | int32_t* input_offset_ptr = nullptr; |
628 | int32_t* row_sums_ptr = nullptr; |
629 | if (params->asymmetric_quantize_inputs) { |
630 | input_offset_ptr = GetTensorData<int32_t>(input_offsets) + thread_start; |
631 | row_sums_ptr = GetTensorData<int32_t>(row_sums); |
632 | } |
633 | int8_t* quant_data = |
634 | GetTensorData<int8_t>(input_quantized) + thread_start * input_depth; |
635 | tensor_utils::BatchQuantizeFloats(per_thread_input, batch_size, input_depth, |
636 | quant_data, scaling_factors_ptr, |
637 | input_offset_ptr, |
638 | params->asymmetric_quantize_inputs); |
639 | for (int b = 0; b < batch_size; ++b) { |
640 | // Incorporate scaling of the filter. |
641 | scaling_factors_ptr[b] *= filter->params.scale; |
642 | } |
643 | |
644 | if (params->asymmetric_quantize_inputs) { |
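    // Asymmetric quantization approximates x ~= scale * (q - zero_point). The
    // sparse matmul below only accumulates scale * q * w, so the missing
    // -scale * zero_point * sum_of_row_weights term is applied here using the
    // precomputed row sums.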
645 | float* per_thread_output_ptr = per_thread_output; |
646 | for (int b = 0; b < batch_size; ++b) { |
647 | const float scaled_zp = scaling_factors_ptr[b] * input_offset_ptr[b]; |
648 | for (int row = 0; row < output_depth; ++row) { |
649 | *per_thread_output_ptr++ -= scaled_zp * row_sums_ptr[row]; |
650 | } |
651 | } |
652 | } |
653 | |
654 | // Compute output += weight * quantized_input |
655 | TfLiteTensor* filter_ledger = &context->tensors[node->temporaries->data[5]]; |
656 | tensor_utils::SparseMatrixBatchVectorMultiplyAccumulate( |
657 | GetTensorData<int8_t>(filter), GetTensorData<uint8_t>(filter_ledger), |
658 | output_depth, input_depth, quant_data, scaling_factors_ptr, batch_size, |
659 | per_thread_output); |
660 | |
661 | // Apply activation function to floats. |
662 | tensor_utils::ApplyActivationToVector(per_thread_output, |
663 | batch_size * output_depth, |
664 | params->activation, per_thread_output); |
665 | } |
666 | |
667 | struct SparseHybridFullyConnectedTask : cpu_backend_threadpool::Task { |
668 | SparseHybridFullyConnectedTask( |
669 | TfLiteContext* context, TfLiteNode* node, |
670 | TfLiteFullyConnectedParams* params, OpData* data, |
671 | const TfLiteTensor* input, const TfLiteTensor* filter, |
672 | const TfLiteTensor* bias, const int thread_start, const int thread_end, |
673 | TfLiteTensor* input_quantized, TfLiteTensor* scaling_factors, |
674 | TfLiteTensor* accum_scratch, TfLiteTensor* row_sums, |
675 | TfLiteTensor* input_offsets, TfLiteTensor* output) |
676 | : context(context), |
677 | node(node), |
678 | params(params), |
679 | data(data), |
680 | input(input), |
681 | filter(filter), |
682 | bias(bias), |
683 | thread_start(thread_start), |
684 | thread_end(thread_end), |
685 | input_quantized(input_quantized), |
686 | scaling_factors(scaling_factors), |
687 | accum_scratch(accum_scratch), |
688 | row_sums(row_sums), |
689 | input_offsets(input_offsets), |
690 | output(output) {} |
691 | |
692 | void Run() override { |
693 | EvalSparseHybridImpl(context, node, params, data, input, filter, bias, |
694 | thread_start, thread_end, input_quantized, |
695 | scaling_factors, accum_scratch, row_sums, |
696 | input_offsets, output); |
697 | } |
698 | |
699 | private: |
700 | TfLiteContext* context; |
701 | TfLiteNode* node; |
702 | TfLiteFullyConnectedParams* params; |
703 | OpData* data; |
704 | const TfLiteTensor* input; |
705 | const TfLiteTensor* filter; |
706 | const TfLiteTensor* bias; |
707 | const int thread_start; |
708 | const int thread_end; |
709 | TfLiteTensor* input_quantized; |
710 | TfLiteTensor* scaling_factors; |
711 | TfLiteTensor* accum_scratch; |
712 | TfLiteTensor* row_sums; |
713 | TfLiteTensor* input_offsets; |
714 | TfLiteTensor* output; |
715 | }; |
716 | |
717 | TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node, |
718 | TfLiteFullyConnectedParams* params, OpData* data, |
719 | const TfLiteTensor* input, const TfLiteTensor* filter, |
720 | const TfLiteTensor* bias, TfLiteTensor* input_quantized, |
721 | TfLiteTensor* scaling_factors, |
722 | TfLiteTensor* accum_scratch, TfLiteTensor* row_sums, |
723 | TfLiteTensor* input_offsets, TfLiteTensor* output) { |
724 | const auto& output_shape = GetTensorShape(output); |
725 | CpuBackendContext* cpu_backend_context = |
726 | CpuBackendContext::GetFromContext(context); |
727 | const bool is_dense = filter->sparsity == nullptr; |
728 | if (is_dense) { |
729 | return EvalHybridDense(context, node, params, data, input, filter, bias, |
730 | input_quantized, scaling_factors, accum_scratch, |
731 | row_sums, input_offsets, output); |
732 | } |
733 | |
734 | TfLiteTensor* filter_ledger = &context->tensors[node->temporaries->data[5]]; |
735 | if (!data->ledger_initialized) { |
736 | PopulateLedgerData(filter->sparsity, context, |
737 | GetTensorData<uint8_t>(filter_ledger)); |
738 | data->ledger_initialized = true; |
739 | } |
740 | |
  // The multi-threaded kernel slices the workload along the batch dimension.
  // If there are not enough batches, the number of threads used equals the
  // batch size.
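  // For example, 10 batches on 4 threads are split 3/3/2/2.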
744 | // TODO(b/173442777): If needed, we can improve this later with slicing along |
745 | // the row dimension of the weight. |
746 | const int max_threads = cpu_backend_context->max_num_threads(); |
747 | const int batches = |
748 | FlatSizeSkipDim(output_shape, output_shape.DimensionsCount() - 1); |
749 | const int thread_count = std::max(1, std::min(batches, max_threads)); |
750 | if (params->asymmetric_quantize_inputs && data->compute_row_sums) { |
751 | // Precompute row sums. |
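    // Each non-zero block in the ledger covers kBlockSize contiguous int8
    // weights, so a row's sum is accumulated by walking its blocks.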
752 | static const int kBlockSize = 16; |
753 | const uint8_t* ledger_ptr = GetTensorData<uint8_t>(filter_ledger); |
754 | const int8_t* row_ptr = GetTensorData<int8_t>(filter); |
755 | const int output_depth = filter->dims->data[0]; |
756 | int32_t* row_sums_ptr = GetTensorData<int32_t>(row_sums); |
757 | for (int row = 0; row < output_depth; ++row) { |
758 | int32_t row_sum = 0; |
759 | int num_nonzero_blocks = *ledger_ptr++; |
760 | for (int i = 0; i < num_nonzero_blocks; ++i, ++ledger_ptr) { |
761 | for (int c = 0; c < kBlockSize; c++) { |
762 | row_sum += (*row_ptr++); |
763 | } |
764 | } |
765 | row_sums_ptr[row] = row_sum; |
766 | } |
767 | data->compute_row_sums = false; |
768 | } |
769 | std::vector<SparseHybridFullyConnectedTask> tasks; |
770 | tasks.reserve(thread_count); |
771 | int thread_start = 0; |
772 | for (int i = 0; i < thread_count; ++i) { |
773 | // This makes sure the workload is relatively balanced when batches is not |
774 | // a multiple of thread_count. The first mod(batches, thread_count) tasks |
775 | // need to process one more batch than the rest. |
776 | int thread_end = thread_start + batches / thread_count; |
777 | if (i < batches % thread_count) thread_end++; |
778 | |
779 | tasks.emplace_back(context, node, params, data, input, filter, bias, |
780 | thread_start, thread_end, input_quantized, |
781 | scaling_factors, accum_scratch, row_sums, input_offsets, |
782 | output); |
783 | thread_start = thread_end; |
784 | } |
785 | cpu_backend_threadpool::Execute(tasks.size(), tasks.data(), |
786 | cpu_backend_context); |
787 | return kTfLiteOk; |
788 | } |
789 | |
790 | namespace { |
791 | template <KernelType kernel_type> |
792 | void FullyConnectedInt8(const OpData* data, const TfLiteTensor* input, |
793 | const TfLiteTensor* filter, const TfLiteTensor* bias, |
794 | TfLiteTensor* output, |
795 | CpuBackendContext* cpu_backend_context) { |
796 | FullyConnectedParams op_params; |
797 | op_params.input_offset = -input->params.zero_point; |
798 | op_params.weights_offset = -filter->params.zero_point; |
799 | op_params.output_offset = output->params.zero_point; |
800 | op_params.output_multiplier = data->output_multiplier; |
801 | op_params.output_shift = data->output_shift; |
802 | op_params.quantized_activation_min = data->output_activation_min; |
803 | op_params.quantized_activation_max = data->output_activation_max; |
804 | op_params.lhs_cacheable = IsConstantTensor(filter); |
805 | op_params.rhs_cacheable = IsConstantTensor(input); |
806 | if (kernel_type == kReference) { |
807 | reference_integer_ops::FullyConnected( |
808 | op_params, GetTensorShape(input), GetTensorData<int8_t>(input), |
809 | GetTensorShape(filter), GetTensorData<int8_t>(filter), |
810 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
811 | GetTensorShape(output), GetTensorData<int8_t>(output)); |
812 | } else { |
813 | optimized_integer_ops::FullyConnected( |
814 | op_params, GetTensorShape(input), GetTensorData<int8_t>(input), |
815 | GetTensorShape(filter), GetTensorData<int8_t>(filter), |
816 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
817 | GetTensorShape(output), GetTensorData<int8_t>(output), |
818 | cpu_backend_context); |
819 | } |
820 | } |
821 | |
822 | template <KernelType kernel_type> |
823 | void FullyConnectedInt16(const OpData* data, const TfLiteTensor* input, |
824 | const TfLiteTensor* filter, const TfLiteTensor* bias, |
825 | TfLiteTensor* output) { |
826 | FullyConnectedParams op_params; |
827 | op_params.weights_offset = -filter->params.zero_point; |
828 | op_params.output_multiplier = data->output_multiplier; |
829 | op_params.output_shift = data->output_shift; |
830 | op_params.quantized_activation_min = data->output_activation_min; |
831 | op_params.quantized_activation_max = data->output_activation_max; |
832 | if (bias && bias->type == kTfLiteInt64) { |
833 | reference_integer_ops::FullyConnected( |
834 | op_params, GetTensorShape(input), GetTensorData<int16_t>(input), |
835 | GetTensorShape(filter), GetTensorData<int8_t>(filter), |
836 | GetTensorShape(bias), GetTensorData<int64_t>(bias), |
837 | GetTensorShape(output), GetTensorData<int16_t>(output)); |
838 | } else { |
839 | reference_integer_ops::FullyConnected( |
840 | op_params, GetTensorShape(input), GetTensorData<int16_t>(input), |
841 | GetTensorShape(filter), GetTensorData<int8_t>(filter), |
842 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
843 | GetTensorShape(output), GetTensorData<int16_t>(output)); |
844 | } |
845 | } |
846 | |
847 | template <KernelType kernel_type> |
848 | void FullyConnectedPerChannelInt8(const OpData* data, const TfLiteTensor* input, |
849 | const TfLiteTensor* filter, |
850 | const TfLiteTensor* bias, |
851 | TfLiteTensor* output, |
852 | CpuBackendContext* cpu_backend_context) { |
  // The FullyConnectedPerChannel op spec requires symmetric weights, so
  // op_params.weights_offset is not set (filter.params.zero_point is not
  // used); it is always assumed to be 0.
856 | FullyConnectedParams op_params; |
857 | op_params.input_offset = -input->params.zero_point; |
858 | op_params.output_offset = output->params.zero_point; |
859 | op_params.quantized_activation_min = data->output_activation_min; |
860 | op_params.quantized_activation_max = data->output_activation_max; |
861 | op_params.lhs_cacheable = IsConstantTensor(filter); |
862 | op_params.rhs_cacheable = IsConstantTensor(input); |
863 | if (kernel_type == kReference) { |
864 | reference_integer_ops::FullyConnectedPerChannel( |
865 | op_params, data->per_channel_output_multiplier.data(), |
866 | data->per_channel_output_shift.data(), GetTensorShape(input), |
867 | GetTensorData<int8_t>(input), GetTensorShape(filter), |
868 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
869 | GetTensorData<int32_t>(bias), GetTensorShape(output), |
870 | GetTensorData<int8_t>(output)); |
871 | } else { |
872 | optimized_integer_ops::FullyConnectedPerChannel( |
873 | op_params, data->per_channel_output_multiplier.data(), |
874 | data->per_channel_output_shift.data(), GetTensorShape(input), |
875 | GetTensorData<int8_t>(input), GetTensorShape(filter), |
876 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
877 | GetTensorData<int32_t>(bias), GetTensorShape(output), |
878 | GetTensorData<int8_t>(output), cpu_backend_context); |
879 | } |
880 | } |
881 | |
882 | template <KernelType kernel_type> |
883 | void FullyConnectedPerChannelInt16(const OpData* data, |
884 | const TfLiteTensor* input, |
885 | const TfLiteTensor* filter, |
886 | const TfLiteTensor* bias, |
887 | TfLiteTensor* output) { |
  // The FullyConnectedPerChannel op spec requires symmetric weights, so
  // op_params.weights_offset is not set (filter.params.zero_point is not
  // used); it is always assumed to be 0.
891 | FullyConnectedParams op_params; |
892 | op_params.quantized_activation_min = data->output_activation_min; |
893 | op_params.quantized_activation_max = data->output_activation_max; |
894 | if (bias && bias->type == kTfLiteInt64) { |
895 | reference_integer_ops::FullyConnectedPerChannel( |
896 | op_params, data->per_channel_output_multiplier.data(), |
897 | data->per_channel_output_shift.data(), GetTensorShape(input), |
898 | GetTensorData<int16_t>(input), GetTensorShape(filter), |
899 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
900 | GetTensorData<int64_t>(bias), GetTensorShape(output), |
901 | GetTensorData<int16_t>(output)); |
902 | } else { |
903 | reference_integer_ops::FullyConnectedPerChannel( |
904 | op_params, data->per_channel_output_multiplier.data(), |
905 | data->per_channel_output_shift.data(), GetTensorShape(input), |
906 | GetTensorData<int16_t>(input), GetTensorShape(filter), |
907 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
908 | GetTensorData<int32_t>(bias), GetTensorShape(output), |
909 | GetTensorData<int16_t>(output)); |
910 | } |
911 | } |
912 | |
913 | } // namespace |
914 | |
915 | // Verifies that sparsity values are valid given input/weight/output. |
916 | bool VerifySparsity(const RuntimeShape& weights_shape, |
917 | const RuntimeShape& input_shape, |
918 | const RuntimeShape& output_shape, |
919 | const TfLiteSparsity* sparsity) { |
920 | const int weights_dims_count = weights_shape.DimensionsCount(); |
921 | const int output_dims_count = output_shape.DimensionsCount(); |
922 | const int w0_size = sparsity->dim_metadata[0].dense_size; |
923 | const int accum_depth = weights_shape.Dims(weights_dims_count - 1); |
924 | const int output_elements = output_shape.FlatSize(); |
925 | const int input_elements = input_shape.FlatSize(); |
926 | const int batches = FlatSizeSkipDim(output_shape, output_dims_count - 1); |
927 | const int output_depth = MatchingDim(weights_shape, weights_dims_count - 2, |
928 | output_shape, output_dims_count - 1); |
929 | const int max_batch_index = batches - 1; |
930 | const int max_output = max_batch_index * output_depth + w0_size; |
931 | const int max_batch_depth = accum_depth * max_batch_index; |
932 | |
933 | // Verify output size is enough. |
934 | if (output_elements < max_output) return false; |
935 | |
936 | // Verify index from sparse in input is valid. |
937 | for (int i = 0; i < sparsity->dim_metadata[1].array_indices->size; ++i) { |
938 | if (input_elements <= |
939 | max_batch_depth + sparsity->dim_metadata[1].array_indices->data[i]) |
940 | return false; |
941 | } |
942 | return true; |
943 | } |
944 | |
945 | template <KernelType kernel_type> |
946 | TfLiteStatus EvalQuantized(TfLiteContext* context, TfLiteNode* node, |
947 | TfLiteFullyConnectedParams* params, OpData* data, |
948 | const TfLiteTensor* input, |
949 | const TfLiteTensor* filter, const TfLiteTensor* bias, |
950 | TfLiteTensor* output) { |
951 | const bool is_per_channel = data->per_channel_output_multiplier.size() > 1; |
952 | int32_t input_offset = -input->params.zero_point; |
953 | int32_t filter_offset = -filter->params.zero_point; |
954 | int32_t output_offset = output->params.zero_point; |
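  // Note the sign convention: input/weights offsets are the negated zero
  // points, so the kernels can compute (value + offset) rather than
  // (value - zero_point).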
955 | // Only the Pie path supports quantized models and float inputs/outputs. |
956 | if (input->type == kTfLiteFloat32) { |
957 | TfLiteTensor* input_quantized; |
958 | TF_LITE_ENSURE_OK(context, GetTemporarySafe(context, node, /*index=*/0, |
959 | &input_quantized)); |
960 | TfLiteTensor* scaling_factors; |
961 | TF_LITE_ENSURE_OK(context, GetTemporarySafe(context, node, /*index=*/1, |
962 | &scaling_factors)); |
963 | TfLiteTensor* accum_scratch; |
964 | TF_LITE_ENSURE_OK( |
965 | context, GetTemporarySafe(context, node, /*index=*/2, &accum_scratch)); |
966 | TfLiteTensor* input_offsets; |
967 | TF_LITE_ENSURE_OK( |
968 | context, GetTemporarySafe(context, node, /*index=*/3, &input_offsets)); |
969 | TfLiteTensor* row_sums; |
970 | TF_LITE_ENSURE_OK(context, |
971 | GetTemporarySafe(context, node, /*index=*/4, &row_sums)); |
972 | return EvalHybrid(context, node, params, data, input, filter, bias, |
973 | input_quantized, scaling_factors, accum_scratch, row_sums, |
974 | input_offsets, output); |
975 | } else { |
976 | FullyConnectedParams op_params; |
977 | op_params.input_offset = input_offset; |
978 | op_params.weights_offset = filter_offset; |
979 | op_params.output_offset = output_offset; |
980 | op_params.output_multiplier = data->output_multiplier; |
981 | op_params.output_shift = data->output_shift; |
982 | op_params.quantized_activation_min = data->output_activation_min; |
983 | op_params.quantized_activation_max = data->output_activation_max; |
984 | op_params.lhs_cacheable = IsConstantTensor(filter); |
985 | op_params.rhs_cacheable = IsConstantTensor(input); |
986 | switch (output->type) { |
987 | case kTfLiteUInt8: |
988 | if (kernel_type == kReference) { |
989 | reference_ops::FullyConnected( |
990 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
991 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
992 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
993 | GetTensorShape(output), GetTensorData<uint8_t>(output)); |
994 | } else { |
995 | optimized_ops::FullyConnected( |
996 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
997 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
998 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
999 | GetTensorShape(output), GetTensorData<uint8_t>(output), |
1000 | CpuBackendContext::GetFromContext(context)); |
1001 | } |
1002 | break; |
1003 | case kTfLiteInt8: |
1004 | if (filter->sparsity != nullptr) { |
1005 | const TfLiteSparsity& sparsity = *filter->sparsity; |
1006 | const auto input_shape = GetTensorShape(input); |
1007 | const auto filter_shape = GetTensorShape(filter); |
1008 | const auto output_shape = GetTensorShape(output); |
1009 | const auto bias_shape = GetTensorShape(bias); |
1010 | if (filter_offset != 0) { |
1011 | TF_LITE_KERNEL_LOG(context, |
1012 | "Quantized and sparse fully-connected format " |
                               "supports symmetric weight quantization only.");
1014 | return kTfLiteError; |
1015 | } |
1016 | if (!SupportedSparsityFormat(sparsity) || |
1017 | !VerifySparsity(filter_shape, input_shape, output_shape, |
1018 | &sparsity)) { |
1019 | TF_LITE_KERNEL_LOG( |
1020 | context, |
                "Invalid quantized and sparse fully-connected format.");
1022 | return kTfLiteError; |
1023 | } |
1024 | if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse && |
1025 | sparsity.dim_metadata[2].dense_size == 16) { |
1026 | // Block sparse with block size of 1x16. |
1027 | optimized_ops::FullyConnectedSparseWeight1x16( |
1028 | sparsity, op_params, input_shape, GetTensorData<int8_t>(input), |
1029 | filter_shape, GetTensorData<int8_t>(filter), bias_shape, |
1030 | GetTensorData<int32_t>(bias), output_shape, |
1031 | GetTensorData<int8_t>(output), |
1032 | CpuBackendContext::GetFromContext(context)); |
1033 | } else { |
1034 | TF_LITE_KERNEL_LOG( |
                context, "Unsupported sparse fully-connected weight format.");
1036 | return kTfLiteError; |
1037 | } |
1038 | } else { |
1039 | is_per_channel ? FullyConnectedPerChannelInt8<kernel_type>( |
1040 | data, input, filter, bias, output, |
1041 | CpuBackendContext::GetFromContext(context)) |
1042 | : FullyConnectedInt8<kernel_type>( |
1043 | data, input, filter, bias, output, |
1044 | CpuBackendContext::GetFromContext(context)); |
1045 | } |
1046 | break; |
1047 | case kTfLiteInt16: |
1048 | if (input->type == kTfLiteInt16) { |
          // To avoid 32-bit accumulator overflow, the RUY-based optimized
          // path is enabled only when all zero points are 0.
1051 | bool has_non_zero_point = input->params.zero_point || |
1052 | filter->params.zero_point || |
1053 | output->params.zero_point; |
1054 | if (kernel_type == kReference || has_non_zero_point || |
1055 | (bias && bias->type == kTfLiteInt64)) { |
1056 | is_per_channel ? FullyConnectedPerChannelInt16<kernel_type>( |
1057 | data, input, filter, bias, output) |
1058 | : FullyConnectedInt16<kernel_type>( |
1059 | data, input, filter, bias, output); |
1060 | } else { |
1061 | is_per_channel |
1062 | ? optimized_integer_ops::FullyConnectedPerChannel( |
1063 | op_params, data->per_channel_output_multiplier.data(), |
1064 | data->per_channel_output_shift.data(), |
1065 | GetTensorShape(input), GetTensorData<int16_t>(input), |
1066 | GetTensorShape(filter), GetTensorData<int8_t>(filter), |
1067 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
1068 | GetTensorShape(output), GetTensorData<int16_t>(output), |
1069 | CpuBackendContext::GetFromContext(context)) |
1070 | : optimized_integer_ops::FullyConnected( |
1071 | op_params, GetTensorShape(input), |
1072 | GetTensorData<int16_t>(input), GetTensorShape(filter), |
1073 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
1074 | GetTensorData<int32_t>(bias), GetTensorShape(output), |
1075 | GetTensorData<int16_t>(output), |
1076 | CpuBackendContext::GetFromContext(context)); |
1077 | } |
1078 | } else if (kernel_type == kReference) { |
1079 | reference_ops::FullyConnected( |
1080 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
1081 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
1082 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
1083 | GetTensorShape(output), GetTensorData<int16_t>(output)); |
1084 | } else { |
1085 | optimized_ops::FullyConnected( |
1086 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
1087 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
1088 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
1089 | GetTensorShape(output), GetTensorData<int16_t>(output), |
1090 | CpuBackendContext::GetFromContext(context)); |
1091 | } |
1092 | break; |
1093 | default: |
1094 | TF_LITE_KERNEL_LOG(context, |
1095 | "Quantized FullyConnected expects output data " |
                           "type uint8, int8 or int16");
1097 | return kTfLiteError; |
1098 | } |
1099 | } |
1100 | |
1101 | return kTfLiteOk; |
1102 | } |
1103 | |
1104 | template <KernelType kernel_type> |
1105 | TfLiteStatus EvalShuffledQuantized(TfLiteContext* context, TfLiteNode* node, |
1106 | TfLiteFullyConnectedParams* params, |
1107 | OpData* data, const TfLiteTensor* input, |
1108 | const TfLiteTensor* filter, |
1109 | const TfLiteTensor* bias, |
1110 | TfLiteTensor* output, |
1111 | TfLiteTensor* shuffled_input_workspace) { |
1112 | // TODO(b/110697972) decide more consistently if / how / where we want |
1113 | // to perform this kind of runtime data type checks. |
1114 | if (shuffled_input_workspace->type != kTfLiteUInt8) { |
    TF_LITE_KERNEL_LOG(context, "Unexpected data type");
1116 | return kTfLiteError; |
1117 | } |
1118 | |
1119 | #define TF_LITE_SHUFFLED_FULLY_CONNECTED(type) \ |
1120 | { \ |
1121 | type::ShuffledFullyConnected( \ |
1122 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), \ |
1123 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), \ |
1124 | GetTensorShape(bias), GetTensorData<int32_t>(bias), \ |
1125 | GetTensorShape(output), GetTensorData<int16_t>(output), \ |
1126 | GetTensorData<uint8_t>(shuffled_input_workspace), \ |
1127 | CpuBackendContext::GetFromContext(context)); \ |
1128 | } |
1129 | FullyConnectedParams op_params; |
1130 | op_params.output_multiplier = data->output_multiplier; |
1131 | op_params.output_shift = data->output_shift; |
1132 | op_params.quantized_activation_min = data->output_activation_min; |
1133 | op_params.quantized_activation_max = data->output_activation_max; |
1134 | op_params.lhs_cacheable = IsConstantTensor(filter); |
1135 | op_params.rhs_cacheable = IsConstantTensor(input); |
1136 | if (kernel_type == kReference) { |
1137 | reference_ops::ShuffledFullyConnected( |
1138 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
1139 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
1140 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
1141 | GetTensorShape(output), GetTensorData<int16_t>(output), |
1142 | GetTensorData<uint8_t>(shuffled_input_workspace)); |
1143 | } else { |
1144 | optimized_ops::ShuffledFullyConnected( |
1145 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
1146 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
1147 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
1148 | GetTensorShape(output), GetTensorData<int16_t>(output), |
1149 | GetTensorData<uint8_t>(shuffled_input_workspace), |
1150 | CpuBackendContext::GetFromContext(context)); |
1151 | } |
1152 | #undef TF_LITE_SHUFFLED_FULLY_CONNECTED |
1153 | |
1154 | return kTfLiteOk; |
1155 | } |
1156 | |
1157 | template <KernelType kernel_type> |
1158 | TfLiteStatus EvalFloat(TfLiteContext* context, TfLiteNode* node, |
1159 | TfLiteFullyConnectedParams* params, OpData* data, |
1160 | const TfLiteTensor* input, const TfLiteTensor* filter, |
1161 | const TfLiteTensor* bias, TfLiteTensor* output) { |
1162 | float output_activation_min, output_activation_max; |
1163 | CalculateActivationRange(params->activation, &output_activation_min, |
1164 | &output_activation_max); |
1165 | if (kernel_type == kReference) { |
1166 | FullyConnectedParams op_params; |
1167 | op_params.float_activation_min = output_activation_min; |
1168 | op_params.float_activation_max = output_activation_max; |
1169 | if (filter->sparsity != nullptr) { |
1170 | const auto& sparsity = *filter->sparsity; |
1171 | reference_ops::FullyConnectedSparseWeight( |
1172 | sparsity, op_params, GetTensorShape(input), |
1173 | GetTensorData<float>(input), GetTensorShape(filter), |
1174 | GetTensorData<float>(filter), GetTensorShape(bias), |
1175 | GetTensorData<float>(bias), GetTensorShape(output), |
1176 | GetTensorData<float>(output)); |
1177 | } else { |
1178 | reference_ops::FullyConnected( |
1179 | op_params, GetTensorShape(input), GetTensorData<float>(input), |
1180 | GetTensorShape(filter), GetTensorData<float>(filter), |
1181 | GetTensorShape(bias), GetTensorData<float>(bias), |
1182 | GetTensorShape(output), GetTensorData<float>(output)); |
1183 | } |
1184 | } else if (kernel_type == kLegacyPie) { |
1185 | return EvalPie(context, node, params, data, input, filter, bias, output); |
1186 | } else { |
1187 | FullyConnectedParams op_params; |
1188 | op_params.float_activation_min = output_activation_min; |
1189 | op_params.float_activation_max = output_activation_max; |
1190 | if (filter->sparsity != nullptr) { |
1191 | const auto& sparsity = *filter->sparsity; |
1192 | if (!SupportedSparsityFormat(sparsity)) { |
1193 | TF_LITE_KERNEL_LOG(context, |
                           "Unsupported sparse fully-connected weight format.");
1195 | return kTfLiteError; |
1196 | } |
1197 | const auto& input_shape = GetTensorShape(input); |
1198 | const auto& filter_shape = GetTensorShape(filter); |
1199 | const auto& output_shape = GetTensorShape(output); |
1200 | const auto& bias_shape = GetTensorShape(bias); |
1201 | if (!VerifySparsity(filter_shape, input_shape, output_shape, &sparsity)) { |
        TF_LITE_KERNEL_LOG(context, "Invalid sparse fully-connected format.");
1203 | return kTfLiteError; |
1204 | } |
1205 | |
1206 | if (sparsity.dim_metadata_size == kDimMetadataSizeRandomSparse) { |
1207 | // Random sparse. |
1208 | optimized_ops::FullyConnectedSparseWeight( |
1209 | sparsity, op_params, // Disable formatting |
1210 | input_shape, GetTensorData<float>(input), // Disable formatting |
1211 | filter_shape, GetTensorData<float>(filter), // Disable formatting |
1212 | bias_shape, GetTensorData<float>(bias), // Disable formatting |
1213 | output_shape, GetTensorData<float>(output)); |
1214 | } else if (sparsity.dim_metadata_size == kDimMetadataSizeBlockSparse && |
1215 | sparsity.dim_metadata[2].dense_size == 4) { |
1216 | // Block sparse with block size of 1x4. |
1217 | optimized_ops::FullyConnectedSparseWeight1x4( |
1218 | sparsity, op_params, // Disable formatting |
1219 | input_shape, GetTensorData<float>(input), // Disable formatting |
1220 | filter_shape, GetTensorData<float>(filter), // Disable formatting |
1221 | bias_shape, GetTensorData<float>(bias), // Disable formatting |
1222 | output_shape, GetTensorData<float>(output), |
1223 | CpuBackendContext::GetFromContext(context)); |
1224 | } else { |
1225 | TF_LITE_KERNEL_LOG(context, |
                           "Unsupported sparse fully-connected weight format.");
1227 | return kTfLiteError; |
1228 | } |
1229 | |
1230 | } else { |
1231 | op_params.lhs_cacheable = IsConstantTensor(filter); |
1232 | op_params.rhs_cacheable = IsConstantTensor(input); |
1233 | optimized_ops::FullyConnected( |
1234 | op_params, GetTensorShape(input), GetTensorData<float>(input), |
1235 | GetTensorShape(filter), GetTensorData<float>(filter), |
1236 | GetTensorShape(bias), GetTensorData<float>(bias), |
1237 | GetTensorShape(output), GetTensorData<float>(output), |
1238 | CpuBackendContext::GetFromContext(context)); |
1239 | } |
1240 | } |
1241 | |
1242 | return kTfLiteOk; |
1243 | } |
1244 | |
1245 | template <KernelType kernel_type> |
1246 | TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { |
1247 | auto* params = |
1248 | reinterpret_cast<TfLiteFullyConnectedParams*>(node->builtin_data); |
1249 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
1250 | |
1251 | const TfLiteTensor* input; |
1252 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, kInputTensor, &input)); |
1253 | const TfLiteTensor* filter; |
1254 | TF_LITE_ENSURE_OK(context, |
1255 | GetInputSafe(context, node, kWeightsTensor, &filter)); |
1256 | const TfLiteTensor* bias = |
1257 | (node->inputs->size == 3) |
1258 | ? GetOptionalInputTensor(context, node, kBiasTensor) |
1259 | : nullptr; |
1260 | TfLiteTensor* output; |
1261 | TF_LITE_ENSURE_OK(context, |
1262 | GetOutputSafe(context, node, kOutputTensor, &output)); |
1263 | // Do nothing if expected output is empty. |
1264 | if (NumElements(output) == 0) { |
1265 | return kTfLiteOk; |
1266 | } |
1267 | |
1268 | if (filter->dims->data[1] == 0) { |
1269 | memset(output->data.data, 0, output->bytes); |
1270 | return kTfLiteOk; |
1271 | } |
1272 | |
1273 | switch (filter->type) { |
1274 | case kTfLiteFloat32: |
1275 | return EvalFloat<kernel_type>(context, node, params, data, input, filter, |
1276 | bias, output); |
1277 | case kTfLiteUInt8: |
1278 | if (params->weights_format == |
1279 | kTfLiteFullyConnectedWeightsFormatShuffled4x16Int8) { |
1280 | TfLiteTensor* shuffled_input_workspace; |
1281 | TF_LITE_ENSURE_OK( |
1282 | context, GetOutputSafe(context, node, kShuffledInputWorkspaceTensor, |
1283 | &shuffled_input_workspace)); |
1284 | return EvalShuffledQuantized<kernel_type>(context, node, params, data, |
1285 | input, filter, bias, output, |
1286 | shuffled_input_workspace); |
1287 | } else if (params->weights_format == |
1288 | kTfLiteFullyConnectedWeightsFormatDefault) { |
1289 | return EvalQuantized<kernel_type>(context, node, params, data, input, |
1290 | filter, bias, output); |
1291 | } else { |
        TF_LITE_KERNEL_LOG(context,
                           "Unhandled fully-connected weights format");
1293 | return kTfLiteError; |
1294 | } |
1295 | case kTfLiteInt8: |
1296 | if (params->weights_format == kTfLiteFullyConnectedWeightsFormatDefault) { |
1297 | return EvalQuantized<kernel_type>(context, node, params, data, input, |
1298 | filter, bias, output); |
1299 | } else { |
        TF_LITE_KERNEL_LOG(context,
                           "Unhandled fully-connected weights format");
1301 | return kTfLiteError; |
1302 | } |
1303 | default: |
1304 | TF_LITE_KERNEL_LOG(context, |
                         "Filter data type %s currently not supported.",
1306 | TfLiteTypeGetName(filter->type)); |
1307 | return kTfLiteError; |
1308 | } |
1309 | return kTfLiteOk; |
1310 | } |
1311 | |
1312 | } // namespace fully_connected |
1313 | |
1314 | TfLiteRegistration* Register_FULLY_CONNECTED_REF() { |
1315 | static TfLiteRegistration r = { |
1316 | fully_connected::Init, fully_connected::Free, |
1317 | fully_connected::Prepare<fully_connected::kReference>, |
1318 | fully_connected::Eval<fully_connected::kReference>}; |
1319 | return &r; |
1320 | } |
1321 | |
1322 | TfLiteRegistration* Register_FULLY_CONNECTED_GENERIC_OPT() { |
1323 | static TfLiteRegistration r = { |
1324 | fully_connected::Init, fully_connected::Free, |
1325 | fully_connected::Prepare<fully_connected::kGenericOptimized>, |
1326 | fully_connected::Eval<fully_connected::kGenericOptimized>}; |
1327 | return &r; |
1328 | } |
1329 | |
1330 | // Legacy path for PIE clients. |
1331 | TfLiteRegistration* Register_FULLY_CONNECTED_PIE() { |
1332 | static TfLiteRegistration r = { |
1333 | fully_connected::Init, fully_connected::Free, |
1334 | fully_connected::Prepare<fully_connected::kLegacyPie>, |
1335 | fully_connected::Eval<fully_connected::kLegacyPie>}; |
1336 | return &r; |
1337 | } |
1338 | |
1339 | TfLiteRegistration* Register_FULLY_CONNECTED() { |
1340 | return Register_FULLY_CONNECTED_GENERIC_OPT(); |
1341 | } |
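// A minimal usage sketch (not part of this kernel): clients normally get this
// op through the builtin resolver, but a specific variant can be registered
// explicitly. Assumes the standard MutableOpResolver API and schema headers.
//
//   tflite::MutableOpResolver resolver;
//   resolver.AddBuiltin(BuiltinOperator_FULLY_CONNECTED,
//                       tflite::ops::builtin::Register_FULLY_CONNECTED_REF());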
1342 | |
1343 | } // namespace builtin |
1344 | } // namespace ops |
1345 | } // namespace tflite |
1346 | |