1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h" |
16 | |
17 | #include <stddef.h> |
18 | |
19 | #include <cstdint> |
20 | #include <vector> |
21 | |
22 | // Only use multi-threaded Eigen if ruy is disabled. |
23 | #if !defined(TFLITE_WITH_RUY) |
24 | #define TFLITE_WITH_MULTITHREADED_EIGEN |
25 | #endif |
26 | |
27 | #include "tensorflow/lite/c/builtin_op_data.h" |
28 | #include "tensorflow/lite/c/common.h" |
29 | #include "tensorflow/lite/kernels/cpu_backend_context.h" |
30 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
31 | #include "tensorflow/lite/kernels/eigen_support.h" |
32 | #endif |
33 | #include "tensorflow/lite/kernels/internal/compatibility.h" |
34 | #include "tensorflow/lite/kernels/internal/types.h" |
35 | // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h |
36 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
37 | #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h" |
38 | #endif |
39 | #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" |
40 | #include "tensorflow/lite/kernels/internal/quantization_util.h" |
41 | #include "tensorflow/lite/kernels/internal/reference/conv.h" |
42 | #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h" |
43 | #include "tensorflow/lite/kernels/internal/tensor.h" |
44 | #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" |
45 | #include "tensorflow/lite/kernels/internal/tensor_utils.h" |
46 | #include "tensorflow/lite/kernels/kernel_util.h" |
47 | #include "tensorflow/lite/kernels/padding.h" |
48 | #include "tensorflow/lite/util.h" |
49 | |
50 | namespace tflite { |
51 | namespace ops { |
52 | namespace builtin { |
53 | namespace conv { |
54 | |
// This file has 4 implementations of Conv.
56 | enum KernelType { |
57 | kReference, |
58 | kGenericOptimized, // Neon-free |
59 | // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads |
60 | // are available and kGenericOptimized when we must use only one thread. |
61 | kMultithreadOptimized, |
  // The kernel uses the CBLAS interface for matrix multiplication.
  // It's fast when an optimized CBLAS implementation is available (e.g. the
  // Apple Accelerate framework), and slow when falling back to a naive
  // implementation.
66 | kCblasOptimized, |
67 | }; |
68 | |
69 | const int kTensorNotAllocated = -1; |
70 | |
71 | static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024; // 1GB |
72 | |
73 | struct OpData { |
74 | // IDs are the arbitrary identifiers used by TF Lite to identify and access |
75 | // memory buffers. |
76 | int im2col_id = kTensorNotAllocated; |
77 | int hwcn_weights_id = kTensorNotAllocated; |
78 | int input_quantized_id = kTensorNotAllocated; |
79 | int scaling_factors_id = kTensorNotAllocated; |
80 | int input_offset_id = kTensorNotAllocated; |
81 | int accum_scratch_id = kTensorNotAllocated; |
82 | // Row sums are used to cache filter sums for hybrid zero-point calculations. |
83 | int row_sums_id = kTensorNotAllocated; |
84 | |
85 | TfLitePaddingValues padding; |
86 | // The scaling factor from input to output (aka the 'real multiplier') can |
87 | // be represented as a fixed point multiplier plus a left shift. |
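  // Illustrative example (assuming the usual normalization of the multiplier
  // into [0.5, 1)): a real multiplier of 0.75 = input_scale * filter_scale /
  // output_scale would be stored roughly as output_multiplier =
  // round(0.75 * 2^31) and output_shift = 0, since 0.75 already lies in
  // [0.5, 1).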
88 | int32_t output_multiplier; |
89 | int output_shift; |
90 | |
91 | // Per channel output multiplier and shift. |
92 | std::vector<int32_t> per_channel_output_multiplier; |
93 | std::vector<int> per_channel_output_shift; |
94 | |
95 | // The range of the fused activation layer. For example for kNone and |
96 | // uint8_t these would be 0 and 255. |
97 | int32_t output_activation_min; |
98 | int32_t output_activation_max; |
  // Indexes are the offsets into the node's temporaries array that is used to
  // keep track of the allocated temporary tensors.
101 | int32_t im2col_index; |
102 | int32_t hwcn_weights_index; |
103 | int32_t input_quantized_index; |
104 | int32_t scaling_factors_index; |
105 | int32_t accum_scratch_index; |
106 | int32_t input_offset_index; |
107 | int32_t row_sums_index; |
108 | |
109 | bool need_hwcn_weights = false; |
110 | bool have_weights_been_transposed = false; |
111 | bool need_im2col = false; |
  // If true, im2col is needed but gets disabled because the temporary im2col
  // tensor requires too much memory (i.e. >= kMaxIm2colBufferSizeMobile).
115 | bool im2col_oversized = false; |
116 | |
117 | bool supports_multithreaded_kernel = false; |
118 | bool is_hybrid_per_channel = false; |
119 | bool compute_hybrid_row_sums = true; |
120 | |
121 | // Number of convolution groups. |
122 | int32_t groups = 1; |
123 | }; |
124 | |
125 | inline PaddingType RuntimePaddingType(TfLitePadding padding) { |
126 | switch (padding) { |
127 | case TfLitePadding::kTfLitePaddingSame: |
128 | return PaddingType::kSame; |
129 | case TfLitePadding::kTfLitePaddingValid: |
130 | return PaddingType::kValid; |
131 | case TfLitePadding::kTfLitePaddingUnknown: |
132 | default: |
133 | return PaddingType::kNone; |
134 | } |
135 | } |
136 | |
137 | void* Init(TfLiteContext* context, const char* buffer, size_t length) { |
138 | // This is a builtin op, so we don't use the contents in 'buffer', if any. |
139 | // Instead, we allocate a new object to use as scratch space for im2col, and |
140 | // to carry information from Prepare() to Eval(). |
141 | auto* data = new OpData; |
142 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
143 | eigen_support::IncrementUsageCounter(context); |
144 | #endif |
145 | return data; |
146 | } |
147 | |
148 | void Free(TfLiteContext* context, void* buffer) { |
149 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
150 | eigen_support::DecrementUsageCounter(context); |
151 | #endif |
152 | delete reinterpret_cast<OpData*>(buffer); |
153 | } |
154 | |
155 | // Naive implementation of transpose for floats. Could be optimized to be more |
156 | // cache friendly, but for now it's a one-time cost on first run, and we would |
157 | // prefer to remove the need to do this at all eventually. |
158 | void TransposeFloatTensor(const TfLiteTensor* input, TfLiteTensor* output) { |
159 | const int rows = output->dims->data[1]; |
160 | const int cols = output->dims->data[0]; |
161 | const float* input_data = GetTensorData<float>(input); |
162 | float* output_data = GetTensorData<float>(output); |
163 | for (int i = 0; i < rows; ++i) { |
164 | for (int j = 0; j < cols; ++j) { |
165 | const float in_value = input_data[i * cols + j]; |
166 | output_data[j * rows + i] = in_value; |
167 | } |
168 | } |
169 | } |
170 | |
// Check if im2col needs to be allocated, as some versions of the optimized
// Conv don't use it. If any change adds im2col support to any of the Conv
// versions, this function should be updated as well.
174 | bool IsIm2ColRequired(const TfLiteTensor* input, TfLiteConvParams* params, |
175 | const TfLiteTensor* filter, OpData* data, bool is_hybrid, |
176 | KernelType kernel_type) { |
  // If HWCN weights are required, im2col is not required.
178 | if (data->need_hwcn_weights) return false; |
179 | |
  // Segregate based on dilated vs. non-dilated conv.
181 | const bool need_dilated_im2col = |
182 | params->dilation_width_factor != 1 || params->dilation_height_factor != 1; |
183 | const bool need_non_dilated_im2col = |
184 | params->stride_width != 1 || params->stride_height != 1 || |
185 | filter->dims->data[2] != 1 || filter->dims->data[1] != 1; |
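  // In other words, im2col can only be skipped for a 1x1 filter with stride 1
  // and no dilation, where the convolution reduces to a plain matrix multiply.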
186 | |
187 | const bool need_im2col = need_dilated_im2col || need_non_dilated_im2col; |
188 | |
  // Return early, as the basic requirement is not met.
190 | if (!need_im2col) return false; |
191 | |
192 | switch (kernel_type) { |
193 | case kReference: |
194 | if (is_hybrid) { |
195 | return true; |
196 | } else { |
197 | return false; |
198 | } |
199 | case kGenericOptimized: |
200 | case kCblasOptimized: |
      // These kernels always use im2col when it is needed (checked above).
202 | return true; |
203 | case kMultithreadOptimized: |
204 | if (input->type == kTfLiteUInt8 || // |
205 | input->type == kTfLiteInt8 || // |
206 | input->type == kTfLiteInt16 || // quantized. |
207 | !data->supports_multithreaded_kernel) { |
208 | return true; |
209 | } else { |
210 | return false; |
211 | } |
212 | default: |
213 | return false; |
214 | } |
215 | } |
216 | |
// Allocate temporary tensors (`im2col`, `hwcn_weights`) if necessary.
// Note: `context->AddTensors` might invalidate pointers to existing tensors.
// Therefore the logic to add tensors is isolated into this function.
220 | static TfLiteStatus AllocateTemporaryTensorsIfRequired( |
221 | TfLiteContext* context, TfLiteNode* node, bool is_hybrid, |
222 | bool is_per_channel, KernelType kernel_type, size_t im2col_bytes) { |
223 | auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); |
224 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
225 | |
226 | TF_LITE_ENSURE(context, node->inputs->size >= 2); |
227 | const TfLiteTensor* input; |
228 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
229 | const TfLiteTensor* filter; |
230 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter)); |
231 | |
232 | // If we're using the optimized multithreaded EigenTensor implementation of |
233 | // convolution, it expects the filter weights to be transposed compared to |
234 | // the normal TF Lite buffer format. Typical TF Lite weights are |
235 | // [filter_count, filter_height, filter_width, input_depth], but for the float |
236 | // implementation we need them as [filter_height, filter_width, input_depth, |
237 | // filter_count]. We get to that format by transposing, and create a temporary |
238 | // buffer to store the results. |
239 | // This path is only used for float processing, so only create the buffer if |
240 | // we're running with that data type. |
241 | data->need_hwcn_weights = |
242 | input->type == kTfLiteFloat32 && data->supports_multithreaded_kernel; |
243 | |
244 | // We don't always need to allocate im2col. It is only used in some versions |
245 | // of the optimized Conv. This test just mimics something that happens inside |
246 | // optimized_ops.h, in order to avoid a DCHECK(!im2col_data). |
247 | data->need_im2col = |
248 | IsIm2ColRequired(input, params, filter, data, is_hybrid, kernel_type); |
249 | |
  // If im2col_oversized is found to be true, we have to fall back to an
  // execution path (like kReference in the float/quantized cases) that doesn't
  // require the im2col operation. Therefore, we have to skip this check for
  // the hybrid case (but not the hybrid-per-channel one), where there is no
  // such fallback execution path.
255 | // TODO(b/178743262): Consider making this check conditioned on the available |
256 | // memory of the system, rather than coupling to the mobile platform check. |
257 | if (IsMobilePlatform() && !(is_hybrid && !is_per_channel) && |
258 | data->need_im2col && im2col_bytes >= kMaxIm2colBufferSizeMobile) { |
259 | data->need_im2col = false; |
260 | data->im2col_oversized = true; |
261 | } |
262 | int temporaries_count = 0; |
263 | if (data->need_im2col) { |
264 | data->im2col_index = temporaries_count; |
265 | if (data->im2col_id == kTensorNotAllocated) { |
266 | context->AddTensors(context, 1, &data->im2col_id); |
267 | } |
268 | ++temporaries_count; |
269 | } |
270 | if (data->need_hwcn_weights) { |
271 | data->hwcn_weights_index = temporaries_count; |
272 | if (data->hwcn_weights_id == kTensorNotAllocated) { |
273 | context->AddTensors(context, 1, &data->hwcn_weights_id); |
274 | } |
275 | ++temporaries_count; |
276 | } |
277 | |
278 | if (is_hybrid) { |
279 | // Allocate tensor to store the on-the-fly quantized inputs. |
280 | data->input_quantized_index = temporaries_count; |
281 | if (data->input_quantized_id == kTensorNotAllocated) { |
282 | TF_LITE_ENSURE_OK( |
283 | context, context->AddTensors(context, 1, &data->input_quantized_id)); |
284 | } |
285 | ++temporaries_count; |
286 | |
287 | // Allocate tensor to store the quantization params computed during |
288 | // on-the-fly input quantization. |
289 | data->scaling_factors_index = temporaries_count; |
290 | if (data->scaling_factors_id == kTensorNotAllocated) { |
291 | TF_LITE_ENSURE_OK( |
292 | context, context->AddTensors(context, 1, &data->scaling_factors_id)); |
293 | } |
294 | ++temporaries_count; |
295 | |
296 | // Allocate tensor to store the accumulators for the matrix multiply. |
297 | data->accum_scratch_index = temporaries_count; |
298 | if (data->accum_scratch_id == kTensorNotAllocated) { |
299 | TF_LITE_ENSURE_OK( |
300 | context, context->AddTensors(context, 1, &data->accum_scratch_id)); |
301 | } |
302 | ++temporaries_count; |
303 | if (is_per_channel) { |
304 | data->input_offset_index = temporaries_count; |
305 | if (data->input_offset_id == kTensorNotAllocated) { |
306 | TF_LITE_ENSURE_OK( |
307 | context, context->AddTensors(context, 1, &data->input_offset_id)); |
308 | } |
309 | ++temporaries_count; |
310 | |
311 | data->row_sums_index = temporaries_count; |
312 | if (data->row_sums_id == kTensorNotAllocated) { |
313 | TF_LITE_ENSURE_OK(context, |
314 | context->AddTensors(context, 1, &data->row_sums_id)); |
315 | } |
316 | ++temporaries_count; |
317 | } |
318 | } |
319 | |
320 | TfLiteIntArrayFree(node->temporaries); |
321 | node->temporaries = TfLiteIntArrayCreate(temporaries_count); |
322 | |
323 | return kTfLiteOk; |
324 | } |
325 | |
326 | TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, |
327 | TfLiteNode* node) { |
328 | auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); |
329 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
330 | |
331 | bool has_bias = node->inputs->size == 3; |
332 | // Check number of inputs/outputs |
333 | TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); |
334 | TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); |
335 | TfLiteTensor* output; |
336 | TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); |
337 | const TfLiteTensor* input; |
338 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
339 | const TfLiteTensor* filter; |
340 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter)); |
341 | |
342 | // Check dimensionality of input, filter |
343 | TF_LITE_ENSURE_EQ(context, input->dims->size, 4); |
344 | TF_LITE_ENSURE_EQ(context, filter->dims->size, 4); |
345 | // Check input channels matching filter |
346 | // Filter input channel can be a factor of channels of input (grouped conv) |
347 | // or equals (normal conv). |
348 | auto input_channel = input->dims->data[3]; |
349 | auto filter_input_channel = filter->dims->data[3]; |
350 | TF_LITE_ENSURE_EQ(context, input_channel % filter_input_channel, 0); |
351 | data->groups = input_channel / filter_input_channel; |
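  // For example, an input with 8 channels and a filter consuming 2 input
  // channels yields groups = 4; a normal (ungrouped) conv has groups = 1.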
352 | |
353 | // Check types. (We assume that UINT8 refers to quantized tensors) |
354 | TfLiteType input_type = input->type; |
355 | TF_LITE_ENSURE(context, |
356 | input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 || |
357 | input_type == kTfLiteInt8 || input_type == kTfLiteInt16); |
358 | TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type); |
359 | |
360 | if (input_type == kTfLiteInt16) { |
361 | TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); |
362 | TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); |
363 | } |
  // For per-channel quantization, all filter zero points must be zero.
365 | if (input_type == kTfLiteInt16 || input_type == kTfLiteInt8) { |
366 | TF_LITE_ENSURE_EQ(context, filter->quantization.type, |
367 | kTfLiteAffineQuantization); |
368 | const auto* affine_quantization = |
369 | reinterpret_cast<TfLiteAffineQuantization*>( |
370 | filter->quantization.params); |
371 | for (int i = 0; i < affine_quantization->zero_point->size; ++i) { |
372 | TF_LITE_ENSURE_EQ(context, affine_quantization->zero_point->data[i], 0); |
373 | } |
374 | } |
375 | |
376 | const TfLiteTensor* bias = nullptr; |
377 | |
378 | // TODO(ahentz): At this point the optimized versions require 'bias'. We can |
379 | // either change that or document that convolution requires it. |
380 | TF_LITE_ENSURE(context, has_bias); |
381 | |
382 | if (has_bias) { |
383 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &bias)); |
384 | if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) { |
385 | TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32); |
386 | TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); |
387 | } else if (input_type == kTfLiteInt16) { |
388 | TF_LITE_ENSURE(context, (bias->type == kTfLiteInt32) || |
389 | (bias->type == kTfLiteInt64)); |
390 | TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); |
391 | } else { |
392 | TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type); |
393 | } |
394 | TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); |
395 | } |
396 | |
397 | const bool is_hybrid = |
398 | (input->type == kTfLiteFloat32 && |
399 | (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8)); |
400 | |
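  // Detect hybrid per-channel quantization: the int8 filter carries more than
  // one affine-quantization scale, and not all scales are equal. That case is
  // routed to a dedicated per-channel hybrid path in Eval.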
401 | if (is_hybrid && filter->type == kTfLiteInt8 && |
402 | filter->quantization.type == kTfLiteAffineQuantization && |
403 | filter->quantization.params && |
404 | reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params) |
405 | ->scale && |
406 | reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params) |
407 | ->scale->size > 1) { |
408 | const auto* affine_quantization = |
409 | reinterpret_cast<TfLiteAffineQuantization*>( |
410 | filter->quantization.params); |
411 | const float scale = affine_quantization->scale->data[0]; |
412 | for (int i = 1; i < affine_quantization->scale->size; i++) { |
413 | if (affine_quantization->scale->data[i] != scale) { |
414 | data->is_hybrid_per_channel = true; |
415 | break; |
416 | } |
417 | } |
418 | } |
419 | |
420 | // The multi-threaded kernel supports neither dilation nor hybrid kernels, and |
421 | // is incompatible with mutable input filters that might change between evals. |
422 | data->supports_multithreaded_kernel = |
423 | (kernel_type == kMultithreadOptimized) && |
424 | (context->recommended_num_threads != 1) && !is_hybrid && |
425 | (params->dilation_width_factor == 1) && |
426 | (params->dilation_height_factor == 1) && |
427 | (filter->allocation_type != kTfLiteArenaRw) && !IsDynamicTensor(filter); |
428 | |
429 | int channels_in = filter->dims->data[3]; |
430 | int channels_out = filter->dims->data[0]; |
431 | int width = input->dims->data[2]; |
432 | int height = input->dims->data[1]; |
433 | int filter_width = filter->dims->data[2]; |
434 | int filter_height = filter->dims->data[1]; |
435 | int batches = input->dims->data[0]; |
436 | |
437 | // Matching GetWindowedOutputSize in TensorFlow. |
438 | auto padding = params->padding; |
439 | int out_width, out_height; |
440 | data->padding = ComputePaddingHeightWidth( |
441 | params->stride_height, params->stride_width, |
442 | params->dilation_height_factor, params->dilation_width_factor, height, |
443 | width, filter_height, filter_width, padding, &out_height, &out_width); |
444 | |
445 | size_t im2col_type_size; |
446 | TF_LITE_ENSURE_STATUS(GetSizeOfType(context, input->type, &im2col_type_size)); |
447 | // Note that we intentionally promote the first multiplicand (i.e. 'batches') |
448 | // to 'size_t' to avoid integer overflow here. |
449 | const size_t im2col_bytes = static_cast<size_t>(batches) * out_height * |
450 | out_width * channels_in * filter_height * |
451 | filter_width * im2col_type_size; |
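  // This corresponds to an im2col buffer of shape [batches, out_height,
  // out_width, filter_height * filter_width * channels_in], with
  // im2col_type_size bytes per element (see the im2col resize logic below).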
452 | TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired( |
453 | context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type, |
454 | im2col_bytes)); |
455 | |
456 | TF_LITE_ENSURE(context, has_bias); |
457 | |
458 | // Note that full fixed-point inference requires that all tensors have their |
459 | // parameters set. This is usually done during quantized training or |
460 | // calibration. |
461 | if (input_type != kTfLiteFloat32) { |
462 | TF_LITE_ENSURE_EQ(context, filter->quantization.type, |
463 | kTfLiteAffineQuantization); |
464 | const auto* affine_quantization = |
465 | reinterpret_cast<TfLiteAffineQuantization*>( |
466 | filter->quantization.params); |
467 | TF_LITE_ENSURE(context, affine_quantization); |
468 | TF_LITE_ENSURE(context, affine_quantization->scale); |
469 | TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 || |
470 | affine_quantization->scale->size == channels_out)); |
471 | |
472 | data->per_channel_output_multiplier.resize(channels_out); |
473 | data->per_channel_output_shift.resize(channels_out); |
474 | TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( |
475 | context, input, filter, bias, output, params->activation, |
476 | &data->output_multiplier, &data->output_shift, |
477 | &data->output_activation_min, &data->output_activation_max, |
478 | data->per_channel_output_multiplier.data(), |
479 | data->per_channel_output_shift.data(), channels_out)); |
480 | } |
481 | |
482 | TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); |
483 | output_size->data[0] = batches; |
484 | output_size->data[1] = out_height; |
485 | output_size->data[2] = out_width; |
486 | output_size->data[3] = channels_out; |
487 | auto output_status = context->ResizeTensor(context, output, output_size); |
488 | |
489 | if (output_status != kTfLiteOk) return output_status; |
490 | |
491 | if (data->need_im2col) { |
492 | node->temporaries->data[data->im2col_index] = data->im2col_id; |
493 | |
494 | TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4); |
495 | |
496 | auto filter_input_channel = filter->dims->data[3]; |
497 | im2col_size->data[0] = output_size->data[0]; |
498 | im2col_size->data[1] = output_size->data[1]; |
499 | im2col_size->data[2] = output_size->data[2]; |
500 | im2col_size->data[3] = filter_input_channel * filter_height * filter_width; |
501 | |
502 | TfLiteTensor* im2col = |
503 | &context->tensors[node->temporaries->data[data->im2col_index]]; |
504 | im2col->type = input->type; |
505 | if (is_hybrid) { |
506 | im2col->type = filter->type; |
507 | } |
508 | im2col->allocation_type = kTfLiteArenaRw; |
509 | auto im2col_status = context->ResizeTensor(context, im2col, im2col_size); |
510 | if (im2col_status != kTfLiteOk) return im2col_status; |
511 | } |
512 | |
513 | if (data->need_hwcn_weights) { |
514 | node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id; |
515 | TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2); |
516 | |
517 | // Because we're treating the filter weights as a matrix when we do the |
518 | // transpose, we allocate the buffer with a two-dimensional shape, where one |
519 | // dimension is the number of elements in each filter, and the second is the |
520 | // total number of filters. |
521 | auto filter_input_channel = filter->dims->data[3]; |
522 | hwcn_weights_size->data[0] = |
523 | (filter_height * filter_width * filter_input_channel); |
524 | hwcn_weights_size->data[1] = channels_out; |
525 | |
526 | TfLiteTensor* hwcn_weights = |
527 | &context->tensors[node->temporaries->data[data->hwcn_weights_index]]; |
528 | hwcn_weights->type = input_type; |
    hwcn_weights->name = "Conv_hwcn_weights";
530 | hwcn_weights->allocation_type = kTfLiteArenaRwPersistent; |
531 | |
532 | auto hwcn_weights_status = |
533 | context->ResizeTensor(context, hwcn_weights, hwcn_weights_size); |
534 | if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status; |
535 | |
536 | // TODO(petewarden): If Resize() is called when the size hasn't actually |
537 | // changed, this will do extra redundant work. |
538 | data->have_weights_been_transposed = false; |
539 | } |
540 | |
541 | if (is_hybrid) { |
542 | node->temporaries->data[data->input_quantized_index] = |
543 | data->input_quantized_id; |
544 | TfLiteTensor* input_quantized; |
545 | TF_LITE_ENSURE_OK( |
546 | context, GetTemporarySafe(context, node, data->input_quantized_index, |
547 | &input_quantized)); |
548 | input_quantized->type = kTfLiteInt8; |
549 | input_quantized->allocation_type = kTfLiteArenaRw; |
550 | if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { |
551 | TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); |
552 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, |
553 | input_quantized_size)); |
554 | } |
555 | |
556 | node->temporaries->data[data->scaling_factors_index] = |
557 | data->scaling_factors_id; |
558 | TfLiteTensor* scaling_factors; |
559 | TF_LITE_ENSURE_OK( |
560 | context, GetTemporarySafe(context, node, data->scaling_factors_index, |
561 | &scaling_factors)); |
562 | scaling_factors->type = kTfLiteFloat32; |
563 | scaling_factors->allocation_type = kTfLiteArenaRw; |
564 | // Only one scale factor per batch is typically necessary. See optimized |
565 | // implementation for why we need to allocate for the height of the inputs |
566 | // flattened to 2D. |
567 | TF_LITE_ENSURE(context, channels_in != 0); |
568 | const int height = NumElements(input) / channels_in; |
569 | int scaling_dims[1] = {height}; |
570 | if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) { |
571 | TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); |
572 | scaling_factors_size->data[0] = height; |
573 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, |
574 | scaling_factors_size)); |
575 | } |
576 | |
577 | node->temporaries->data[data->accum_scratch_index] = data->accum_scratch_id; |
578 | TfLiteTensor* accum_scratch; |
579 | TF_LITE_ENSURE_OK(context, |
580 | GetTemporarySafe(context, node, data->accum_scratch_index, |
581 | &accum_scratch)); |
582 | accum_scratch->type = kTfLiteInt32; |
583 | accum_scratch->allocation_type = kTfLiteArenaRw; |
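    // The scratch buffer holds one int32 accumulator per output element, laid
    // out as [channels_out, batches * out_height * out_width].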
584 | const int scratch_width = batches * out_height * out_width; |
585 | int accum_scratch_dims[2] = {channels_out, scratch_width}; |
586 | if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2, |
587 | accum_scratch_dims)) { |
588 | TfLiteIntArray* accum_scratch_size = TfLiteIntArrayCreate(2); |
589 | accum_scratch_size->data[0] = channels_out; |
590 | accum_scratch_size->data[1] = scratch_width; |
591 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, accum_scratch, |
592 | accum_scratch_size)); |
593 | } |
594 | |
595 | if (data->is_hybrid_per_channel) { |
596 | const auto* affine_quantization = |
597 | reinterpret_cast<TfLiteAffineQuantization*>( |
598 | filter->quantization.params); |
599 | TF_LITE_ENSURE_EQ( |
600 | context, affine_quantization->scale->size, |
601 | filter->dims->data[affine_quantization->quantized_dimension]); |
602 | node->temporaries->data[data->input_offset_index] = data->input_offset_id; |
603 | TfLiteTensor* input_offsets; |
604 | TF_LITE_ENSURE_OK( |
605 | context, GetTemporarySafe(context, node, data->input_offset_index, |
606 | &input_offsets)); |
607 | input_offsets->type = kTfLiteInt32; |
608 | input_offsets->allocation_type = kTfLiteArenaRw; |
609 | // See above comment for the need to allocate for height of inputs. |
610 | TF_LITE_ENSURE(context, channels_in != 0); |
611 | const int height = NumElements(input) / channels_in; |
612 | const int input_offset_dims[1] = {height}; |
613 | if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1, |
614 | input_offset_dims)) { |
615 | TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1); |
616 | input_offsets_size->data[0] = input_offset_dims[0]; |
617 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets, |
618 | input_offsets_size)); |
619 | } |
620 | node->temporaries->data[data->row_sums_index] = data->row_sums_id; |
621 | TfLiteTensor* row_sums; |
622 | TF_LITE_ENSURE_OK( |
623 | context, |
624 | GetTemporarySafe(context, node, data->row_sums_index, &row_sums)); |
625 | row_sums->type = kTfLiteInt32; |
      row_sums->name = "Conv_row_sums";
627 | row_sums->allocation_type = kTfLiteArenaRwPersistent; |
      // One row sum is cached per output channel (i.e. per filter).
629 | const int row_sums_dims[1] = {channels_out}; |
630 | if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) { |
631 | TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1); |
632 | row_sums_size->data[0] = row_sums_dims[0]; |
633 | TF_LITE_ENSURE_OK( |
634 | context, context->ResizeTensor(context, row_sums, row_sums_size)); |
635 | } |
636 | } |
637 | } |
638 | return kTfLiteOk; |
639 | } |
640 | |
641 | template <KernelType kernel_type> |
642 | TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { |
643 | return Prepare(kernel_type, context, node); |
644 | } |
645 | |
646 | template <KernelType kernel_type> |
647 | void EvalQuantized(TfLiteContext* context, TfLiteNode* node, |
648 | TfLiteConvParams* params, OpData* data, |
649 | const TfLiteTensor* input, const TfLiteTensor* filter, |
650 | const TfLiteTensor* bias, TfLiteTensor* im2col, |
651 | TfLiteTensor* output) { |
652 | auto input_offset = -input->params.zero_point; |
653 | auto filter_offset = -filter->params.zero_point; |
654 | auto output_offset = output->params.zero_point; |
655 | |
656 | KernelType effective_kernel_type; |
657 | if ((kernel_type == kMultithreadOptimized || |
658 | kernel_type == kCblasOptimized) && |
659 | (params->dilation_width_factor != 1 || |
660 | params->dilation_height_factor != 1)) { |
661 | // kMultithreadOptimized and kCblasOptimized do not support dilation. |
    // Therefore, fall back to the generic optimized kernel.
663 | effective_kernel_type = kGenericOptimized; |
664 | } else { |
665 | effective_kernel_type = kernel_type; |
666 | } |
667 | |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
670 | // See b/178743262 for the detailed motivation. |
671 | if (data->im2col_oversized) { |
672 | effective_kernel_type = kReference; |
673 | } |
674 | |
  // Grouped convolution is currently only supported by the reference kernel.
676 | if (data->groups != 1) { |
677 | effective_kernel_type = kReference; |
678 | } |
679 | |
680 | ConvParams op_params; |
681 | op_params.padding_type = PaddingType::kSame; |
682 | op_params.padding_values.width = data->padding.width; |
683 | op_params.padding_values.height = data->padding.height; |
684 | op_params.dilation_width_factor = params->dilation_width_factor; |
685 | op_params.dilation_height_factor = params->dilation_height_factor; |
686 | op_params.stride_width = params->stride_width; |
687 | op_params.stride_height = params->stride_height; |
688 | op_params.input_offset = input_offset; |
689 | op_params.weights_offset = filter_offset; |
690 | op_params.output_offset = output_offset; |
691 | op_params.output_multiplier = data->output_multiplier; |
692 | op_params.output_shift = -data->output_shift; |
693 | op_params.quantized_activation_min = data->output_activation_min; |
694 | op_params.quantized_activation_max = data->output_activation_max; |
695 | switch (effective_kernel_type) { |
696 | case kReference: { |
697 | reference_ops::Conv( |
698 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
699 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
700 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
701 | GetTensorShape(output), GetTensorData<uint8_t>(output), |
702 | GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), |
703 | /* cpu_backend_context = */ nullptr); |
704 | break; |
705 | } |
706 | case kGenericOptimized: |
707 | case kMultithreadOptimized: |
708 | case kCblasOptimized: { |
709 | // There is only one optimized implementation for Quantized Conv. |
710 | optimized_ops::Conv( |
711 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
712 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
713 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
714 | GetTensorShape(output), GetTensorData<uint8_t>(output), |
715 | GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), |
716 | CpuBackendContext::GetFromContext(context)); |
717 | break; |
718 | } |
719 | } |
720 | } |
721 | |
722 | template <KernelType kernel_type> |
723 | void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, |
724 | TfLiteConvParams* params, OpData* data, |
725 | const TfLiteTensor* input, |
726 | const TfLiteTensor* filter, |
727 | const TfLiteTensor* bias, TfLiteTensor* output, |
728 | TfLiteTensor* im2col) { |
729 | ConvParams op_params; |
730 | op_params.input_offset = -input->params.zero_point; |
731 | op_params.output_offset = output->params.zero_point; |
732 | op_params.stride_height = params->stride_height; |
733 | op_params.stride_width = params->stride_width; |
734 | op_params.dilation_height_factor = params->dilation_height_factor; |
735 | op_params.dilation_width_factor = params->dilation_width_factor; |
736 | op_params.padding_values.height = data->padding.height; |
737 | op_params.padding_values.width = data->padding.width; |
738 | op_params.quantized_activation_min = data->output_activation_min; |
739 | op_params.quantized_activation_max = data->output_activation_max; |
740 | |
741 | KernelType effective_kernel_type = kernel_type; |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
744 | // See b/178743262 for the detailed motivation. |
745 | if (data->im2col_oversized) { |
746 | effective_kernel_type = kReference; |
747 | } |
748 | |
  // Grouped convolution is currently only supported by the reference kernel.
750 | if (data->groups != 1) { |
751 | effective_kernel_type = kReference; |
752 | } |
753 | |
754 | switch (effective_kernel_type) { |
755 | case kReference: { |
756 | reference_integer_ops::ConvPerChannel( |
757 | op_params, data->per_channel_output_multiplier.data(), |
758 | data->per_channel_output_shift.data(), GetTensorShape(input), |
759 | GetTensorData<int8>(input), GetTensorShape(filter), |
760 | GetTensorData<int8>(filter), GetTensorShape(bias), |
761 | GetTensorData<int32>(bias), GetTensorShape(output), |
762 | GetTensorData<int8>(output)); |
763 | break; |
764 | } |
765 | case kGenericOptimized: |
766 | case kMultithreadOptimized: |
767 | case kCblasOptimized: { |
768 | optimized_integer_ops::ConvPerChannel( |
769 | op_params, data->per_channel_output_multiplier.data(), |
770 | data->per_channel_output_shift.data(), GetTensorShape(input), |
771 | GetTensorData<int8>(input), GetTensorShape(filter), |
772 | GetTensorData<int8>(filter), GetTensorShape(bias), |
773 | GetTensorData<int32>(bias), GetTensorShape(output), |
774 | GetTensorData<int8>(output), GetTensorShape(im2col), |
775 | GetTensorData<int8>(im2col), |
776 | CpuBackendContext::GetFromContext(context)); |
777 | break; |
778 | } |
779 | } |
780 | } |
781 | |
782 | template <KernelType kernel_type> |
783 | void EvalQuantizedPerChannel16x8(TfLiteContext* context, TfLiteNode* node, |
784 | TfLiteConvParams* params, OpData* data, |
785 | const TfLiteTensor* input, |
786 | const TfLiteTensor* filter, |
787 | const TfLiteTensor* bias, TfLiteTensor* output, |
788 | TfLiteTensor* im2col) { |
789 | ConvParams op_params; |
790 | op_params.input_offset = -input->params.zero_point; |
791 | op_params.output_offset = output->params.zero_point; |
792 | op_params.stride_height = params->stride_height; |
793 | op_params.stride_width = params->stride_width; |
794 | op_params.dilation_height_factor = params->dilation_height_factor; |
795 | op_params.dilation_width_factor = params->dilation_width_factor; |
796 | op_params.padding_values.height = data->padding.height; |
797 | op_params.padding_values.width = data->padding.width; |
798 | op_params.quantized_activation_min = data->output_activation_min; |
799 | op_params.quantized_activation_max = data->output_activation_max; |
800 | |
801 | KernelType effective_kernel_type = kernel_type; |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
804 | // See b/178743262 for the detailed motivation. |
805 | if (data->im2col_oversized) { |
806 | effective_kernel_type = kReference; |
807 | } |
808 | |
  // Grouped convolution is currently only supported by the reference kernel.
810 | if (data->groups != 1) { |
811 | effective_kernel_type = kReference; |
812 | } |
813 | |
  // To prevent 32-bit accumulator overflow in 16x8 quantization, the optimized
  // path is only enabled when every zero point is 0.
816 | bool has_non_zero_point = input->params.zero_point || |
817 | filter->params.zero_point || |
818 | output->params.zero_point; |
819 | |
  // Fall back to the reference kernel when the bias type is int64, as there is
  // no optimized kernel for int64 bias yet.
822 | if (bias && bias->type == kTfLiteInt64) { |
823 | reference_integer_ops::ConvPerChannel( |
824 | op_params, data->per_channel_output_multiplier.data(), |
825 | data->per_channel_output_shift.data(), GetTensorShape(input), |
826 | GetTensorData<int16>(input), GetTensorShape(filter), |
827 | GetTensorData<int8>(filter), GetTensorShape(bias), |
828 | GetTensorData<std::int64_t>(bias), GetTensorShape(output), |
829 | GetTensorData<int16>(output)); |
830 | } else if (effective_kernel_type == kReference || has_non_zero_point) { |
831 | reference_integer_ops::ConvPerChannel( |
832 | op_params, data->per_channel_output_multiplier.data(), |
833 | data->per_channel_output_shift.data(), GetTensorShape(input), |
834 | GetTensorData<int16>(input), GetTensorShape(filter), |
835 | GetTensorData<int8>(filter), GetTensorShape(bias), |
836 | GetTensorData<std::int32_t>(bias), GetTensorShape(output), |
837 | GetTensorData<int16>(output)); |
838 | } else { |
839 | optimized_integer_ops::ConvPerChannel( |
840 | op_params, data->per_channel_output_multiplier.data(), |
841 | data->per_channel_output_shift.data(), GetTensorShape(input), |
842 | GetTensorData<int16_t>(input), GetTensorShape(filter), |
843 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
844 | GetTensorData<std::int32_t>(bias), GetTensorShape(output), |
845 | GetTensorData<int16_t>(output), GetTensorShape(im2col), |
846 | GetTensorData<int16_t>(im2col), |
847 | CpuBackendContext::GetFromContext(context)); |
848 | } |
849 | } |
850 | |
851 | template <KernelType kernel_type> |
852 | void EvalFloat(TfLiteContext* context, TfLiteNode* node, |
853 | TfLiteConvParams* params, OpData* data, |
854 | const TfLiteTensor* input, const TfLiteTensor* filter, |
855 | const TfLiteTensor* bias, TfLiteTensor* im2col, |
856 | TfLiteTensor* hwcn_weights, TfLiteTensor* output) { |
857 | float output_activation_min, output_activation_max; |
858 | CalculateActivationRange(params->activation, &output_activation_min, |
859 | &output_activation_max); |
860 | KernelType effective_kernel_type = kernel_type; |
861 | // Fall back to the optimized path if multi-threaded conv is unsupported. |
862 | if ((kernel_type == kMultithreadOptimized) && |
863 | !data->supports_multithreaded_kernel) { |
864 | effective_kernel_type = kGenericOptimized; |
865 | } |
866 | |
  // When im2col is needed (which is implied when 'im2col_oversized' is true),
  // the GEMM-based optimized path requires im2col data to be allocated to
  // ensure correctness. Therefore, when im2col is disabled because the
  // temporary im2col tensor would be oversized, falling back to a
  // non-optimized path is needed.
872 | // See b/178743262 for the detailed motivation. |
873 | if (data->im2col_oversized) { |
874 | effective_kernel_type = kReference; |
875 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
    // As detailed in the tflite::multithreaded_ops::Conv implementation in
    // multithreaded_conv.h, the Eigen-based execution doesn't need im2col
    // data. Therefore, we can rely on it as a better-optimized fallback than
    // the reference one.
880 | if (data->supports_multithreaded_kernel) { |
881 | effective_kernel_type = kMultithreadOptimized; |
882 | } |
883 | #endif |
884 | } |
885 | |
  // Grouped convolution is currently only supported by the reference kernel.
887 | if (data->groups != 1) { |
888 | effective_kernel_type = kReference; |
889 | } |
890 | |
891 | ConvParams op_params; |
892 | op_params.padding_type = RuntimePaddingType(params->padding); |
893 | op_params.padding_values.width = data->padding.width; |
894 | op_params.padding_values.height = data->padding.height; |
895 | op_params.stride_width = params->stride_width; |
896 | op_params.stride_height = params->stride_height; |
897 | op_params.dilation_width_factor = params->dilation_width_factor; |
898 | op_params.dilation_height_factor = params->dilation_height_factor; |
899 | op_params.float_activation_min = output_activation_min; |
900 | op_params.float_activation_max = output_activation_max; |
901 | switch (effective_kernel_type) { |
902 | case kReference: { |
903 | reference_ops::Conv(op_params, GetTensorShape(input), |
904 | GetTensorData<float>(input), GetTensorShape(filter), |
905 | GetTensorData<float>(filter), GetTensorShape(bias), |
906 | GetTensorData<float>(bias), GetTensorShape(output), |
907 | GetTensorData<float>(output), GetTensorShape(im2col), |
908 | GetTensorData<float>(im2col)); |
909 | break; |
910 | } |
911 | case kCblasOptimized: |
912 | case kGenericOptimized: { |
913 | optimized_ops::Conv(op_params, GetTensorShape(input), |
914 | GetTensorData<float>(input), GetTensorShape(filter), |
915 | GetTensorData<float>(filter), GetTensorShape(bias), |
916 | GetTensorData<float>(bias), GetTensorShape(output), |
917 | GetTensorData<float>(output), GetTensorShape(im2col), |
918 | GetTensorData<float>(im2col), |
919 | CpuBackendContext::GetFromContext(context)); |
920 | break; |
921 | } |
922 | case kMultithreadOptimized: { |
923 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
924 | const float* filter_data; |
925 | if (data->need_hwcn_weights) { |
926 | filter_data = GetTensorData<float>(hwcn_weights); |
927 | } else { |
928 | filter_data = GetTensorData<float>(filter); |
929 | } |
930 | multithreaded_ops::Conv( |
931 | *eigen_support::GetThreadPoolDevice(context), op_params, |
932 | GetTensorShape(input), GetTensorData<float>(input), |
933 | GetTensorShape(filter), filter_data, GetTensorShape(bias), |
934 | GetTensorData<float>(bias), GetTensorShape(output), |
935 | GetTensorData<float>(output), GetTensorShape(im2col), |
936 | GetTensorData<float>(im2col)); |
937 | break; |
938 | #else // !defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
      // See Register_CONV_2D: we should never be here when TFLITE_WITH_RUY
      // is enabled. We #if out this code in order to get the corresponding
941 | // binary size benefits. |
942 | TFLITE_DCHECK(false); |
943 | #endif // defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
944 | } |
945 | } |
946 | } |
947 | |
948 | template <KernelType kernel_type> |
949 | TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node, |
950 | TfLiteConvParams* params, OpData* data, |
951 | const TfLiteTensor* input, |
952 | const TfLiteTensor* filter, |
953 | const TfLiteTensor* bias, |
954 | TfLiteTensor* im2col, TfLiteTensor* output) { |
955 | float output_activation_min, output_activation_max; |
956 | CalculateActivationRange(params->activation, &output_activation_min, |
957 | &output_activation_max); |
958 | |
959 | const int batch_size = SizeOfDimension(input, 0); |
960 | TF_LITE_ENSURE(context, batch_size != 0); |
961 | const int input_size = NumElements(input) / batch_size; |
962 | TfLiteTensor* quantized_input_tensor; |
963 | TF_LITE_ENSURE_OK(context, |
964 | GetTemporarySafe(context, node, data->input_quantized_index, |
965 | &quantized_input_tensor)); |
966 | int8_t* quantized_input_ptr_batch = |
967 | GetTensorData<int8_t>(quantized_input_tensor); |
968 | TfLiteTensor* scaling_factors_tensor; |
969 | TF_LITE_ENSURE_OK(context, |
970 | GetTemporarySafe(context, node, data->scaling_factors_index, |
971 | &scaling_factors_tensor)); |
972 | float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor); |
973 | TfLiteTensor* input_offset_tensor; |
974 | TF_LITE_ENSURE_OK(context, |
975 | GetTemporarySafe(context, node, data->input_offset_index, |
976 | &input_offset_tensor)); |
977 | int32_t* input_offset_ptr = GetTensorData<int32_t>(input_offset_tensor); |
978 | |
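  // Quantize the float input on the fly, one batch at a time, recording a
  // per-batch scaling factor and zero-point offset for the asymmetric scheme.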
979 | for (int b = 0; b < batch_size; ++b) { |
980 | const int offset = b * input_size; |
981 | tensor_utils::AsymmetricQuantizeFloats( |
982 | GetTensorData<float>(input) + offset, input_size, |
983 | quantized_input_ptr_batch + offset, &scaling_factors_ptr[b], |
984 | &input_offset_ptr[b]); |
985 | } |
986 | |
987 | int8_t* im2col_ptr = nullptr; |
988 | int8_t* filter_ptr = nullptr; |
989 | if (im2col != nullptr) { |
990 | im2col_ptr = im2col->data.int8; |
991 | } |
992 | filter_ptr = filter->data.int8; |
993 | const auto* affine_quantization = |
994 | reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params); |
995 | |
996 | KernelType effective_kernel_type = kernel_type; |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
999 | // See b/178743262 for the detailed motivation. |
1000 | if (data->im2col_oversized) { |
1001 | effective_kernel_type = kReference; |
1002 | } |
1003 | |
  // Grouped convolution is currently only supported by the reference kernel.
1005 | if (data->groups != 1) { |
1006 | effective_kernel_type = kReference; |
1007 | } |
1008 | |
1009 | ConvParams op_params; |
1010 | op_params.padding_type = PaddingType::kSame; |
1011 | op_params.padding_values.width = data->padding.width; |
1012 | op_params.padding_values.height = data->padding.height; |
1013 | op_params.dilation_width_factor = params->dilation_width_factor; |
1014 | op_params.dilation_height_factor = params->dilation_height_factor; |
1015 | op_params.stride_width = params->stride_width; |
1016 | op_params.stride_height = params->stride_height; |
1017 | op_params.float_activation_min = output_activation_min; |
1018 | op_params.float_activation_max = output_activation_max; |
1019 | switch (effective_kernel_type) { |
1020 | case kReference: |
1021 | reference_ops::HybridConvPerChannel( |
1022 | op_params, scaling_factors_ptr, GetTensorShape(input), |
1023 | quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr, |
1024 | GetTensorShape(bias), GetTensorData<float>(bias), |
1025 | GetTensorShape(output), GetTensorData<float>(output), |
1026 | GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data, |
1027 | input_offset_ptr); |
1028 | break; |
1029 | case kGenericOptimized: |
1030 | case kMultithreadOptimized: |
1031 | case kCblasOptimized: { |
1032 | TfLiteTensor* row_sums; |
1033 | TF_LITE_ENSURE_OK( |
1034 | context, |
1035 | GetTemporarySafe(context, node, data->row_sums_index, &row_sums)); |
1036 | TfLiteTensor* scratch; |
1037 | TF_LITE_ENSURE_OK( |
1038 | context, |
1039 | GetTemporarySafe(context, node, data->accum_scratch_index, &scratch)); |
1040 | optimized_ops::HybridConvPerChannel( |
1041 | op_params, scaling_factors_ptr, GetTensorShape(input), |
1042 | quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr, |
1043 | GetTensorShape(bias), GetTensorData<float>(bias), |
1044 | GetTensorShape(output), GetTensorData<float>(output), |
1045 | GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data, |
1046 | input_offset_ptr, GetTensorShape(scratch), |
1047 | GetTensorData<int32>(scratch), GetTensorData<int32_t>(row_sums), |
1048 | &data->compute_hybrid_row_sums, |
1049 | CpuBackendContext::GetFromContext(context)); |
1050 | data->compute_hybrid_row_sums = false; |
1051 | break; |
1052 | } |
1053 | } |
1054 | |
1055 | return kTfLiteOk; |
1056 | } |
1057 | |
1058 | template <KernelType kernel_type> |
1059 | TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node, |
1060 | TfLiteConvParams* params, OpData* data, |
1061 | const TfLiteTensor* input, const TfLiteTensor* filter, |
1062 | const TfLiteTensor* bias, TfLiteTensor* im2col, |
1063 | TfLiteTensor* accum_scratch, TfLiteTensor* output) { |
1064 | float output_activation_min, output_activation_max; |
1065 | CalculateActivationRange(params->activation, &output_activation_min, |
1066 | &output_activation_max); |
1067 | |
1068 | const int batch_size = SizeOfDimension(input, 0); |
1069 | TF_LITE_ENSURE(context, batch_size != 0); |
1070 | const int input_size = NumElements(input) / batch_size; |
1071 | |
1072 | const float* input_ptr = GetTensorData<float>(input); |
1073 | TfLiteTensor* quantized_input_tensor; |
1074 | TF_LITE_ENSURE_OK(context, |
1075 | GetTemporarySafe(context, node, data->input_quantized_index, |
1076 | &quantized_input_tensor)); |
1077 | int8_t* quantized_input_ptr_batch = |
1078 | GetTensorData<int8_t>(quantized_input_tensor); |
1079 | TfLiteTensor* scaling_factors_tensor; |
1080 | TF_LITE_ENSURE_OK(context, |
1081 | GetTemporarySafe(context, node, data->scaling_factors_index, |
1082 | &scaling_factors_tensor)); |
1083 | float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor); |
1084 | |
1085 | // Per-batch input quantization for higher accuracy. |
1086 | { |
    ruy::profiler::ScopeLabel label("ConvHybridQuantizeInputs");
1088 | for (int b = 0; b < batch_size; ++b) { |
1089 | float unused_min, unused_max; |
1090 | const int offset = b * input_size; |
1091 | tensor_utils::SymmetricQuantizeFloats( |
1092 | input_ptr + offset, input_size, quantized_input_ptr_batch + offset, |
1093 | &unused_min, &unused_max, &scaling_factors_ptr[b]); |
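      // Fold the single filter scale into the per-batch input scaling factor,
      // so the integer accumulators can be rescaled back to float in one step.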
1094 | scaling_factors_ptr[b] *= filter->params.scale; |
1095 | } |
1096 | } |
1097 | |
1098 | switch (kernel_type) { |
1099 | case kReference: |
1100 | case kGenericOptimized: |
1101 | case kMultithreadOptimized: |
1102 | case kCblasOptimized: { |
      // There is only one implementation for the hybrid kernel.
1104 | ConvParams op_params; |
1105 | op_params.padding_type = PaddingType::kSame; |
1106 | op_params.padding_values.width = data->padding.width; |
1107 | op_params.padding_values.height = data->padding.height; |
1108 | op_params.stride_width = params->stride_width; |
1109 | op_params.stride_height = params->stride_height; |
1110 | op_params.dilation_width_factor = params->dilation_width_factor; |
1111 | op_params.dilation_height_factor = params->dilation_height_factor; |
1112 | op_params.float_activation_min = output_activation_min; |
1113 | op_params.float_activation_max = output_activation_max; |
1114 | if (data->groups == 1) { |
1115 | optimized_ops::HybridConv( |
1116 | op_params, scaling_factors_ptr, GetTensorShape(input), |
1117 | quantized_input_ptr_batch, GetTensorShape(filter), |
1118 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
1119 | GetTensorData<float>(bias), GetTensorShape(accum_scratch), |
1120 | GetTensorData<int32_t>(accum_scratch), GetTensorShape(output), |
1121 | GetTensorData<float>(output), GetTensorShape(im2col), |
1122 | GetTensorData<int8_t>(im2col), |
1123 | CpuBackendContext::GetFromContext(context)); |
1124 | } else { |
        // This case is handled by falling back to the per-channel hybrid group
        // conv and shouldn't reach this branch.
1127 | TF_LITE_KERNEL_LOG( |
1128 | context, |
1129 | "Group convolution currently not supported for hybrid kernel." ); |
1130 | return kTfLiteError; |
1131 | } |
1132 | break; |
1133 | } |
1134 | } |
1135 | |
1136 | return kTfLiteOk; |
1137 | } |
1138 | |
1139 | template <KernelType kernel_type, TfLiteType input_type> |
1140 | TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) { |
1141 | auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); |
1142 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
1143 | |
1144 | TfLiteTensor* output; |
1145 | TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); |
1146 | const TfLiteTensor* input; |
1147 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
1148 | const TfLiteTensor* filter; |
1149 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter)); |
1150 | bool has_bias = node->inputs->size == 3; |
1151 | const TfLiteTensor* bias = has_bias ? GetInput(context, node, 2) : nullptr; |
1152 | TfLiteTensor* im2col = |
1153 | data->need_im2col |
1154 | ? &context->tensors[node->temporaries->data[data->im2col_index]] |
1155 | : nullptr; |
1156 | TfLiteTensor* hwcn_weights = |
1157 | data->need_hwcn_weights |
1158 | ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]] |
1159 | : nullptr; |
1160 | |
1161 | if (data->need_hwcn_weights && !data->have_weights_been_transposed) { |
1162 | TransposeFloatTensor(filter, hwcn_weights); |
1163 | data->have_weights_been_transposed = true; |
1164 | } |
1165 | |
1166 | TFLITE_DCHECK_EQ(input_type, input->type); |
  switch (input_type) {  // Already know in/out types are the same.
1168 | case kTfLiteFloat32: |
1169 | if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) { |
1170 | if (data->is_hybrid_per_channel || |
1171 | // TODO(b/162870360): Fallback to PerChannel implementation |
1172 | // before we have grouped hybrid convolution. |
1173 | data->groups != 1) { |
1174 | TF_LITE_ENSURE_OK(context, EvalHybridPerChannel<kernel_type>( |
1175 | context, node, params, data, input, |
1176 | filter, bias, im2col, output)); |
1177 | } else { |
1178 | TfLiteTensor* accum_scratch = |
1179 | &context->tensors[node->temporaries |
1180 | ->data[data->accum_scratch_index]]; |
1181 | TF_LITE_ENSURE_OK(context, |
1182 | EvalHybrid<kernel_type>(context, node, params, data, |
1183 | input, filter, bias, im2col, |
1184 | accum_scratch, output)); |
1185 | } |
1186 | } else { |
1187 | EvalFloat<kernel_type>(context, node, params, data, input, filter, bias, |
1188 | im2col, hwcn_weights, output); |
1189 | } |
1190 | break; |
1191 | case kTfLiteUInt8: |
1192 | EvalQuantized<kernel_type>(context, node, params, data, input, filter, |
1193 | bias, im2col, output); |
1194 | break; |
1195 | case kTfLiteInt8: |
1196 | EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input, |
1197 | filter, bias, output, im2col); |
1198 | break; |
1199 | case kTfLiteInt16: |
1200 | EvalQuantizedPerChannel16x8<kernel_type>( |
1201 | context, node, params, data, input, filter, bias, output, im2col); |
1202 | break; |
1203 | default: |
1204 | TF_LITE_KERNEL_LOG(context, "Type %s currently not supported." , |
1205 | TfLiteTypeGetName(input->type)); |
1206 | return kTfLiteError; |
1207 | } |
1208 | return kTfLiteOk; |
1209 | } |
1210 | |
1211 | template <KernelType kernel_type> |
1212 | TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { |
1213 | const TfLiteTensor* input; |
1214 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
1215 | |
1216 | switch (input->type) { |
1217 | case kTfLiteFloat32: |
1218 | return EvalImpl<kernel_type, kTfLiteFloat32>(context, node); |
1219 | case kTfLiteUInt8: |
1220 | return EvalImpl<kernel_type, kTfLiteUInt8>(context, node); |
1221 | case kTfLiteInt8: |
1222 | return EvalImpl<kernel_type, kTfLiteInt8>(context, node); |
1223 | case kTfLiteInt16: |
1224 | return EvalImpl<kernel_type, kTfLiteInt16>(context, node); |
1225 | default: |
1226 | TF_LITE_KERNEL_LOG(context, "Type %s not currently supported." , |
1227 | TfLiteTypeGetName(input->type)); |
1228 | return kTfLiteError; |
1229 | } |
1230 | } |
1231 | |
1232 | } // namespace conv |
1233 | |
1234 | TfLiteRegistration* Register_CONVOLUTION_REF() { |
1235 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1236 | conv::Prepare<conv::kReference>, |
1237 | conv::Eval<conv::kReference>}; |
1238 | return &r; |
1239 | } |
1240 | |
1241 | TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() { |
1242 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1243 | conv::Prepare<conv::kGenericOptimized>, |
1244 | conv::Eval<conv::kGenericOptimized>}; |
1245 | return &r; |
1246 | } |
1247 | |
1248 | TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT_UINT8() { |
1249 | static TfLiteRegistration r = { |
1250 | conv::Init, conv::Free, conv::Prepare<conv::kGenericOptimized>, |
1251 | conv::EvalImpl<conv::kGenericOptimized, kTfLiteUInt8>}; |
1252 | return &r; |
1253 | } |
1254 | |
1255 | TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() { |
1256 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1257 | conv::Prepare<conv::kMultithreadOptimized>, |
1258 | conv::Eval<conv::kMultithreadOptimized>}; |
1259 | return &r; |
1260 | } |
1261 | |
1262 | TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() { |
1263 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1264 | conv::Prepare<conv::kCblasOptimized>, |
1265 | conv::Eval<conv::kCblasOptimized>}; |
1266 | return &r; |
1267 | } |
1268 | |
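// Selects the Conv implementation at build time: the CBLAS kernel when the
// Apple Accelerate framework is requested, otherwise the multithreaded Eigen
// kernel when ruy is disabled, and the generic optimized kernel in all other
// cases.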
1269 | TfLiteRegistration* Register_CONV_2D() { |
1270 | #if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV |
1271 | return Register_CONVOLUTION_CBLAS_OPT(); |
1272 | #elif defined TFLITE_WITH_MULTITHREADED_EIGEN |
1273 | return Register_CONVOLUTION_MULTITHREADED_OPT(); |
1274 | #else |
1275 | return Register_CONVOLUTION_GENERIC_OPT(); |
1276 | #endif |
1277 | } |
1278 | |
1279 | // Warning: Clients using this variant are responsible for ensuring that their |
1280 | // models only need the UINT8 type. TFLite's op registration mechanism doesn't |
1281 | // yet allow for more nuanced registration mechanisms. |
1282 | TfLiteRegistration* Register_CONV_2D_UINT8() { |
1283 | #if defined TFLITE_WITH_RUY |
1284 | // TFLITE_WITH_RUY optimizes the generic kernel type. |
1285 | return Register_CONVOLUTION_GENERIC_OPT_UINT8(); |
1286 | #else |
1287 | return Register_CONV_2D(); |
1288 | #endif |
1289 | } |
1290 | |
1291 | } // namespace builtin |
1292 | } // namespace ops |
1293 | } // namespace tflite |
1294 | |