/* Copyright 2017 The TensorFlow Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
==============================================================================*/
#include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h"

#include <stddef.h>

#include <cstdint>
#include <vector>

// Only use multi-threaded Eigen if ruy is disabled.
#if !defined(TFLITE_WITH_RUY)
#define TFLITE_WITH_MULTITHREADED_EIGEN
#endif

#include "tensorflow/lite/c/builtin_op_data.h"
#include "tensorflow/lite/c/common.h"
#include "tensorflow/lite/kernels/cpu_backend_context.h"
#if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
#include "tensorflow/lite/kernels/eigen_support.h"
#endif
#include "tensorflow/lite/kernels/internal/compatibility.h"
#include "tensorflow/lite/kernels/internal/types.h"
// b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h
#if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
#include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h"
#endif
#include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h"
#include "tensorflow/lite/kernels/internal/quantization_util.h"
#include "tensorflow/lite/kernels/internal/reference/conv.h"
#include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h"
#include "tensorflow/lite/kernels/internal/tensor.h"
#include "tensorflow/lite/kernels/internal/tensor_ctypes.h"
#include "tensorflow/lite/kernels/internal/tensor_utils.h"
#include "tensorflow/lite/kernels/kernel_util.h"
#include "tensorflow/lite/kernels/padding.h"
#include "tensorflow/lite/util.h"

namespace tflite {
namespace ops {
namespace builtin {
namespace conv {

// This file has four implementations of Conv.
enum KernelType {
  kReference,
  kGenericOptimized,  // Neon-free
  // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads
  // are available and kGenericOptimized when we must use only one thread.
  kMultithreadOptimized,
  // The kernel uses the CBLAS interface for matrix multiplication.
  // It's fast when an optimized CBLAS implementation is available (e.g. Apple
  // Accelerate Framework), and slow when falling back to the naive
  // implementation.
  kCblasOptimized,
};

const int kTensorNotAllocated = -1;

static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024;  // 1GB

struct OpData {
  // IDs are the arbitrary identifiers used by TF Lite to identify and access
  // memory buffers.
  int im2col_id = kTensorNotAllocated;
  int hwcn_weights_id = kTensorNotAllocated;
  int input_quantized_id = kTensorNotAllocated;
  int scaling_factors_id = kTensorNotAllocated;
  int input_offset_id = kTensorNotAllocated;
  int accum_scratch_id = kTensorNotAllocated;
  // Row sums are used to cache filter sums for hybrid zero-point calculations.
  int row_sums_id = kTensorNotAllocated;

  TfLitePaddingValues padding;
  // The scaling factor from input to output (aka the 'real multiplier') can
  // be represented as a fixed point multiplier plus a left shift.
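  // Concretely, with the usual QuantizeMultiplier convention, the real
  // multiplier is roughly output_multiplier * 2^(output_shift - 31), where
  // output_multiplier is a Q31 fixed-point value.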
  int32_t output_multiplier;
  int output_shift;

  // Per channel output multiplier and shift.
  std::vector<int32_t> per_channel_output_multiplier;
  std::vector<int> per_channel_output_shift;

  // The range of the fused activation layer. For example for kNone and
  // uint8_t these would be 0 and 255.
  int32_t output_activation_min;
  int32_t output_activation_max;
  // Indexes are the offsets into the array used to keep track of the
  // allocated temporaries.
  int32_t im2col_index;
  int32_t hwcn_weights_index;
  int32_t input_quantized_index;
  int32_t scaling_factors_index;
  int32_t accum_scratch_index;
  int32_t input_offset_index;
  int32_t row_sums_index;

  bool need_hwcn_weights = false;
  bool have_weights_been_transposed = false;
  bool need_im2col = false;
  // If true, im2col is needed but gets disabled because the temporary im2col
  // tensor requires too much memory (i.e. >= kMaxIm2colBufferSizeMobile).
  bool im2col_oversized = false;

  bool supports_multithreaded_kernel = false;
  bool is_hybrid_per_channel = false;
  bool compute_hybrid_row_sums = true;

  // Number of convolution groups.
  int32_t groups = 1;
};

inline PaddingType RuntimePaddingType(TfLitePadding padding) {
  switch (padding) {
    case TfLitePadding::kTfLitePaddingSame:
      return PaddingType::kSame;
    case TfLitePadding::kTfLitePaddingValid:
      return PaddingType::kValid;
    case TfLitePadding::kTfLitePaddingUnknown:
    default:
      return PaddingType::kNone;
  }
}

void* Init(TfLiteContext* context, const char* buffer, size_t length) {
  // This is a builtin op, so we don't use the contents in 'buffer', if any.
  // Instead, we allocate a new object to use as scratch space for im2col, and
  // to carry information from Prepare() to Eval().
  auto* data = new OpData;
#if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
  eigen_support::IncrementUsageCounter(context);
#endif
  return data;
}

void Free(TfLiteContext* context, void* buffer) {
#if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
  eigen_support::DecrementUsageCounter(context);
#endif
  delete reinterpret_cast<OpData*>(buffer);
}

// Naive implementation of transpose for floats. Could be optimized to be more
// cache friendly, but for now it's a one-time cost on first run, and we would
// prefer to remove the need to do this at all eventually.
void TransposeFloatTensor(const TfLiteTensor* input, TfLiteTensor* output) {
  const int rows = output->dims->data[1];
  const int cols = output->dims->data[0];
  const float* input_data = GetTensorData<float>(input);
  float* output_data = GetTensorData<float>(output);
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      const float in_value = input_data[i * cols + j];
      output_data[j * rows + i] = in_value;
    }
  }
}

// Check whether im2col needs to be allocated, as some versions of the
// optimized Conv don't use it. If im2col support changes in any of the Conv
// versions, this function should be updated as well.
bool IsIm2ColRequired(const TfLiteTensor* input, TfLiteConvParams* params,
                      const TfLiteTensor* filter, OpData* data, bool is_hybrid,
                      KernelType kernel_type) {
  // If HWCN weights are required, im2col is not required.
  if (data->need_hwcn_weights) return false;

  // Segregate based on dilated conv & non-dilated conv.
  const bool need_dilated_im2col =
      params->dilation_width_factor != 1 || params->dilation_height_factor != 1;
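  // With stride 1, a 1x1 filter, and no dilation, the convolution is a plain
  // matrix multiply over the NHWC input, so no im2col patch extraction is
  // needed.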
  const bool need_non_dilated_im2col =
      params->stride_width != 1 || params->stride_height != 1 ||
      filter->dims->data[2] != 1 || filter->dims->data[1] != 1;

  const bool need_im2col = need_dilated_im2col || need_non_dilated_im2col;

  // Return early as the basic requirement is not met.
  if (!need_im2col) return false;

  switch (kernel_type) {
    case kReference:
      if (is_hybrid) {
        return true;
      } else {
        return false;
      }
    case kGenericOptimized:
    case kCblasOptimized:
      // `need_im2col` is always satisfied.
      return true;
    case kMultithreadOptimized:
      if (input->type == kTfLiteUInt8 ||  //
          input->type == kTfLiteInt8 ||   //
          input->type == kTfLiteInt16 ||  // quantized.
          !data->supports_multithreaded_kernel) {
        return true;
      } else {
        return false;
      }
    default:
      return false;
  }
}

// Allocate temporary tensors (`im2col`, `hwcn_weights` if necessary).
// Note: `context->AddTensors` might invalidate pointers to existing tensors.
// Therefore the logic to add tensors is isolated into this function.
static TfLiteStatus AllocateTemporaryTensorsIfRequired(
    TfLiteContext* context, TfLiteNode* node, bool is_hybrid,
    bool is_per_channel, KernelType kernel_type, size_t im2col_bytes) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TF_LITE_ENSURE(context, node->inputs->size >= 2);
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
  const TfLiteTensor* filter;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));

  // If we're using the optimized multithreaded EigenTensor implementation of
  // convolution, it expects the filter weights to be transposed compared to
  // the normal TF Lite buffer format. Typical TF Lite weights are
  // [filter_count, filter_height, filter_width, input_depth], but for the
  // float implementation we need them as [filter_height, filter_width,
  // input_depth, filter_count]. We get to that format by transposing, and
  // create a temporary buffer to store the results.
  // This path is only used for float processing, so only create the buffer if
  // we're running with that data type.
  data->need_hwcn_weights =
      input->type == kTfLiteFloat32 && data->supports_multithreaded_kernel;
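  // In other words, the OHWI filter layout is converted to HWIO so the Eigen
  // kernel can treat the weights as a
  // (filter_height * filter_width * input_depth) x filter_count matrix.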

  // We don't always need to allocate im2col. It is only used in some versions
  // of the optimized Conv. This test just mimics something that happens inside
  // optimized_ops.h, in order to avoid a DCHECK(!im2col_data).
  data->need_im2col =
      IsIm2ColRequired(input, params, filter, data, is_hybrid, kernel_type);

  // If im2col_oversized is found to be true, we have to fall back to an
  // execution path (like kReference in float/quantized cases) that doesn't
  // require the im2col operation. Therefore, we have to skip checking the
  // hybrid case (but not the hybrid-per-channel one) where there's no such
  // fallback execution path.
  // TODO(b/178743262): Consider making this check conditioned on the available
  // memory of the system, rather than coupling to the mobile platform check.
  if (IsMobilePlatform() && !(is_hybrid && !is_per_channel) &&
      data->need_im2col && im2col_bytes >= kMaxIm2colBufferSizeMobile) {
    data->need_im2col = false;
    data->im2col_oversized = true;
  }
  int temporaries_count = 0;
  if (data->need_im2col) {
    data->im2col_index = temporaries_count;
    if (data->im2col_id == kTensorNotAllocated) {
      context->AddTensors(context, 1, &data->im2col_id);
    }
    ++temporaries_count;
  }
  if (data->need_hwcn_weights) {
    data->hwcn_weights_index = temporaries_count;
    if (data->hwcn_weights_id == kTensorNotAllocated) {
      context->AddTensors(context, 1, &data->hwcn_weights_id);
    }
    ++temporaries_count;
  }

  if (is_hybrid) {
    // Allocate tensor to store the on-the-fly quantized inputs.
    data->input_quantized_index = temporaries_count;
    if (data->input_quantized_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->input_quantized_id));
    }
    ++temporaries_count;

    // Allocate tensor to store the quantization params computed during
    // on-the-fly input quantization.
    data->scaling_factors_index = temporaries_count;
    if (data->scaling_factors_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->scaling_factors_id));
    }
    ++temporaries_count;

    // Allocate tensor to store the accumulators for the matrix multiply.
    data->accum_scratch_index = temporaries_count;
    if (data->accum_scratch_id == kTensorNotAllocated) {
      TF_LITE_ENSURE_OK(
          context, context->AddTensors(context, 1, &data->accum_scratch_id));
    }
    ++temporaries_count;
    if (is_per_channel) {
      data->input_offset_index = temporaries_count;
      if (data->input_offset_id == kTensorNotAllocated) {
        TF_LITE_ENSURE_OK(
            context, context->AddTensors(context, 1, &data->input_offset_id));
      }
      ++temporaries_count;

      data->row_sums_index = temporaries_count;
      if (data->row_sums_id == kTensorNotAllocated) {
        TF_LITE_ENSURE_OK(context,
                          context->AddTensors(context, 1, &data->row_sums_id));
      }
      ++temporaries_count;
    }
  }

  TfLiteIntArrayFree(node->temporaries);
  node->temporaries = TfLiteIntArrayCreate(temporaries_count);

  return kTfLiteOk;
}

TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context,
                     TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  bool has_bias = node->inputs->size == 3;
  // Check number of inputs/outputs.
  TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2);
  TF_LITE_ENSURE_EQ(context, node->outputs->size, 1);
  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
  const TfLiteTensor* filter;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));

  // Check dimensionality of input, filter.
  TF_LITE_ENSURE_EQ(context, input->dims->size, 4);
  TF_LITE_ENSURE_EQ(context, filter->dims->size, 4);
  // Check that the input channels match the filter. The filter's input channel
  // count can be a factor of the input's channel count (grouped conv) or equal
  // to it (normal conv).
  auto input_channel = input->dims->data[3];
  auto filter_input_channel = filter->dims->data[3];
  TF_LITE_ENSURE_EQ(context, input_channel % filter_input_channel, 0);
  data->groups = input_channel / filter_input_channel;

  // Check types. (We assume that UINT8 refers to quantized tensors.)
  TfLiteType input_type = input->type;
  TF_LITE_ENSURE(context,
                 input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 ||
                     input_type == kTfLiteInt8 || input_type == kTfLiteInt16);
  TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type);

  if (input_type == kTfLiteInt16) {
    TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0);
    TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0);
  }
  // The filter must have zero zero-points in per-channel quantization.
  if (input_type == kTfLiteInt16 || input_type == kTfLiteInt8) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);
    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    for (int i = 0; i < affine_quantization->zero_point->size; ++i) {
      TF_LITE_ENSURE_EQ(context, affine_quantization->zero_point->data[i], 0);
    }
  }

  const TfLiteTensor* bias = nullptr;

  // TODO(ahentz): At this point the optimized versions require 'bias'. We can
  // either change that or document that convolution requires it.
  TF_LITE_ENSURE(context, has_bias);

  if (has_bias) {
    TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &bias));
    if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) {
      TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32);
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
    } else if (input_type == kTfLiteInt16) {
      TF_LITE_ENSURE(context, (bias->type == kTfLiteInt32) ||
                                  (bias->type == kTfLiteInt64));
      TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0);
    } else {
      TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type);
    }
    TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0));
  }

  const bool is_hybrid =
      (input->type == kTfLiteFloat32 &&
       (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8));

  if (is_hybrid && filter->type == kTfLiteInt8 &&
      filter->quantization.type == kTfLiteAffineQuantization &&
      filter->quantization.params &&
      reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
          ->scale &&
      reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params)
              ->scale->size > 1) {
    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    const float scale = affine_quantization->scale->data[0];
    for (int i = 1; i < affine_quantization->scale->size; i++) {
      if (affine_quantization->scale->data[i] != scale) {
        data->is_hybrid_per_channel = true;
        break;
      }
    }
  }

  // The multi-threaded kernel supports neither dilation nor hybrid kernels,
  // and is incompatible with mutable input filters that might change between
  // evals.
  data->supports_multithreaded_kernel =
      (kernel_type == kMultithreadOptimized) &&
      (context->recommended_num_threads != 1) && !is_hybrid &&
      (params->dilation_width_factor == 1) &&
      (params->dilation_height_factor == 1) &&
      (filter->allocation_type != kTfLiteArenaRw) && !IsDynamicTensor(filter);

  int channels_in = filter->dims->data[3];
  int channels_out = filter->dims->data[0];
  int width = input->dims->data[2];
  int height = input->dims->data[1];
  int filter_width = filter->dims->data[2];
  int filter_height = filter->dims->data[1];
  int batches = input->dims->data[0];

  // Matching GetWindowedOutputSize in TensorFlow.
  auto padding = params->padding;
  int out_width, out_height;
  data->padding = ComputePaddingHeightWidth(
      params->stride_height, params->stride_width,
      params->dilation_height_factor, params->dilation_width_factor, height,
      width, filter_height, filter_width, padding, &out_height, &out_width);

  size_t im2col_type_size;
  TF_LITE_ENSURE_STATUS(GetSizeOfType(context, input->type, &im2col_type_size));
  // Note that we intentionally promote the first multiplicand (i.e. 'batches')
  // to 'size_t' to avoid integer overflow here.
  const size_t im2col_bytes = static_cast<size_t>(batches) * out_height *
                              out_width * channels_in * filter_height *
                              filter_width * im2col_type_size;
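  // For instance, a hypothetical 1x224x224x3 float input with a 3x3 filter,
  // stride 1 and SAME padding needs 1 * 224 * 224 * 3 * 3 * 3 * 4 bytes,
  // roughly 5.4 MB of im2col scratch.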
  TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired(
      context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type,
      im2col_bytes));

  TF_LITE_ENSURE(context, has_bias);

  // Note that full fixed-point inference requires that all tensors have their
  // parameters set. This is usually done during quantized training or
  // calibration.
  if (input_type != kTfLiteFloat32) {
    TF_LITE_ENSURE_EQ(context, filter->quantization.type,
                      kTfLiteAffineQuantization);
    const auto* affine_quantization =
        reinterpret_cast<TfLiteAffineQuantization*>(
            filter->quantization.params);
    TF_LITE_ENSURE(context, affine_quantization);
    TF_LITE_ENSURE(context, affine_quantization->scale);
    TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 ||
                             affine_quantization->scale->size == channels_out));

    data->per_channel_output_multiplier.resize(channels_out);
    data->per_channel_output_shift.resize(channels_out);
    TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams(
        context, input, filter, bias, output, params->activation,
        &data->output_multiplier, &data->output_shift,
        &data->output_activation_min, &data->output_activation_max,
        data->per_channel_output_multiplier.data(),
        data->per_channel_output_shift.data(), channels_out));
  }

  TfLiteIntArray* output_size = TfLiteIntArrayCreate(4);
  output_size->data[0] = batches;
  output_size->data[1] = out_height;
  output_size->data[2] = out_width;
  output_size->data[3] = channels_out;
  auto output_status = context->ResizeTensor(context, output, output_size);

  if (output_status != kTfLiteOk) return output_status;

  if (data->need_im2col) {
    node->temporaries->data[data->im2col_index] = data->im2col_id;

    TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4);

    auto filter_input_channel = filter->dims->data[3];
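    // The innermost im2col dimension is the flattened receptive field: each
    // output position gets a patch of filter_height * filter_width *
    // filter_input_channel values copied out of the input.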
    im2col_size->data[0] = output_size->data[0];
    im2col_size->data[1] = output_size->data[1];
    im2col_size->data[2] = output_size->data[2];
    im2col_size->data[3] = filter_input_channel * filter_height * filter_width;

    TfLiteTensor* im2col =
        &context->tensors[node->temporaries->data[data->im2col_index]];
    im2col->type = input->type;
    if (is_hybrid) {
      im2col->type = filter->type;
    }
    im2col->allocation_type = kTfLiteArenaRw;
    auto im2col_status = context->ResizeTensor(context, im2col, im2col_size);
    if (im2col_status != kTfLiteOk) return im2col_status;
  }

  if (data->need_hwcn_weights) {
    node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id;
    TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2);

    // Because we're treating the filter weights as a matrix when we do the
    // transpose, we allocate the buffer with a two-dimensional shape, where one
    // dimension is the number of elements in each filter, and the second is
    // the total number of filters.
    auto filter_input_channel = filter->dims->data[3];
    hwcn_weights_size->data[0] =
        (filter_height * filter_width * filter_input_channel);
    hwcn_weights_size->data[1] = channels_out;
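    // For example, a hypothetical [64, 3, 3, 32] filter would be stored here
    // as a (3 * 3 * 32) x 64 = 288 x 64 matrix, one flattened filter per
    // column.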

    TfLiteTensor* hwcn_weights =
        &context->tensors[node->temporaries->data[data->hwcn_weights_index]];
    hwcn_weights->type = input_type;
    hwcn_weights->name = "Conv_hwcn_weights";
    hwcn_weights->allocation_type = kTfLiteArenaRwPersistent;

    auto hwcn_weights_status =
        context->ResizeTensor(context, hwcn_weights, hwcn_weights_size);
    if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status;

    // TODO(petewarden): If Resize() is called when the size hasn't actually
    // changed, this will do extra redundant work.
    data->have_weights_been_transposed = false;
  }

  if (is_hybrid) {
    node->temporaries->data[data->input_quantized_index] =
        data->input_quantized_id;
    TfLiteTensor* input_quantized;
    TF_LITE_ENSURE_OK(
        context, GetTemporarySafe(context, node, data->input_quantized_index,
                                  &input_quantized));
    input_quantized->type = kTfLiteInt8;
    input_quantized->allocation_type = kTfLiteArenaRw;
    if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) {
      TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims);
      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized,
                                                       input_quantized_size));
    }

    node->temporaries->data[data->scaling_factors_index] =
        data->scaling_factors_id;
    TfLiteTensor* scaling_factors;
    TF_LITE_ENSURE_OK(
        context, GetTemporarySafe(context, node, data->scaling_factors_index,
                                  &scaling_factors));
    scaling_factors->type = kTfLiteFloat32;
    scaling_factors->allocation_type = kTfLiteArenaRw;
    // Only one scale factor per batch is typically necessary. See optimized
    // implementation for why we need to allocate for the height of the inputs
    // flattened to 2D.
    TF_LITE_ENSURE(context, channels_in != 0);
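    // 'height' below is the number of rows when the input is viewed as a 2-D
    // matrix of shape (batches * input_height * input_width) x channels_in.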
    const int height = NumElements(input) / channels_in;
    int scaling_dims[1] = {height};
    if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) {
      TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1);
      scaling_factors_size->data[0] = height;
      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors,
                                                       scaling_factors_size));
    }

    node->temporaries->data[data->accum_scratch_index] = data->accum_scratch_id;
    TfLiteTensor* accum_scratch;
    TF_LITE_ENSURE_OK(context,
                      GetTemporarySafe(context, node, data->accum_scratch_index,
                                       &accum_scratch));
    accum_scratch->type = kTfLiteInt32;
    accum_scratch->allocation_type = kTfLiteArenaRw;
    const int scratch_width = batches * out_height * out_width;
    int accum_scratch_dims[2] = {channels_out, scratch_width};
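    // The scratch buffer holds the int32 GEMM accumulators: one accumulator
    // per (output channel, output spatial position) pair.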
    if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2,
                                   accum_scratch_dims)) {
      TfLiteIntArray* accum_scratch_size = TfLiteIntArrayCreate(2);
      accum_scratch_size->data[0] = channels_out;
      accum_scratch_size->data[1] = scratch_width;
      TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, accum_scratch,
                                                       accum_scratch_size));
    }

    if (data->is_hybrid_per_channel) {
      const auto* affine_quantization =
          reinterpret_cast<TfLiteAffineQuantization*>(
              filter->quantization.params);
      TF_LITE_ENSURE_EQ(
          context, affine_quantization->scale->size,
          filter->dims->data[affine_quantization->quantized_dimension]);
      node->temporaries->data[data->input_offset_index] = data->input_offset_id;
      TfLiteTensor* input_offsets;
      TF_LITE_ENSURE_OK(
          context, GetTemporarySafe(context, node, data->input_offset_index,
                                    &input_offsets));
      input_offsets->type = kTfLiteInt32;
      input_offsets->allocation_type = kTfLiteArenaRw;
      // See the above comment for the need to allocate for the height of the
      // inputs.
      TF_LITE_ENSURE(context, channels_in != 0);
      const int height = NumElements(input) / channels_in;
      const int input_offset_dims[1] = {height};
      if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1,
                                     input_offset_dims)) {
        TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1);
        input_offsets_size->data[0] = input_offset_dims[0];
        TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets,
                                                         input_offsets_size));
      }
      node->temporaries->data[data->row_sums_index] = data->row_sums_id;
      TfLiteTensor* row_sums;
      TF_LITE_ENSURE_OK(
          context,
          GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
      row_sums->type = kTfLiteInt32;
      row_sums->name = "Conv_row_sums";
      row_sums->allocation_type = kTfLiteArenaRwPersistent;
      // One cached filter row sum per output channel.
      const int row_sums_dims[1] = {channels_out};
      if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) {
        TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1);
        row_sums_size->data[0] = row_sums_dims[0];
        TF_LITE_ENSURE_OK(
            context, context->ResizeTensor(context, row_sums, row_sums_size));
      }
    }
  }
  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) {
  return Prepare(kernel_type, context, node);
}

template <KernelType kernel_type>
void EvalQuantized(TfLiteContext* context, TfLiteNode* node,
                   TfLiteConvParams* params, OpData* data,
                   const TfLiteTensor* input, const TfLiteTensor* filter,
                   const TfLiteTensor* bias, TfLiteTensor* im2col,
                   TfLiteTensor* output) {
  auto input_offset = -input->params.zero_point;
  auto filter_offset = -filter->params.zero_point;
  auto output_offset = output->params.zero_point;

  KernelType effective_kernel_type;
  if ((kernel_type == kMultithreadOptimized ||
       kernel_type == kCblasOptimized) &&
      (params->dilation_width_factor != 1 ||
       params->dilation_height_factor != 1)) {
    // kMultithreadOptimized and kCblasOptimized do not support dilation.
    // Therefore, fall back to the generic optimized kernel.
    effective_kernel_type = kGenericOptimized;
  } else {
    effective_kernel_type = kernel_type;
  }

  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large. See b/178743262 for the detailed motivation.
  if (data->im2col_oversized) {
    effective_kernel_type = kReference;
  }

  // Grouped convolution is currently only supported by the reference kernel.
  if (data->groups != 1) {
    effective_kernel_type = kReference;
  }

  ConvParams op_params;
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.input_offset = input_offset;
  op_params.weights_offset = filter_offset;
  op_params.output_offset = output_offset;
  op_params.output_multiplier = data->output_multiplier;
  op_params.output_shift = -data->output_shift;
  op_params.quantized_activation_min = data->output_activation_min;
  op_params.quantized_activation_max = data->output_activation_max;
  switch (effective_kernel_type) {
    case kReference: {
      reference_ops::Conv(
          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
          GetTensorShape(bias), GetTensorData<int32_t>(bias),
          GetTensorShape(output), GetTensorData<uint8_t>(output),
          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
          /* cpu_backend_context = */ nullptr);
      break;
    }
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      // There is only one optimized implementation for quantized Conv.
      optimized_ops::Conv(
          op_params, GetTensorShape(input), GetTensorData<uint8_t>(input),
          GetTensorShape(filter), GetTensorData<uint8_t>(filter),
          GetTensorShape(bias), GetTensorData<int32_t>(bias),
          GetTensorShape(output), GetTensorData<uint8_t>(output),
          GetTensorShape(im2col), GetTensorData<uint8_t>(im2col),
          CpuBackendContext::GetFromContext(context));
      break;
    }
  }
}

template <KernelType kernel_type>
void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node,
                             TfLiteConvParams* params, OpData* data,
                             const TfLiteTensor* input,
                             const TfLiteTensor* filter,
                             const TfLiteTensor* bias, TfLiteTensor* output,
                             TfLiteTensor* im2col) {
  ConvParams op_params;
  op_params.input_offset = -input->params.zero_point;
  op_params.output_offset = output->params.zero_point;
  op_params.stride_height = params->stride_height;
  op_params.stride_width = params->stride_width;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.padding_values.height = data->padding.height;
  op_params.padding_values.width = data->padding.width;
  op_params.quantized_activation_min = data->output_activation_min;
  op_params.quantized_activation_max = data->output_activation_max;

  KernelType effective_kernel_type = kernel_type;
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large. See b/178743262 for the detailed motivation.
  if (data->im2col_oversized) {
    effective_kernel_type = kReference;
  }

  // Grouped convolution is currently only supported by the reference kernel.
  if (data->groups != 1) {
    effective_kernel_type = kReference;
  }

  switch (effective_kernel_type) {
    case kReference: {
      reference_integer_ops::ConvPerChannel(
          op_params, data->per_channel_output_multiplier.data(),
          data->per_channel_output_shift.data(), GetTensorShape(input),
          GetTensorData<int8>(input), GetTensorShape(filter),
          GetTensorData<int8>(filter), GetTensorShape(bias),
          GetTensorData<int32>(bias), GetTensorShape(output),
          GetTensorData<int8>(output));
      break;
    }
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      optimized_integer_ops::ConvPerChannel(
          op_params, data->per_channel_output_multiplier.data(),
          data->per_channel_output_shift.data(), GetTensorShape(input),
          GetTensorData<int8>(input), GetTensorShape(filter),
          GetTensorData<int8>(filter), GetTensorShape(bias),
          GetTensorData<int32>(bias), GetTensorShape(output),
          GetTensorData<int8>(output), GetTensorShape(im2col),
          GetTensorData<int8>(im2col),
          CpuBackendContext::GetFromContext(context));
      break;
    }
  }
}

template <KernelType kernel_type>
void EvalQuantizedPerChannel16x8(TfLiteContext* context, TfLiteNode* node,
                                 TfLiteConvParams* params, OpData* data,
                                 const TfLiteTensor* input,
                                 const TfLiteTensor* filter,
                                 const TfLiteTensor* bias, TfLiteTensor* output,
                                 TfLiteTensor* im2col) {
  ConvParams op_params;
  op_params.input_offset = -input->params.zero_point;
  op_params.output_offset = output->params.zero_point;
  op_params.stride_height = params->stride_height;
  op_params.stride_width = params->stride_width;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.padding_values.height = data->padding.height;
  op_params.padding_values.width = data->padding.width;
  op_params.quantized_activation_min = data->output_activation_min;
  op_params.quantized_activation_max = data->output_activation_max;

  KernelType effective_kernel_type = kernel_type;
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large. See b/178743262 for the detailed motivation.
  if (data->im2col_oversized) {
    effective_kernel_type = kReference;
  }

  // Grouped convolution is currently only supported by the reference kernel.
  if (data->groups != 1) {
    effective_kernel_type = kReference;
  }

  // To prevent 32-bit accumulator overflow for 16x8 quantization, the
  // optimized path is enabled only when all zero points are 0.
  bool has_non_zero_point = input->params.zero_point ||
                            filter->params.zero_point ||
                            output->params.zero_point;

  // Fall back to the reference kernel when the bias type is int64, as there is
  // no optimized kernel for int64 bias yet.
  if (bias && bias->type == kTfLiteInt64) {
    reference_integer_ops::ConvPerChannel(
        op_params, data->per_channel_output_multiplier.data(),
        data->per_channel_output_shift.data(), GetTensorShape(input),
        GetTensorData<int16>(input), GetTensorShape(filter),
        GetTensorData<int8>(filter), GetTensorShape(bias),
        GetTensorData<std::int64_t>(bias), GetTensorShape(output),
        GetTensorData<int16>(output));
  } else if (effective_kernel_type == kReference || has_non_zero_point) {
    reference_integer_ops::ConvPerChannel(
        op_params, data->per_channel_output_multiplier.data(),
        data->per_channel_output_shift.data(), GetTensorShape(input),
        GetTensorData<int16>(input), GetTensorShape(filter),
        GetTensorData<int8>(filter), GetTensorShape(bias),
        GetTensorData<std::int32_t>(bias), GetTensorShape(output),
        GetTensorData<int16>(output));
  } else {
    optimized_integer_ops::ConvPerChannel(
        op_params, data->per_channel_output_multiplier.data(),
        data->per_channel_output_shift.data(), GetTensorShape(input),
        GetTensorData<int16_t>(input), GetTensorShape(filter),
        GetTensorData<int8_t>(filter), GetTensorShape(bias),
        GetTensorData<std::int32_t>(bias), GetTensorShape(output),
        GetTensorData<int16_t>(output), GetTensorShape(im2col),
        GetTensorData<int16_t>(im2col),
        CpuBackendContext::GetFromContext(context));
  }
}

template <KernelType kernel_type>
void EvalFloat(TfLiteContext* context, TfLiteNode* node,
               TfLiteConvParams* params, OpData* data,
               const TfLiteTensor* input, const TfLiteTensor* filter,
               const TfLiteTensor* bias, TfLiteTensor* im2col,
               TfLiteTensor* hwcn_weights, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);
  KernelType effective_kernel_type = kernel_type;
  // Fall back to the optimized path if multi-threaded conv is unsupported.
  if ((kernel_type == kMultithreadOptimized) &&
      !data->supports_multithreaded_kernel) {
    effective_kernel_type = kGenericOptimized;
  }

  // When im2col is needed (which is implied when 'im2col_oversized' is true),
  // the GEMM-based optimized path requires im2col data to be allocated to
  // ensure correctness. Therefore, when im2col is disabled because of the
  // oversized temporary im2col tensor, a fallback to a non-optimized path is
  // needed. See b/178743262 for the detailed motivation.
  if (data->im2col_oversized) {
    effective_kernel_type = kReference;
#if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
    // As detailed by the tflite::multithreaded_ops::Conv implementation in
    // multithreaded_conv.h, the Eigen-based execution doesn't need im2col data.
    // Therefore, we can rely on it as a better-optimized fallback than the
    // reference one.
    if (data->supports_multithreaded_kernel) {
      effective_kernel_type = kMultithreadOptimized;
    }
#endif
  }

  // Grouped convolution is currently only supported by the reference kernel.
  if (data->groups != 1) {
    effective_kernel_type = kReference;
  }

  ConvParams op_params;
  op_params.padding_type = RuntimePaddingType(params->padding);
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;
  switch (effective_kernel_type) {
    case kReference: {
      reference_ops::Conv(op_params, GetTensorShape(input),
                          GetTensorData<float>(input), GetTensorShape(filter),
                          GetTensorData<float>(filter), GetTensorShape(bias),
                          GetTensorData<float>(bias), GetTensorShape(output),
                          GetTensorData<float>(output), GetTensorShape(im2col),
                          GetTensorData<float>(im2col));
      break;
    }
    case kCblasOptimized:
    case kGenericOptimized: {
      optimized_ops::Conv(op_params, GetTensorShape(input),
                          GetTensorData<float>(input), GetTensorShape(filter),
                          GetTensorData<float>(filter), GetTensorShape(bias),
                          GetTensorData<float>(bias), GetTensorShape(output),
                          GetTensorData<float>(output), GetTensorShape(im2col),
                          GetTensorData<float>(im2col),
                          CpuBackendContext::GetFromContext(context));
      break;
    }
    case kMultithreadOptimized: {
#if defined(TFLITE_WITH_MULTITHREADED_EIGEN)
      const float* filter_data;
      if (data->need_hwcn_weights) {
        filter_data = GetTensorData<float>(hwcn_weights);
      } else {
        filter_data = GetTensorData<float>(filter);
      }
      multithreaded_ops::Conv(
          *eigen_support::GetThreadPoolDevice(context), op_params,
          GetTensorShape(input), GetTensorData<float>(input),
          GetTensorShape(filter), filter_data, GetTensorShape(bias),
          GetTensorData<float>(bias), GetTensorShape(output),
          GetTensorData<float>(output), GetTensorShape(im2col),
          GetTensorData<float>(im2col));
      break;
#else   // !defined(TFLITE_WITH_MULTITHREADED_EIGEN)
      // See Register_CONV_2D: we should never be here when TFLITE_WITH_RUY
      // was enabled. We #if out this code in order to get the corresponding
      // binary size benefits.
      TFLITE_DCHECK(false);
#endif  // defined(TFLITE_WITH_MULTITHREADED_EIGEN)
    }
  }
}

template <KernelType kernel_type>
TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node,
                                  TfLiteConvParams* params, OpData* data,
                                  const TfLiteTensor* input,
                                  const TfLiteTensor* filter,
                                  const TfLiteTensor* bias,
                                  TfLiteTensor* im2col, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);

  const int batch_size = SizeOfDimension(input, 0);
  TF_LITE_ENSURE(context, batch_size != 0);
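  // Number of input elements per batch entry, i.e.
  // input_height * input_width * input_channels.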
  const int input_size = NumElements(input) / batch_size;
  TfLiteTensor* quantized_input_tensor;
  TF_LITE_ENSURE_OK(context,
                    GetTemporarySafe(context, node, data->input_quantized_index,
                                     &quantized_input_tensor));
  int8_t* quantized_input_ptr_batch =
      GetTensorData<int8_t>(quantized_input_tensor);
  TfLiteTensor* scaling_factors_tensor;
  TF_LITE_ENSURE_OK(context,
                    GetTemporarySafe(context, node, data->scaling_factors_index,
                                     &scaling_factors_tensor));
  float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);
  TfLiteTensor* input_offset_tensor;
  TF_LITE_ENSURE_OK(context,
                    GetTemporarySafe(context, node, data->input_offset_index,
                                     &input_offset_tensor));
  int32_t* input_offset_ptr = GetTensorData<int32_t>(input_offset_tensor);

  for (int b = 0; b < batch_size; ++b) {
    const int offset = b * input_size;
    tensor_utils::AsymmetricQuantizeFloats(
        GetTensorData<float>(input) + offset, input_size,
        quantized_input_ptr_batch + offset, &scaling_factors_ptr[b],
        &input_offset_ptr[b]);
  }

  int8_t* im2col_ptr = nullptr;
  int8_t* filter_ptr = nullptr;
  if (im2col != nullptr) {
    im2col_ptr = im2col->data.int8;
  }
  filter_ptr = filter->data.int8;
  const auto* affine_quantization =
      reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params);

  KernelType effective_kernel_type = kernel_type;
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large. See b/178743262 for the detailed motivation.
  if (data->im2col_oversized) {
    effective_kernel_type = kReference;
  }

  // Grouped convolution is currently only supported by the reference kernel.
  if (data->groups != 1) {
    effective_kernel_type = kReference;
  }

  ConvParams op_params;
  op_params.padding_type = PaddingType::kSame;
  op_params.padding_values.width = data->padding.width;
  op_params.padding_values.height = data->padding.height;
  op_params.dilation_width_factor = params->dilation_width_factor;
  op_params.dilation_height_factor = params->dilation_height_factor;
  op_params.stride_width = params->stride_width;
  op_params.stride_height = params->stride_height;
  op_params.float_activation_min = output_activation_min;
  op_params.float_activation_max = output_activation_max;
  switch (effective_kernel_type) {
    case kReference:
      reference_ops::HybridConvPerChannel(
          op_params, scaling_factors_ptr, GetTensorShape(input),
          quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
          GetTensorShape(bias), GetTensorData<float>(bias),
          GetTensorShape(output), GetTensorData<float>(output),
          GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
          input_offset_ptr);
      break;
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      TfLiteTensor* row_sums;
      TF_LITE_ENSURE_OK(
          context,
          GetTemporarySafe(context, node, data->row_sums_index, &row_sums));
      TfLiteTensor* scratch;
      TF_LITE_ENSURE_OK(
          context,
          GetTemporarySafe(context, node, data->accum_scratch_index, &scratch));
      optimized_ops::HybridConvPerChannel(
          op_params, scaling_factors_ptr, GetTensorShape(input),
          quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr,
          GetTensorShape(bias), GetTensorData<float>(bias),
          GetTensorShape(output), GetTensorData<float>(output),
          GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data,
          input_offset_ptr, GetTensorShape(scratch),
          GetTensorData<int32>(scratch), GetTensorData<int32_t>(row_sums),
          &data->compute_hybrid_row_sums,
          CpuBackendContext::GetFromContext(context));
      data->compute_hybrid_row_sums = false;
      break;
    }
  }

  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node,
                        TfLiteConvParams* params, OpData* data,
                        const TfLiteTensor* input, const TfLiteTensor* filter,
                        const TfLiteTensor* bias, TfLiteTensor* im2col,
                        TfLiteTensor* accum_scratch, TfLiteTensor* output) {
  float output_activation_min, output_activation_max;
  CalculateActivationRange(params->activation, &output_activation_min,
                           &output_activation_max);

  const int batch_size = SizeOfDimension(input, 0);
  TF_LITE_ENSURE(context, batch_size != 0);
  const int input_size = NumElements(input) / batch_size;

  const float* input_ptr = GetTensorData<float>(input);
  TfLiteTensor* quantized_input_tensor;
  TF_LITE_ENSURE_OK(context,
                    GetTemporarySafe(context, node, data->input_quantized_index,
                                     &quantized_input_tensor));
  int8_t* quantized_input_ptr_batch =
      GetTensorData<int8_t>(quantized_input_tensor);
  TfLiteTensor* scaling_factors_tensor;
  TF_LITE_ENSURE_OK(context,
                    GetTemporarySafe(context, node, data->scaling_factors_index,
                                     &scaling_factors_tensor));
  float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor);

  // Per-batch input quantization for higher accuracy.
  {
    ruy::profiler::ScopeLabel label("ConvHybridQuantizeInputs");
    for (int b = 0; b < batch_size; ++b) {
      float unused_min, unused_max;
      const int offset = b * input_size;
      tensor_utils::SymmetricQuantizeFloats(
          input_ptr + offset, input_size, quantized_input_ptr_batch + offset,
          &unused_min, &unused_max, &scaling_factors_ptr[b]);
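      // Folding the filter scale into the per-batch scaling factor lets the
      // kernel dequantize the int8 x int8 accumulators back to float with a
      // single multiply per row.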
      scaling_factors_ptr[b] *= filter->params.scale;
    }
  }

  switch (kernel_type) {
    case kReference:
    case kGenericOptimized:
    case kMultithreadOptimized:
    case kCblasOptimized: {
      // There is only one implementation for the hybrid kernel.
      ConvParams op_params;
      op_params.padding_type = PaddingType::kSame;
      op_params.padding_values.width = data->padding.width;
      op_params.padding_values.height = data->padding.height;
      op_params.stride_width = params->stride_width;
      op_params.stride_height = params->stride_height;
      op_params.dilation_width_factor = params->dilation_width_factor;
      op_params.dilation_height_factor = params->dilation_height_factor;
      op_params.float_activation_min = output_activation_min;
      op_params.float_activation_max = output_activation_max;
      if (data->groups == 1) {
        optimized_ops::HybridConv(
            op_params, scaling_factors_ptr, GetTensorShape(input),
            quantized_input_ptr_batch, GetTensorShape(filter),
            GetTensorData<int8_t>(filter), GetTensorShape(bias),
            GetTensorData<float>(bias), GetTensorShape(accum_scratch),
            GetTensorData<int32_t>(accum_scratch), GetTensorShape(output),
            GetTensorData<float>(output), GetTensorShape(im2col),
            GetTensorData<int8_t>(im2col),
            CpuBackendContext::GetFromContext(context));
      } else {
        // This case falls back to the per-channel hybrid group conv path and
        // shouldn't reach this branch.
        TF_LITE_KERNEL_LOG(
            context,
            "Group convolution currently not supported for hybrid kernel.");
        return kTfLiteError;
      }
      break;
    }
  }

  return kTfLiteOk;
}

template <KernelType kernel_type, TfLiteType input_type>
TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) {
  auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data);
  OpData* data = reinterpret_cast<OpData*>(node->user_data);

  TfLiteTensor* output;
  TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output));
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));
  const TfLiteTensor* filter;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter));
  bool has_bias = node->inputs->size == 3;
  const TfLiteTensor* bias = has_bias ? GetInput(context, node, 2) : nullptr;
  TfLiteTensor* im2col =
      data->need_im2col
          ? &context->tensors[node->temporaries->data[data->im2col_index]]
          : nullptr;
  TfLiteTensor* hwcn_weights =
      data->need_hwcn_weights
          ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]]
          : nullptr;

  if (data->need_hwcn_weights && !data->have_weights_been_transposed) {
    TransposeFloatTensor(filter, hwcn_weights);
    data->have_weights_been_transposed = true;
  }

  TFLITE_DCHECK_EQ(input_type, input->type);
  switch (input_type) {  // Already know in/out types are the same.
    case kTfLiteFloat32:
      if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) {
        if (data->is_hybrid_per_channel ||
            // TODO(b/162870360): Fall back to the PerChannel implementation
            // until we have grouped hybrid convolution.
            data->groups != 1) {
          TF_LITE_ENSURE_OK(context, EvalHybridPerChannel<kernel_type>(
                                         context, node, params, data, input,
                                         filter, bias, im2col, output));
        } else {
          TfLiteTensor* accum_scratch =
              &context->tensors[node->temporaries
                                    ->data[data->accum_scratch_index]];
          TF_LITE_ENSURE_OK(context,
                            EvalHybrid<kernel_type>(context, node, params, data,
                                                    input, filter, bias, im2col,
                                                    accum_scratch, output));
        }
      } else {
        EvalFloat<kernel_type>(context, node, params, data, input, filter, bias,
                               im2col, hwcn_weights, output);
      }
      break;
    case kTfLiteUInt8:
      EvalQuantized<kernel_type>(context, node, params, data, input, filter,
                                 bias, im2col, output);
      break;
    case kTfLiteInt8:
      EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input,
                                           filter, bias, output, im2col);
      break;
    case kTfLiteInt16:
      EvalQuantizedPerChannel16x8<kernel_type>(
          context, node, params, data, input, filter, bias, output, im2col);
      break;
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s currently not supported.",
                         TfLiteTypeGetName(input->type));
      return kTfLiteError;
  }
  return kTfLiteOk;
}

template <KernelType kernel_type>
TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) {
  const TfLiteTensor* input;
  TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input));

  switch (input->type) {
    case kTfLiteFloat32:
      return EvalImpl<kernel_type, kTfLiteFloat32>(context, node);
    case kTfLiteUInt8:
      return EvalImpl<kernel_type, kTfLiteUInt8>(context, node);
    case kTfLiteInt8:
      return EvalImpl<kernel_type, kTfLiteInt8>(context, node);
    case kTfLiteInt16:
      return EvalImpl<kernel_type, kTfLiteInt16>(context, node);
    default:
      TF_LITE_KERNEL_LOG(context, "Type %s not currently supported.",
                         TfLiteTypeGetName(input->type));
      return kTfLiteError;
  }
}

}  // namespace conv

TfLiteRegistration* Register_CONVOLUTION_REF() {
  static TfLiteRegistration r = {conv::Init, conv::Free,
                                 conv::Prepare<conv::kReference>,
                                 conv::Eval<conv::kReference>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free,
                                 conv::Prepare<conv::kGenericOptimized>,
                                 conv::Eval<conv::kGenericOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT_UINT8() {
  static TfLiteRegistration r = {
      conv::Init, conv::Free, conv::Prepare<conv::kGenericOptimized>,
      conv::EvalImpl<conv::kGenericOptimized, kTfLiteUInt8>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free,
                                 conv::Prepare<conv::kMultithreadOptimized>,
                                 conv::Eval<conv::kMultithreadOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() {
  static TfLiteRegistration r = {conv::Init, conv::Free,
                                 conv::Prepare<conv::kCblasOptimized>,
                                 conv::Eval<conv::kCblasOptimized>};
  return &r;
}

TfLiteRegistration* Register_CONV_2D() {
#if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV
  return Register_CONVOLUTION_CBLAS_OPT();
#elif defined TFLITE_WITH_MULTITHREADED_EIGEN
  return Register_CONVOLUTION_MULTITHREADED_OPT();
#else
  return Register_CONVOLUTION_GENERIC_OPT();
#endif
}

// Warning: Clients using this variant are responsible for ensuring that their
// models only need the UINT8 type. TFLite's op registration mechanism doesn't
// yet allow for more nuanced registration mechanisms.
TfLiteRegistration* Register_CONV_2D_UINT8() {
#if defined TFLITE_WITH_RUY
  // TFLITE_WITH_RUY optimizes the generic kernel type.
  return Register_CONVOLUTION_GENERIC_OPT_UINT8();
#else
  return Register_CONV_2D();
#endif
}

}  // namespace builtin
}  // namespace ops
}  // namespace tflite