1 | /* Copyright 2017 The TensorFlow Authors. All Rights Reserved. |
2 | |
3 | Licensed under the Apache License, Version 2.0 (the "License"); |
4 | you may not use this file except in compliance with the License. |
5 | You may obtain a copy of the License at |
6 | |
7 | http://www.apache.org/licenses/LICENSE-2.0 |
8 | |
9 | Unless required by applicable law or agreed to in writing, software |
10 | distributed under the License is distributed on an "AS IS" BASIS, |
11 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | See the License for the specific language governing permissions and |
13 | limitations under the License. |
14 | ==============================================================================*/ |
15 | #include "tensorflow/lite/kernels/internal/optimized/integer_ops/conv.h" |
16 | |
17 | #include <stddef.h> |
18 | |
19 | #include <cstdint> |
20 | #include <vector> |
21 | |
22 | // Only use multi-threaded Eigen if ruy is disabled. |
23 | #if !defined(TFLITE_WITH_RUY) |
24 | #define TFLITE_WITH_MULTITHREADED_EIGEN |
25 | #endif |
26 | |
27 | #include "tensorflow/lite/c/builtin_op_data.h" |
28 | #include "tensorflow/lite/c/common.h" |
29 | #include "tensorflow/lite/kernels/cpu_backend_context.h" |
30 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
31 | #include "tensorflow/lite/kernels/eigen_support.h" |
32 | #endif |
33 | #include "tensorflow/lite/kernels/internal/compatibility.h" |
34 | #include "tensorflow/lite/kernels/internal/types.h" |
35 | // b/131835803 forces us to include multithreaded_conv.h before optimized_ops.h |
36 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
37 | #include "tensorflow/lite/kernels/internal/optimized/multithreaded_conv.h" |
38 | #endif |
39 | #include "tensorflow/lite/kernels/internal/optimized/optimized_ops.h" |
40 | #include "tensorflow/lite/kernels/internal/quantization_util.h" |
41 | #include "tensorflow/lite/kernels/internal/reference/conv.h" |
42 | #include "tensorflow/lite/kernels/internal/reference/integer_ops/conv.h" |
43 | #include "tensorflow/lite/kernels/internal/tensor.h" |
44 | #include "tensorflow/lite/kernels/internal/tensor_ctypes.h" |
45 | #include "tensorflow/lite/kernels/internal/tensor_utils.h" |
46 | #include "tensorflow/lite/kernels/kernel_util.h" |
47 | #include "tensorflow/lite/kernels/padding.h" |
48 | #include "tensorflow/lite/util.h" |
49 | |
50 | namespace tflite { |
51 | namespace ops { |
52 | namespace builtin { |
53 | namespace conv { |
54 | |
// This file has 4 implementations of Conv.
56 | enum KernelType { |
57 | kReference, |
58 | kGenericOptimized, // Neon-free |
59 | // kMultithreadOptimized is a mixture of an Eigen-based kernel when threads |
60 | // are available and kGenericOptimized when we must use only one thread. |
61 | kMultithreadOptimized, |
  // The kernel uses the CBLAS interface for matrix multiplication.
  // It's fast when an optimized CBLAS implementation is available (e.g. the
  // Apple Accelerate framework), and slow when falling back to a naive
  // implementation.
66 | kCblasOptimized, |
67 | }; |
68 | |
69 | const int kTensorNotAllocated = -1; |
70 | |
71 | static constexpr size_t kMaxIm2colBufferSizeMobile = 1024 * 1024 * 1024; // 1GB |
72 | |
73 | struct OpData { |
74 | // IDs are the arbitrary identifiers used by TF Lite to identify and access |
75 | // memory buffers. |
76 | int im2col_id = kTensorNotAllocated; |
77 | int hwcn_weights_id = kTensorNotAllocated; |
78 | int input_quantized_id = kTensorNotAllocated; |
79 | int scaling_factors_id = kTensorNotAllocated; |
80 | int input_offset_id = kTensorNotAllocated; |
81 | int accum_scratch_id = kTensorNotAllocated; |
82 | // Row sums are used to cache filter sums for hybrid zero-point calculations. |
83 | int row_sums_id = kTensorNotAllocated; |
84 | |
85 | TfLitePaddingValues padding; |
86 | // The scaling factor from input to output (aka the 'real multiplier') can |
87 | // be represented as a fixed point multiplier plus a left shift. |
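  // Illustrative example (assuming the usual normalization of the multiplier
  // into [0.5, 1)): a real multiplier of 0.75 = input_scale * filter_scale /
  // output_scale would be stored roughly as output_multiplier =
  // round(0.75 * 2^31) and output_shift = 0, since 0.75 already lies in
  // [0.5, 1).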
88 | int32_t output_multiplier; |
89 | int output_shift; |
90 | |
91 | // Per channel output multiplier and shift. |
92 | std::vector<int32_t> per_channel_output_multiplier; |
93 | std::vector<int> per_channel_output_shift; |
94 | |
95 | // The range of the fused activation layer. For example for kNone and |
96 | // uint8_t these would be 0 and 255. |
97 | int32_t output_activation_min; |
98 | int32_t output_activation_max; |
  // Indexes are the offsets into the node's temporaries array that is used to
  // keep track of the allocated temporary tensors.
101 | int32_t im2col_index; |
102 | int32_t hwcn_weights_index; |
103 | int32_t input_quantized_index; |
104 | int32_t scaling_factors_index; |
105 | int32_t accum_scratch_index; |
106 | int32_t input_offset_index; |
107 | int32_t row_sums_index; |
108 | |
109 | bool need_hwcn_weights = false; |
110 | bool have_weights_been_transposed = false; |
111 | bool need_im2col = false; |
  // If true, im2col is needed but gets disabled because the temporary im2col
  // tensor requires too much memory (i.e. >= kMaxIm2colBufferSizeMobile).
115 | bool im2col_oversized = false; |
116 | |
117 | bool supports_multithreaded_kernel = false; |
118 | bool is_hybrid_per_channel = false; |
119 | bool compute_hybrid_row_sums = true; |
120 | |
121 | // Number of convolution groups. |
122 | int32_t groups = 1; |
123 | }; |
124 | |
125 | inline PaddingType RuntimePaddingType(TfLitePadding padding) { |
126 | switch (padding) { |
127 | case TfLitePadding::kTfLitePaddingSame: |
128 | return PaddingType::kSame; |
129 | case TfLitePadding::kTfLitePaddingValid: |
130 | return PaddingType::kValid; |
131 | case TfLitePadding::kTfLitePaddingUnknown: |
132 | default: |
133 | return PaddingType::kNone; |
134 | } |
135 | } |
136 | |
137 | void* Init(TfLiteContext* context, const char* buffer, size_t length) { |
138 | // This is a builtin op, so we don't use the contents in 'buffer', if any. |
139 | // Instead, we allocate a new object to use as scratch space for im2col, and |
140 | // to carry information from Prepare() to Eval(). |
141 | auto* data = new OpData; |
142 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
143 | eigen_support::IncrementUsageCounter(context); |
144 | #endif |
145 | return data; |
146 | } |
147 | |
148 | void Free(TfLiteContext* context, void* buffer) { |
149 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
150 | eigen_support::DecrementUsageCounter(context); |
151 | #endif |
152 | delete reinterpret_cast<OpData*>(buffer); |
153 | } |
154 | |
155 | // Naive implementation of transpose for floats. Could be optimized to be more |
156 | // cache friendly, but for now it's a one-time cost on first run, and we would |
157 | // prefer to remove the need to do this at all eventually. |
158 | void TransposeFloatTensor(const TfLiteTensor* input, TfLiteTensor* output) { |
159 | const int rows = output->dims->data[1]; |
160 | const int cols = output->dims->data[0]; |
161 | const float* input_data = GetTensorData<float>(input); |
162 | float* output_data = GetTensorData<float>(output); |
163 | for (int i = 0; i < rows; ++i) { |
164 | for (int j = 0; j < cols; ++j) { |
165 | const float in_value = input_data[i * cols + j]; |
166 | output_data[j * rows + i] = in_value; |
167 | } |
168 | } |
169 | } |
170 | |
// Check if im2col needs to be allocated, as some versions of the optimized
// Conv don't use it. If any change adds im2col support to any of the Conv
// versions, this function should be updated as well.
174 | bool IsIm2ColRequired(const TfLiteTensor* input, TfLiteConvParams* params, |
175 | const TfLiteTensor* filter, OpData* data, bool is_hybrid, |
176 | KernelType kernel_type) { |
  // If HWCN weights are required, im2col is not required.
178 | if (data->need_hwcn_weights) return false; |
179 | |
  // Segregate based on dilated vs. non-dilated conv.
181 | const bool need_dilated_im2col = |
182 | params->dilation_width_factor != 1 || params->dilation_height_factor != 1; |
183 | const bool need_non_dilated_im2col = |
184 | params->stride_width != 1 || params->stride_height != 1 || |
185 | filter->dims->data[2] != 1 || filter->dims->data[1] != 1; |
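  // In other words, im2col can only be skipped for a 1x1 filter with stride 1
  // and no dilation, where the convolution reduces to a plain matrix multiply.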
186 | |
187 | const bool need_im2col = need_dilated_im2col || need_non_dilated_im2col; |
188 | |
  // Return early, as the basic requirement is not met.
190 | if (!need_im2col) return false; |
191 | |
192 | switch (kernel_type) { |
193 | case kReference: |
194 | if (is_hybrid) { |
195 | return true; |
196 | } else { |
197 | return false; |
198 | } |
199 | case kGenericOptimized: |
200 | case kCblasOptimized: |
      // These kernels always use im2col when it is needed (checked above).
202 | return true; |
203 | case kMultithreadOptimized: |
204 | if (input->type == kTfLiteUInt8 || // |
205 | input->type == kTfLiteInt8 || // |
206 | input->type == kTfLiteInt16 || // quantized. |
207 | !data->supports_multithreaded_kernel) { |
208 | return true; |
209 | } else { |
210 | return false; |
211 | } |
212 | default: |
213 | return false; |
214 | } |
215 | } |
216 | |
// Allocate temporary tensors (`im2col`, `hwcn_weights`) if necessary.
// Note: `context->AddTensors` might invalidate pointers to existing tensors.
// Therefore the logic to add tensors is isolated into this function.
220 | static TfLiteStatus AllocateTemporaryTensorsIfRequired( |
221 | TfLiteContext* context, TfLiteNode* node, bool is_hybrid, |
222 | bool is_per_channel, KernelType kernel_type, size_t im2col_bytes) { |
223 | auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); |
224 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
225 | |
226 | TF_LITE_ENSURE(context, node->inputs->size >= 2); |
227 | const TfLiteTensor* input; |
228 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
229 | const TfLiteTensor* filter; |
230 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter)); |
231 | |
232 | // If we're using the optimized multithreaded EigenTensor implementation of |
233 | // convolution, it expects the filter weights to be transposed compared to |
234 | // the normal TF Lite buffer format. Typical TF Lite weights are |
235 | // [filter_count, filter_height, filter_width, input_depth], but for the float |
236 | // implementation we need them as [filter_height, filter_width, input_depth, |
237 | // filter_count]. We get to that format by transposing, and create a temporary |
238 | // buffer to store the results. |
239 | // This path is only used for float processing, so only create the buffer if |
240 | // we're running with that data type. |
241 | data->need_hwcn_weights = |
242 | input->type == kTfLiteFloat32 && data->supports_multithreaded_kernel; |
243 | |
244 | // We don't always need to allocate im2col. It is only used in some versions |
245 | // of the optimized Conv. This test just mimics something that happens inside |
246 | // optimized_ops.h, in order to avoid a DCHECK(!im2col_data). |
247 | data->need_im2col = |
248 | IsIm2ColRequired(input, params, filter, data, is_hybrid, kernel_type); |
249 | |
  // If im2col_oversized is found to be true, we have to fall back to an
  // execution path (like kReference in the float/quantized cases) that doesn't
  // require the im2col operation. Therefore, we have to skip this check for
  // the hybrid case (but not the hybrid-per-channel one), where there is no
  // such fallback execution path.
255 | // TODO(b/178743262): Consider making this check conditioned on the available |
256 | // memory of the system, rather than coupling to the mobile platform check. |
257 | if (IsMobilePlatform() && !(is_hybrid && !is_per_channel) && |
258 | data->need_im2col && im2col_bytes >= kMaxIm2colBufferSizeMobile) { |
259 | data->need_im2col = false; |
260 | data->im2col_oversized = true; |
261 | } |
262 | int temporaries_count = 0; |
263 | if (data->need_im2col) { |
264 | data->im2col_index = temporaries_count; |
265 | if (data->im2col_id == kTensorNotAllocated) { |
266 | context->AddTensors(context, 1, &data->im2col_id); |
267 | } |
268 | ++temporaries_count; |
269 | } |
270 | if (data->need_hwcn_weights) { |
271 | data->hwcn_weights_index = temporaries_count; |
272 | if (data->hwcn_weights_id == kTensorNotAllocated) { |
273 | context->AddTensors(context, 1, &data->hwcn_weights_id); |
274 | } |
275 | ++temporaries_count; |
276 | } |
277 | |
278 | if (is_hybrid) { |
279 | // Allocate tensor to store the on-the-fly quantized inputs. |
280 | data->input_quantized_index = temporaries_count; |
281 | if (data->input_quantized_id == kTensorNotAllocated) { |
282 | TF_LITE_ENSURE_OK( |
283 | context, context->AddTensors(context, 1, &data->input_quantized_id)); |
284 | } |
285 | ++temporaries_count; |
286 | |
287 | // Allocate tensor to store the quantization params computed during |
288 | // on-the-fly input quantization. |
289 | data->scaling_factors_index = temporaries_count; |
290 | if (data->scaling_factors_id == kTensorNotAllocated) { |
291 | TF_LITE_ENSURE_OK( |
292 | context, context->AddTensors(context, 1, &data->scaling_factors_id)); |
293 | } |
294 | ++temporaries_count; |
295 | |
296 | // Allocate tensor to store the accumulators for the matrix multiply. |
297 | data->accum_scratch_index = temporaries_count; |
298 | if (data->accum_scratch_id == kTensorNotAllocated) { |
299 | TF_LITE_ENSURE_OK( |
300 | context, context->AddTensors(context, 1, &data->accum_scratch_id)); |
301 | } |
302 | ++temporaries_count; |
303 | if (is_per_channel) { |
304 | data->input_offset_index = temporaries_count; |
305 | if (data->input_offset_id == kTensorNotAllocated) { |
306 | TF_LITE_ENSURE_OK( |
307 | context, context->AddTensors(context, 1, &data->input_offset_id)); |
308 | } |
309 | ++temporaries_count; |
310 | |
311 | data->row_sums_index = temporaries_count; |
312 | if (data->row_sums_id == kTensorNotAllocated) { |
313 | TF_LITE_ENSURE_OK(context, |
314 | context->AddTensors(context, 1, &data->row_sums_id)); |
315 | } |
316 | ++temporaries_count; |
317 | } |
318 | } |
319 | |
320 | TfLiteIntArrayFree(node->temporaries); |
321 | node->temporaries = TfLiteIntArrayCreate(temporaries_count); |
322 | |
323 | return kTfLiteOk; |
324 | } |
325 | |
326 | TfLiteStatus Prepare(KernelType kernel_type, TfLiteContext* context, |
327 | TfLiteNode* node) { |
328 | auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); |
329 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
330 | |
331 | bool has_bias = node->inputs->size == 3; |
332 | // Check number of inputs/outputs |
333 | TF_LITE_ENSURE(context, has_bias || node->inputs->size == 2); |
334 | TF_LITE_ENSURE_EQ(context, node->outputs->size, 1); |
335 | TfLiteTensor* output; |
336 | TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); |
337 | const TfLiteTensor* input; |
338 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
339 | const TfLiteTensor* filter; |
340 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter)); |
341 | |
342 | // Check dimensionality of input, filter |
343 | TF_LITE_ENSURE_EQ(context, input->dims->size, 4); |
344 | TF_LITE_ENSURE_EQ(context, filter->dims->size, 4); |
345 | // Check input channels matching filter |
346 | // Filter input channel can be a factor of channels of input (grouped conv) |
347 | // or equals (normal conv). |
348 | auto input_channel = input->dims->data[3]; |
349 | auto filter_input_channel = filter->dims->data[3]; |
350 | TF_LITE_ENSURE_EQ(context, input_channel % filter_input_channel, 0); |
351 | data->groups = input_channel / filter_input_channel; |
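  // For example, an input with 8 channels and a filter consuming 2 input
  // channels yields groups = 4; a normal (ungrouped) conv has groups = 1.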
352 | |
353 | // Check types. (We assume that UINT8 refers to quantized tensors) |
354 | TfLiteType input_type = input->type; |
355 | TF_LITE_ENSURE(context, |
356 | input_type == kTfLiteFloat32 || input_type == kTfLiteUInt8 || |
357 | input_type == kTfLiteInt8 || input_type == kTfLiteInt16); |
358 | TF_LITE_ENSURE_TYPES_EQ(context, output->type, input_type); |
359 | |
360 | if (input_type == kTfLiteInt16) { |
361 | TF_LITE_ENSURE_EQ(context, input->params.zero_point, 0); |
362 | TF_LITE_ENSURE_EQ(context, output->params.zero_point, 0); |
363 | } |
  // For per-channel quantization, all filter zero points must be zero.
365 | if (input_type == kTfLiteInt16 || input_type == kTfLiteInt8) { |
366 | TF_LITE_ENSURE_EQ(context, filter->quantization.type, |
367 | kTfLiteAffineQuantization); |
368 | const auto* affine_quantization = |
369 | reinterpret_cast<TfLiteAffineQuantization*>( |
370 | filter->quantization.params); |
371 | for (int i = 0; i < affine_quantization->zero_point->size; ++i) { |
372 | TF_LITE_ENSURE_EQ(context, affine_quantization->zero_point->data[i], 0); |
373 | } |
374 | } |
375 | |
376 | const TfLiteTensor* bias = nullptr; |
377 | |
378 | // TODO(ahentz): At this point the optimized versions require 'bias'. We can |
379 | // either change that or document that convolution requires it. |
380 | TF_LITE_ENSURE(context, has_bias); |
381 | |
382 | if (has_bias) { |
383 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 2, &bias)); |
384 | if (input_type == kTfLiteUInt8 || input_type == kTfLiteInt8) { |
385 | TF_LITE_ENSURE_TYPES_EQ(context, bias->type, kTfLiteInt32); |
386 | TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); |
387 | } else if (input_type == kTfLiteInt16) { |
388 | TF_LITE_ENSURE(context, (bias->type == kTfLiteInt32) || |
389 | (bias->type == kTfLiteInt64)); |
390 | TF_LITE_ENSURE_EQ(context, bias->params.zero_point, 0); |
391 | } else { |
392 | TF_LITE_ENSURE_TYPES_EQ(context, bias->type, input_type); |
393 | } |
394 | TF_LITE_ENSURE_EQ(context, NumElements(bias), SizeOfDimension(filter, 0)); |
395 | } |
396 | |
397 | const bool is_hybrid = |
398 | (input->type == kTfLiteFloat32 && |
399 | (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8)); |
400 | |
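  // Detect hybrid per-channel quantization: the int8 filter carries more than
  // one affine-quantization scale, and not all scales are equal. That case is
  // routed to a dedicated per-channel hybrid path in Eval.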
401 | if (is_hybrid && filter->type == kTfLiteInt8 && |
402 | filter->quantization.type == kTfLiteAffineQuantization && |
403 | filter->quantization.params && |
404 | reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params) |
405 | ->scale && |
406 | reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params) |
407 | ->scale->size > 1) { |
408 | const auto* affine_quantization = |
409 | reinterpret_cast<TfLiteAffineQuantization*>( |
410 | filter->quantization.params); |
411 | const float scale = affine_quantization->scale->data[0]; |
412 | for (int i = 1; i < affine_quantization->scale->size; i++) { |
413 | if (affine_quantization->scale->data[i] != scale) { |
414 | data->is_hybrid_per_channel = true; |
415 | break; |
416 | } |
417 | } |
418 | } |
419 | |
420 | // The multi-threaded kernel supports neither dilation nor hybrid kernels, and |
421 | // is incompatible with mutable input filters that might change between evals. |
422 | data->supports_multithreaded_kernel = |
423 | (kernel_type == kMultithreadOptimized) && |
424 | (context->recommended_num_threads != 1) && !is_hybrid && |
425 | (params->dilation_width_factor == 1) && |
426 | (params->dilation_height_factor == 1) && |
427 | (filter->allocation_type != kTfLiteArenaRw) && !IsDynamicTensor(filter); |
428 | |
429 | int channels_in = filter->dims->data[3]; |
430 | int channels_out = filter->dims->data[0]; |
431 | int width = input->dims->data[2]; |
432 | int height = input->dims->data[1]; |
433 | int filter_width = filter->dims->data[2]; |
434 | int filter_height = filter->dims->data[1]; |
435 | int batches = input->dims->data[0]; |
436 | |
437 | // Matching GetWindowedOutputSize in TensorFlow. |
438 | auto padding = params->padding; |
439 | int out_width, out_height; |
440 | data->padding = ComputePaddingHeightWidth( |
441 | params->stride_height, params->stride_width, |
442 | params->dilation_height_factor, params->dilation_width_factor, height, |
443 | width, filter_height, filter_width, padding, &out_height, &out_width); |
444 | |
445 | size_t im2col_type_size; |
446 | TF_LITE_ENSURE_STATUS(GetSizeOfType(context, input->type, &im2col_type_size)); |
447 | // Note that we intentionally promote the first multiplicand (i.e. 'batches') |
448 | // to 'size_t' to avoid integer overflow here. |
449 | const size_t im2col_bytes = static_cast<size_t>(batches) * out_height * |
450 | out_width * channels_in * filter_height * |
451 | filter_width * im2col_type_size; |
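  // This corresponds to an im2col buffer of shape [batches, out_height,
  // out_width, filter_height * filter_width * channels_in], with
  // im2col_type_size bytes per element (see the im2col resize logic below).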
452 | TF_LITE_ENSURE_STATUS(AllocateTemporaryTensorsIfRequired( |
453 | context, node, is_hybrid, data->is_hybrid_per_channel, kernel_type, |
454 | im2col_bytes)); |
455 | |
456 | TF_LITE_ENSURE(context, has_bias); |
457 | |
458 | // Note that full fixed-point inference requires that all tensors have their |
459 | // parameters set. This is usually done during quantized training or |
460 | // calibration. |
461 | if (input_type != kTfLiteFloat32) { |
462 | TF_LITE_ENSURE_EQ(context, filter->quantization.type, |
463 | kTfLiteAffineQuantization); |
464 | const auto* affine_quantization = |
465 | reinterpret_cast<TfLiteAffineQuantization*>( |
466 | filter->quantization.params); |
467 | TF_LITE_ENSURE(context, affine_quantization); |
468 | TF_LITE_ENSURE(context, affine_quantization->scale); |
469 | TF_LITE_ENSURE(context, (affine_quantization->scale->size == 1 || |
470 | affine_quantization->scale->size == channels_out)); |
471 | |
472 | data->per_channel_output_multiplier.resize(channels_out); |
473 | data->per_channel_output_shift.resize(channels_out); |
474 | TF_LITE_ENSURE_STATUS(tflite::PopulateConvolutionQuantizationParams( |
475 | context, input, filter, bias, output, params->activation, |
476 | &data->output_multiplier, &data->output_shift, |
477 | &data->output_activation_min, &data->output_activation_max, |
478 | data->per_channel_output_multiplier.data(), |
479 | data->per_channel_output_shift.data(), channels_out)); |
480 | } |
481 | |
482 | TfLiteIntArray* output_size = TfLiteIntArrayCreate(4); |
483 | output_size->data[0] = batches; |
484 | output_size->data[1] = out_height; |
485 | output_size->data[2] = out_width; |
486 | output_size->data[3] = channels_out; |
487 | auto output_status = context->ResizeTensor(context, output, output_size); |
488 | |
489 | if (output_status != kTfLiteOk) return output_status; |
490 | |
491 | if (data->need_im2col) { |
492 | node->temporaries->data[data->im2col_index] = data->im2col_id; |
493 | |
494 | TfLiteIntArray* im2col_size = TfLiteIntArrayCreate(4); |
495 | |
496 | auto filter_input_channel = filter->dims->data[3]; |
497 | im2col_size->data[0] = output_size->data[0]; |
498 | im2col_size->data[1] = output_size->data[1]; |
499 | im2col_size->data[2] = output_size->data[2]; |
500 | im2col_size->data[3] = filter_input_channel * filter_height * filter_width; |
501 | |
502 | TfLiteTensor* im2col = |
503 | &context->tensors[node->temporaries->data[data->im2col_index]]; |
504 | im2col->type = input->type; |
505 | if (is_hybrid) { |
506 | im2col->type = filter->type; |
507 | } |
508 | im2col->allocation_type = kTfLiteArenaRw; |
509 | auto im2col_status = context->ResizeTensor(context, im2col, im2col_size); |
510 | if (im2col_status != kTfLiteOk) return im2col_status; |
511 | } |
512 | |
513 | if (data->need_hwcn_weights) { |
514 | node->temporaries->data[data->hwcn_weights_index] = data->hwcn_weights_id; |
515 | TfLiteIntArray* hwcn_weights_size = TfLiteIntArrayCreate(2); |
516 | |
517 | // Because we're treating the filter weights as a matrix when we do the |
518 | // transpose, we allocate the buffer with a two-dimensional shape, where one |
519 | // dimension is the number of elements in each filter, and the second is the |
520 | // total number of filters. |
521 | auto filter_input_channel = filter->dims->data[3]; |
522 | hwcn_weights_size->data[0] = |
523 | (filter_height * filter_width * filter_input_channel); |
524 | hwcn_weights_size->data[1] = channels_out; |
525 | |
526 | TfLiteTensor* hwcn_weights = |
527 | &context->tensors[node->temporaries->data[data->hwcn_weights_index]]; |
528 | hwcn_weights->type = input_type; |
    hwcn_weights->name = "Conv_hwcn_weights";
530 | hwcn_weights->allocation_type = kTfLiteArenaRwPersistent; |
531 | |
532 | auto hwcn_weights_status = |
533 | context->ResizeTensor(context, hwcn_weights, hwcn_weights_size); |
534 | if (hwcn_weights_status != kTfLiteOk) return hwcn_weights_status; |
535 | |
536 | // TODO(petewarden): If Resize() is called when the size hasn't actually |
537 | // changed, this will do extra redundant work. |
538 | data->have_weights_been_transposed = false; |
539 | } |
540 | |
541 | if (is_hybrid) { |
542 | node->temporaries->data[data->input_quantized_index] = |
543 | data->input_quantized_id; |
544 | TfLiteTensor* input_quantized; |
545 | TF_LITE_ENSURE_OK( |
546 | context, GetTemporarySafe(context, node, data->input_quantized_index, |
547 | &input_quantized)); |
548 | input_quantized->type = kTfLiteInt8; |
549 | input_quantized->allocation_type = kTfLiteArenaRw; |
550 | if (!TfLiteIntArrayEqual(input_quantized->dims, input->dims)) { |
551 | TfLiteIntArray* input_quantized_size = TfLiteIntArrayCopy(input->dims); |
552 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_quantized, |
553 | input_quantized_size)); |
554 | } |
555 | |
556 | node->temporaries->data[data->scaling_factors_index] = |
557 | data->scaling_factors_id; |
558 | TfLiteTensor* scaling_factors; |
559 | TF_LITE_ENSURE_OK( |
560 | context, GetTemporarySafe(context, node, data->scaling_factors_index, |
561 | &scaling_factors)); |
562 | scaling_factors->type = kTfLiteFloat32; |
563 | scaling_factors->allocation_type = kTfLiteArenaRw; |
564 | // Only one scale factor per batch is typically necessary. See optimized |
565 | // implementation for why we need to allocate for the height of the inputs |
566 | // flattened to 2D. |
567 | TF_LITE_ENSURE(context, channels_in != 0); |
568 | const int height = NumElements(input) / channels_in; |
569 | int scaling_dims[1] = {height}; |
570 | if (!TfLiteIntArrayEqualsArray(scaling_factors->dims, 1, scaling_dims)) { |
571 | TfLiteIntArray* scaling_factors_size = TfLiteIntArrayCreate(1); |
572 | scaling_factors_size->data[0] = height; |
573 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, scaling_factors, |
574 | scaling_factors_size)); |
575 | } |
576 | |
577 | node->temporaries->data[data->accum_scratch_index] = data->accum_scratch_id; |
578 | TfLiteTensor* accum_scratch; |
579 | TF_LITE_ENSURE_OK(context, |
580 | GetTemporarySafe(context, node, data->accum_scratch_index, |
581 | &accum_scratch)); |
582 | accum_scratch->type = kTfLiteInt32; |
583 | accum_scratch->allocation_type = kTfLiteArenaRw; |
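    // The scratch buffer holds one int32 accumulator per output element, laid
    // out as [channels_out, batches * out_height * out_width].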
584 | const int scratch_width = batches * out_height * out_width; |
585 | int accum_scratch_dims[2] = {channels_out, scratch_width}; |
586 | if (!TfLiteIntArrayEqualsArray(accum_scratch->dims, 2, |
587 | accum_scratch_dims)) { |
588 | TfLiteIntArray* accum_scratch_size = TfLiteIntArrayCreate(2); |
589 | accum_scratch_size->data[0] = channels_out; |
590 | accum_scratch_size->data[1] = scratch_width; |
591 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, accum_scratch, |
592 | accum_scratch_size)); |
593 | } |
594 | |
595 | if (data->is_hybrid_per_channel) { |
596 | const auto* affine_quantization = |
597 | reinterpret_cast<TfLiteAffineQuantization*>( |
598 | filter->quantization.params); |
599 | TF_LITE_ENSURE_EQ( |
600 | context, affine_quantization->scale->size, |
601 | filter->dims->data[affine_quantization->quantized_dimension]); |
602 | node->temporaries->data[data->input_offset_index] = data->input_offset_id; |
603 | TfLiteTensor* input_offsets; |
604 | TF_LITE_ENSURE_OK( |
605 | context, GetTemporarySafe(context, node, data->input_offset_index, |
606 | &input_offsets)); |
607 | input_offsets->type = kTfLiteInt32; |
608 | input_offsets->allocation_type = kTfLiteArenaRw; |
609 | // See above comment for the need to allocate for height of inputs. |
610 | TF_LITE_ENSURE(context, channels_in != 0); |
611 | const int height = NumElements(input) / channels_in; |
612 | const int input_offset_dims[1] = {height}; |
613 | if (!TfLiteIntArrayEqualsArray(input_offsets->dims, 1, |
614 | input_offset_dims)) { |
615 | TfLiteIntArray* input_offsets_size = TfLiteIntArrayCreate(1); |
616 | input_offsets_size->data[0] = input_offset_dims[0]; |
617 | TF_LITE_ENSURE_OK(context, context->ResizeTensor(context, input_offsets, |
618 | input_offsets_size)); |
619 | } |
620 | node->temporaries->data[data->row_sums_index] = data->row_sums_id; |
621 | TfLiteTensor* row_sums; |
622 | TF_LITE_ENSURE_OK( |
623 | context, |
624 | GetTemporarySafe(context, node, data->row_sums_index, &row_sums)); |
625 | row_sums->type = kTfLiteInt32; |
      row_sums->name = "Conv_row_sums";
627 | row_sums->allocation_type = kTfLiteArenaRwPersistent; |
      // One row sum is cached per output channel (i.e. per filter).
629 | const int row_sums_dims[1] = {channels_out}; |
630 | if (!TfLiteIntArrayEqualsArray(row_sums->dims, 1, row_sums_dims)) { |
631 | TfLiteIntArray* row_sums_size = TfLiteIntArrayCreate(1); |
632 | row_sums_size->data[0] = row_sums_dims[0]; |
633 | TF_LITE_ENSURE_OK( |
634 | context, context->ResizeTensor(context, row_sums, row_sums_size)); |
635 | } |
636 | } |
637 | } |
638 | return kTfLiteOk; |
639 | } |
640 | |
641 | template <KernelType kernel_type> |
642 | TfLiteStatus Prepare(TfLiteContext* context, TfLiteNode* node) { |
643 | return Prepare(kernel_type, context, node); |
644 | } |
645 | |
646 | template <KernelType kernel_type> |
647 | void EvalQuantized(TfLiteContext* context, TfLiteNode* node, |
648 | TfLiteConvParams* params, OpData* data, |
649 | const TfLiteTensor* input, const TfLiteTensor* filter, |
650 | const TfLiteTensor* bias, TfLiteTensor* im2col, |
651 | TfLiteTensor* output) { |
652 | auto input_offset = -input->params.zero_point; |
653 | auto filter_offset = -filter->params.zero_point; |
654 | auto output_offset = output->params.zero_point; |
655 | |
656 | KernelType effective_kernel_type; |
657 | if ((kernel_type == kMultithreadOptimized || |
658 | kernel_type == kCblasOptimized) && |
659 | (params->dilation_width_factor != 1 || |
660 | params->dilation_height_factor != 1)) { |
661 | // kMultithreadOptimized and kCblasOptimized do not support dilation. |
    // Therefore, fall back to the generic optimized kernel.
663 | effective_kernel_type = kGenericOptimized; |
664 | } else { |
665 | effective_kernel_type = kernel_type; |
666 | } |
667 | |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
670 | // See b/178743262 for the detailed motivation. |
671 | if (data->im2col_oversized) { |
672 | effective_kernel_type = kReference; |
673 | } |
674 | |
  // Grouped convolution is currently only supported by the reference kernel.
676 | if (data->groups != 1) { |
677 | effective_kernel_type = kReference; |
678 | } |
679 | |
680 | ConvParams op_params; |
681 | op_params.padding_type = PaddingType::kSame; |
682 | op_params.padding_values.width = data->padding.width; |
683 | op_params.padding_values.height = data->padding.height; |
684 | op_params.dilation_width_factor = params->dilation_width_factor; |
685 | op_params.dilation_height_factor = params->dilation_height_factor; |
686 | op_params.stride_width = params->stride_width; |
687 | op_params.stride_height = params->stride_height; |
688 | op_params.input_offset = input_offset; |
689 | op_params.weights_offset = filter_offset; |
690 | op_params.output_offset = output_offset; |
691 | op_params.output_multiplier = data->output_multiplier; |
692 | op_params.output_shift = -data->output_shift; |
693 | op_params.quantized_activation_min = data->output_activation_min; |
694 | op_params.quantized_activation_max = data->output_activation_max; |
695 | switch (effective_kernel_type) { |
696 | case kReference: { |
697 | reference_ops::Conv( |
698 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
699 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
700 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
701 | GetTensorShape(output), GetTensorData<uint8_t>(output), |
702 | GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), |
703 | /* cpu_backend_context = */ nullptr); |
704 | break; |
705 | } |
706 | case kGenericOptimized: |
707 | case kMultithreadOptimized: |
708 | case kCblasOptimized: { |
709 | // There is only one optimized implementation for Quantized Conv. |
710 | optimized_ops::Conv( |
711 | op_params, GetTensorShape(input), GetTensorData<uint8_t>(input), |
712 | GetTensorShape(filter), GetTensorData<uint8_t>(filter), |
713 | GetTensorShape(bias), GetTensorData<int32_t>(bias), |
714 | GetTensorShape(output), GetTensorData<uint8_t>(output), |
715 | GetTensorShape(im2col), GetTensorData<uint8_t>(im2col), |
716 | CpuBackendContext::GetFromContext(context)); |
717 | break; |
718 | } |
719 | } |
720 | } |
721 | |
722 | template <KernelType kernel_type> |
723 | void EvalQuantizedPerChannel(TfLiteContext* context, TfLiteNode* node, |
724 | TfLiteConvParams* params, OpData* data, |
725 | const TfLiteTensor* input, |
726 | const TfLiteTensor* filter, |
727 | const TfLiteTensor* bias, TfLiteTensor* output, |
728 | TfLiteTensor* im2col) { |
729 | ConvParams op_params; |
730 | op_params.input_offset = -input->params.zero_point; |
731 | op_params.output_offset = output->params.zero_point; |
732 | op_params.stride_height = params->stride_height; |
733 | op_params.stride_width = params->stride_width; |
734 | op_params.dilation_height_factor = params->dilation_height_factor; |
735 | op_params.dilation_width_factor = params->dilation_width_factor; |
736 | op_params.padding_values.height = data->padding.height; |
737 | op_params.padding_values.width = data->padding.width; |
738 | op_params.quantized_activation_min = data->output_activation_min; |
739 | op_params.quantized_activation_max = data->output_activation_max; |
740 | |
741 | KernelType effective_kernel_type = kernel_type; |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
744 | // See b/178743262 for the detailed motivation. |
745 | if (data->im2col_oversized) { |
746 | effective_kernel_type = kReference; |
747 | } |
748 | |
  // Grouped convolution is currently only supported by the reference kernel.
750 | if (data->groups != 1) { |
751 | effective_kernel_type = kReference; |
752 | } |
753 | |
754 | switch (effective_kernel_type) { |
755 | case kReference: { |
756 | reference_integer_ops::ConvPerChannel( |
757 | op_params, data->per_channel_output_multiplier.data(), |
758 | data->per_channel_output_shift.data(), GetTensorShape(input), |
759 | GetTensorData<int8>(input), GetTensorShape(filter), |
760 | GetTensorData<int8>(filter), GetTensorShape(bias), |
761 | GetTensorData<int32>(bias), GetTensorShape(output), |
762 | GetTensorData<int8>(output)); |
763 | break; |
764 | } |
765 | case kGenericOptimized: |
766 | case kMultithreadOptimized: |
767 | case kCblasOptimized: { |
768 | optimized_integer_ops::ConvPerChannel( |
769 | op_params, data->per_channel_output_multiplier.data(), |
770 | data->per_channel_output_shift.data(), GetTensorShape(input), |
771 | GetTensorData<int8>(input), GetTensorShape(filter), |
772 | GetTensorData<int8>(filter), GetTensorShape(bias), |
773 | GetTensorData<int32>(bias), GetTensorShape(output), |
774 | GetTensorData<int8>(output), GetTensorShape(im2col), |
775 | GetTensorData<int8>(im2col), |
776 | CpuBackendContext::GetFromContext(context)); |
777 | break; |
778 | } |
779 | } |
780 | } |
781 | |
782 | template <KernelType kernel_type> |
783 | void EvalQuantizedPerChannel16x8(TfLiteContext* context, TfLiteNode* node, |
784 | TfLiteConvParams* params, OpData* data, |
785 | const TfLiteTensor* input, |
786 | const TfLiteTensor* filter, |
787 | const TfLiteTensor* bias, TfLiteTensor* output, |
788 | TfLiteTensor* im2col) { |
789 | ConvParams op_params; |
790 | op_params.input_offset = -input->params.zero_point; |
791 | op_params.output_offset = output->params.zero_point; |
792 | op_params.stride_height = params->stride_height; |
793 | op_params.stride_width = params->stride_width; |
794 | op_params.dilation_height_factor = params->dilation_height_factor; |
795 | op_params.dilation_width_factor = params->dilation_width_factor; |
796 | op_params.padding_values.height = data->padding.height; |
797 | op_params.padding_values.width = data->padding.width; |
798 | op_params.quantized_activation_min = data->output_activation_min; |
799 | op_params.quantized_activation_max = data->output_activation_max; |
800 | |
801 | KernelType effective_kernel_type = kernel_type; |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
804 | // See b/178743262 for the detailed motivation. |
805 | if (data->im2col_oversized) { |
806 | effective_kernel_type = kReference; |
807 | } |
808 | |
  // Grouped convolution is currently only supported by the reference kernel.
810 | if (data->groups != 1) { |
811 | effective_kernel_type = kReference; |
812 | } |
813 | |
  // To prevent 32-bit accumulator overflow in 16x8 quantization, the optimized
  // path is only enabled when every zero point is 0.
816 | bool has_non_zero_point = input->params.zero_point || |
817 | filter->params.zero_point || |
818 | output->params.zero_point; |
819 | |
  // Fall back to the reference kernel when the bias type is int64, as there is
  // no optimized kernel for int64 bias yet.
822 | if (bias && bias->type == kTfLiteInt64) { |
823 | reference_integer_ops::ConvPerChannel( |
824 | op_params, data->per_channel_output_multiplier.data(), |
825 | data->per_channel_output_shift.data(), GetTensorShape(input), |
826 | GetTensorData<int16>(input), GetTensorShape(filter), |
827 | GetTensorData<int8>(filter), GetTensorShape(bias), |
828 | GetTensorData<std::int64_t>(bias), GetTensorShape(output), |
829 | GetTensorData<int16>(output)); |
830 | } else if (effective_kernel_type == kReference || has_non_zero_point) { |
831 | reference_integer_ops::ConvPerChannel( |
832 | op_params, data->per_channel_output_multiplier.data(), |
833 | data->per_channel_output_shift.data(), GetTensorShape(input), |
834 | GetTensorData<int16>(input), GetTensorShape(filter), |
835 | GetTensorData<int8>(filter), GetTensorShape(bias), |
836 | GetTensorData<std::int32_t>(bias), GetTensorShape(output), |
837 | GetTensorData<int16>(output)); |
838 | } else { |
839 | optimized_integer_ops::ConvPerChannel( |
840 | op_params, data->per_channel_output_multiplier.data(), |
841 | data->per_channel_output_shift.data(), GetTensorShape(input), |
842 | GetTensorData<int16_t>(input), GetTensorShape(filter), |
843 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
844 | GetTensorData<std::int32_t>(bias), GetTensorShape(output), |
845 | GetTensorData<int16_t>(output), GetTensorShape(im2col), |
846 | GetTensorData<int16_t>(im2col), |
847 | CpuBackendContext::GetFromContext(context)); |
848 | } |
849 | } |
850 | |
851 | template <KernelType kernel_type> |
852 | void EvalFloat(TfLiteContext* context, TfLiteNode* node, |
853 | TfLiteConvParams* params, OpData* data, |
854 | const TfLiteTensor* input, const TfLiteTensor* filter, |
855 | const TfLiteTensor* bias, TfLiteTensor* im2col, |
856 | TfLiteTensor* hwcn_weights, TfLiteTensor* output) { |
857 | float output_activation_min, output_activation_max; |
858 | CalculateActivationRange(params->activation, &output_activation_min, |
859 | &output_activation_max); |
860 | KernelType effective_kernel_type = kernel_type; |
861 | // Fall back to the optimized path if multi-threaded conv is unsupported. |
862 | if ((kernel_type == kMultithreadOptimized) && |
863 | !data->supports_multithreaded_kernel) { |
864 | effective_kernel_type = kGenericOptimized; |
865 | } |
866 | |
  // When im2col is needed (which is implied when 'im2col_oversized' is true),
  // the GEMM-based optimized path requires im2col data to be allocated to
  // ensure correctness. Therefore, when im2col is disabled because the
  // temporary im2col tensor would be oversized, falling back to a
  // non-optimized path is needed.
872 | // See b/178743262 for the detailed motivation. |
873 | if (data->im2col_oversized) { |
874 | effective_kernel_type = kReference; |
875 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
    // As detailed in the tflite::multithreaded_ops::Conv implementation in
    // multithreaded_conv.h, the Eigen-based execution doesn't need im2col
    // data. Therefore, we can rely on it as a better-optimized fallback than
    // the reference one.
880 | if (data->supports_multithreaded_kernel) { |
881 | effective_kernel_type = kMultithreadOptimized; |
882 | } |
883 | #endif |
884 | } |
885 | |
  // Grouped convolution is currently only supported by the reference kernel.
887 | if (data->groups != 1) { |
888 | effective_kernel_type = kReference; |
889 | } |
890 | |
891 | ConvParams op_params; |
892 | op_params.padding_type = RuntimePaddingType(params->padding); |
893 | op_params.padding_values.width = data->padding.width; |
894 | op_params.padding_values.height = data->padding.height; |
895 | op_params.stride_width = params->stride_width; |
896 | op_params.stride_height = params->stride_height; |
897 | op_params.dilation_width_factor = params->dilation_width_factor; |
898 | op_params.dilation_height_factor = params->dilation_height_factor; |
899 | op_params.float_activation_min = output_activation_min; |
900 | op_params.float_activation_max = output_activation_max; |
901 | switch (effective_kernel_type) { |
902 | case kReference: { |
903 | reference_ops::Conv(op_params, GetTensorShape(input), |
904 | GetTensorData<float>(input), GetTensorShape(filter), |
905 | GetTensorData<float>(filter), GetTensorShape(bias), |
906 | GetTensorData<float>(bias), GetTensorShape(output), |
907 | GetTensorData<float>(output), GetTensorShape(im2col), |
908 | GetTensorData<float>(im2col)); |
909 | break; |
910 | } |
911 | case kCblasOptimized: |
912 | case kGenericOptimized: { |
913 | optimized_ops::Conv(op_params, GetTensorShape(input), |
914 | GetTensorData<float>(input), GetTensorShape(filter), |
915 | GetTensorData<float>(filter), GetTensorShape(bias), |
916 | GetTensorData<float>(bias), GetTensorShape(output), |
917 | GetTensorData<float>(output), GetTensorShape(im2col), |
918 | GetTensorData<float>(im2col), |
919 | CpuBackendContext::GetFromContext(context)); |
920 | break; |
921 | } |
922 | case kMultithreadOptimized: { |
923 | #if defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
924 | const float* filter_data; |
925 | if (data->need_hwcn_weights) { |
926 | filter_data = GetTensorData<float>(hwcn_weights); |
927 | } else { |
928 | filter_data = GetTensorData<float>(filter); |
929 | } |
930 | multithreaded_ops::Conv( |
931 | *eigen_support::GetThreadPoolDevice(context), op_params, |
932 | GetTensorShape(input), GetTensorData<float>(input), |
933 | GetTensorShape(filter), filter_data, GetTensorShape(bias), |
934 | GetTensorData<float>(bias), GetTensorShape(output), |
935 | GetTensorData<float>(output), GetTensorShape(im2col), |
936 | GetTensorData<float>(im2col)); |
937 | break; |
938 | #else // !defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
      // See Register_CONV_2D: we should never be here when TFLITE_WITH_RUY
      // is enabled. We #if out this code in order to get the corresponding
941 | // binary size benefits. |
942 | TFLITE_DCHECK(false); |
943 | #endif // defined(TFLITE_WITH_MULTITHREADED_EIGEN) |
944 | } |
945 | } |
946 | } |
947 | |
948 | template <KernelType kernel_type> |
949 | TfLiteStatus EvalHybridPerChannel(TfLiteContext* context, TfLiteNode* node, |
950 | TfLiteConvParams* params, OpData* data, |
951 | const TfLiteTensor* input, |
952 | const TfLiteTensor* filter, |
953 | const TfLiteTensor* bias, |
954 | TfLiteTensor* im2col, TfLiteTensor* output) { |
955 | float output_activation_min, output_activation_max; |
956 | CalculateActivationRange(params->activation, &output_activation_min, |
957 | &output_activation_max); |
958 | |
959 | const int batch_size = SizeOfDimension(input, 0); |
960 | TF_LITE_ENSURE(context, batch_size != 0); |
961 | const int input_size = NumElements(input) / batch_size; |
962 | TfLiteTensor* quantized_input_tensor; |
963 | TF_LITE_ENSURE_OK(context, |
964 | GetTemporarySafe(context, node, data->input_quantized_index, |
965 | &quantized_input_tensor)); |
966 | int8_t* quantized_input_ptr_batch = |
967 | GetTensorData<int8_t>(quantized_input_tensor); |
968 | TfLiteTensor* scaling_factors_tensor; |
969 | TF_LITE_ENSURE_OK(context, |
970 | GetTemporarySafe(context, node, data->scaling_factors_index, |
971 | &scaling_factors_tensor)); |
972 | float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor); |
973 | TfLiteTensor* input_offset_tensor; |
974 | TF_LITE_ENSURE_OK(context, |
975 | GetTemporarySafe(context, node, data->input_offset_index, |
976 | &input_offset_tensor)); |
977 | int32_t* input_offset_ptr = GetTensorData<int32_t>(input_offset_tensor); |
978 | |
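  // Quantize the float input on the fly, one batch at a time, recording a
  // per-batch scaling factor and zero-point offset for the asymmetric scheme.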
979 | for (int b = 0; b < batch_size; ++b) { |
980 | const int offset = b * input_size; |
981 | tensor_utils::AsymmetricQuantizeFloats( |
982 | GetTensorData<float>(input) + offset, input_size, |
983 | quantized_input_ptr_batch + offset, &scaling_factors_ptr[b], |
984 | &input_offset_ptr[b]); |
985 | } |
986 | |
987 | int8_t* im2col_ptr = nullptr; |
988 | int8_t* filter_ptr = nullptr; |
989 | if (im2col != nullptr) { |
990 | im2col_ptr = im2col->data.int8; |
991 | } |
992 | filter_ptr = filter->data.int8; |
993 | const auto* affine_quantization = |
994 | reinterpret_cast<TfLiteAffineQuantization*>(filter->quantization.params); |
995 | |
996 | KernelType effective_kernel_type = kernel_type; |
  // We have to fall back to the reference execution path when im2col is needed
  // but disabled because the to-be-allocated temporary im2col tensor is too
  // large.
999 | // See b/178743262 for the detailed motivation. |
1000 | if (data->im2col_oversized) { |
1001 | effective_kernel_type = kReference; |
1002 | } |
1003 | |
  // Grouped convolution is currently only supported by the reference kernel.
1005 | if (data->groups != 1) { |
1006 | effective_kernel_type = kReference; |
1007 | } |
1008 | |
1009 | ConvParams op_params; |
1010 | op_params.padding_type = PaddingType::kSame; |
1011 | op_params.padding_values.width = data->padding.width; |
1012 | op_params.padding_values.height = data->padding.height; |
1013 | op_params.dilation_width_factor = params->dilation_width_factor; |
1014 | op_params.dilation_height_factor = params->dilation_height_factor; |
1015 | op_params.stride_width = params->stride_width; |
1016 | op_params.stride_height = params->stride_height; |
1017 | op_params.float_activation_min = output_activation_min; |
1018 | op_params.float_activation_max = output_activation_max; |
1019 | switch (effective_kernel_type) { |
1020 | case kReference: |
1021 | reference_ops::HybridConvPerChannel( |
1022 | op_params, scaling_factors_ptr, GetTensorShape(input), |
1023 | quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr, |
1024 | GetTensorShape(bias), GetTensorData<float>(bias), |
1025 | GetTensorShape(output), GetTensorData<float>(output), |
1026 | GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data, |
1027 | input_offset_ptr); |
1028 | break; |
1029 | case kGenericOptimized: |
1030 | case kMultithreadOptimized: |
1031 | case kCblasOptimized: { |
1032 | TfLiteTensor* row_sums; |
1033 | TF_LITE_ENSURE_OK( |
1034 | context, |
1035 | GetTemporarySafe(context, node, data->row_sums_index, &row_sums)); |
1036 | TfLiteTensor* scratch; |
1037 | TF_LITE_ENSURE_OK( |
1038 | context, |
1039 | GetTemporarySafe(context, node, data->accum_scratch_index, &scratch)); |
1040 | optimized_ops::HybridConvPerChannel( |
1041 | op_params, scaling_factors_ptr, GetTensorShape(input), |
1042 | quantized_input_ptr_batch, GetTensorShape(filter), filter_ptr, |
1043 | GetTensorShape(bias), GetTensorData<float>(bias), |
1044 | GetTensorShape(output), GetTensorData<float>(output), |
1045 | GetTensorShape(im2col), im2col_ptr, affine_quantization->scale->data, |
1046 | input_offset_ptr, GetTensorShape(scratch), |
1047 | GetTensorData<int32>(scratch), GetTensorData<int32_t>(row_sums), |
1048 | &data->compute_hybrid_row_sums, |
1049 | CpuBackendContext::GetFromContext(context)); |
1050 | data->compute_hybrid_row_sums = false; |
1051 | break; |
1052 | } |
1053 | } |
1054 | |
1055 | return kTfLiteOk; |
1056 | } |
1057 | |
1058 | template <KernelType kernel_type> |
1059 | TfLiteStatus EvalHybrid(TfLiteContext* context, TfLiteNode* node, |
1060 | TfLiteConvParams* params, OpData* data, |
1061 | const TfLiteTensor* input, const TfLiteTensor* filter, |
1062 | const TfLiteTensor* bias, TfLiteTensor* im2col, |
1063 | TfLiteTensor* accum_scratch, TfLiteTensor* output) { |
1064 | float output_activation_min, output_activation_max; |
1065 | CalculateActivationRange(params->activation, &output_activation_min, |
1066 | &output_activation_max); |
1067 | |
1068 | const int batch_size = SizeOfDimension(input, 0); |
1069 | TF_LITE_ENSURE(context, batch_size != 0); |
1070 | const int input_size = NumElements(input) / batch_size; |
1071 | |
1072 | const float* input_ptr = GetTensorData<float>(input); |
1073 | TfLiteTensor* quantized_input_tensor; |
1074 | TF_LITE_ENSURE_OK(context, |
1075 | GetTemporarySafe(context, node, data->input_quantized_index, |
1076 | &quantized_input_tensor)); |
1077 | int8_t* quantized_input_ptr_batch = |
1078 | GetTensorData<int8_t>(quantized_input_tensor); |
1079 | TfLiteTensor* scaling_factors_tensor; |
1080 | TF_LITE_ENSURE_OK(context, |
1081 | GetTemporarySafe(context, node, data->scaling_factors_index, |
1082 | &scaling_factors_tensor)); |
1083 | float* scaling_factors_ptr = GetTensorData<float>(scaling_factors_tensor); |
1084 | |
1085 | // Per-batch input quantization for higher accuracy. |
1086 | { |
    ruy::profiler::ScopeLabel label("ConvHybridQuantizeInputs");
1088 | for (int b = 0; b < batch_size; ++b) { |
1089 | float unused_min, unused_max; |
1090 | const int offset = b * input_size; |
1091 | tensor_utils::SymmetricQuantizeFloats( |
1092 | input_ptr + offset, input_size, quantized_input_ptr_batch + offset, |
1093 | &unused_min, &unused_max, &scaling_factors_ptr[b]); |
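      // Fold the single filter scale into the per-batch input scaling factor,
      // so the integer accumulators can be rescaled back to float in one step.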
1094 | scaling_factors_ptr[b] *= filter->params.scale; |
1095 | } |
1096 | } |
1097 | |
1098 | switch (kernel_type) { |
1099 | case kReference: |
1100 | case kGenericOptimized: |
1101 | case kMultithreadOptimized: |
1102 | case kCblasOptimized: { |
      // There is only one implementation for the hybrid kernel.
1104 | ConvParams op_params; |
1105 | op_params.padding_type = PaddingType::kSame; |
1106 | op_params.padding_values.width = data->padding.width; |
1107 | op_params.padding_values.height = data->padding.height; |
1108 | op_params.stride_width = params->stride_width; |
1109 | op_params.stride_height = params->stride_height; |
1110 | op_params.dilation_width_factor = params->dilation_width_factor; |
1111 | op_params.dilation_height_factor = params->dilation_height_factor; |
1112 | op_params.float_activation_min = output_activation_min; |
1113 | op_params.float_activation_max = output_activation_max; |
1114 | if (data->groups == 1) { |
1115 | optimized_ops::HybridConv( |
1116 | op_params, scaling_factors_ptr, GetTensorShape(input), |
1117 | quantized_input_ptr_batch, GetTensorShape(filter), |
1118 | GetTensorData<int8_t>(filter), GetTensorShape(bias), |
1119 | GetTensorData<float>(bias), GetTensorShape(accum_scratch), |
1120 | GetTensorData<int32_t>(accum_scratch), GetTensorShape(output), |
1121 | GetTensorData<float>(output), GetTensorShape(im2col), |
1122 | GetTensorData<int8_t>(im2col), |
1123 | CpuBackendContext::GetFromContext(context)); |
1124 | } else { |
        // This case is handled by falling back to the per-channel hybrid group
        // conv and shouldn't reach this branch.
1127 | TF_LITE_KERNEL_LOG( |
1128 | context, |
1129 | "Group convolution currently not supported for hybrid kernel." ); |
1130 | return kTfLiteError; |
1131 | } |
1132 | break; |
1133 | } |
1134 | } |
1135 | |
1136 | return kTfLiteOk; |
1137 | } |
1138 | |
1139 | template <KernelType kernel_type, TfLiteType input_type> |
1140 | TfLiteStatus EvalImpl(TfLiteContext* context, TfLiteNode* node) { |
1141 | auto* params = reinterpret_cast<TfLiteConvParams*>(node->builtin_data); |
1142 | OpData* data = reinterpret_cast<OpData*>(node->user_data); |
1143 | |
1144 | TfLiteTensor* output; |
1145 | TF_LITE_ENSURE_OK(context, GetOutputSafe(context, node, 0, &output)); |
1146 | const TfLiteTensor* input; |
1147 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
1148 | const TfLiteTensor* filter; |
1149 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 1, &filter)); |
1150 | bool has_bias = node->inputs->size == 3; |
1151 | const TfLiteTensor* bias = has_bias ? GetInput(context, node, 2) : nullptr; |
1152 | TfLiteTensor* im2col = |
1153 | data->need_im2col |
1154 | ? &context->tensors[node->temporaries->data[data->im2col_index]] |
1155 | : nullptr; |
1156 | TfLiteTensor* hwcn_weights = |
1157 | data->need_hwcn_weights |
1158 | ? &context->tensors[node->temporaries->data[data->hwcn_weights_index]] |
1159 | : nullptr; |
1160 | |
1161 | if (data->need_hwcn_weights && !data->have_weights_been_transposed) { |
1162 | TransposeFloatTensor(filter, hwcn_weights); |
1163 | data->have_weights_been_transposed = true; |
1164 | } |
1165 | |
1166 | TFLITE_DCHECK_EQ(input_type, input->type); |
  switch (input_type) {  // Already know in/out types are the same.
1168 | case kTfLiteFloat32: |
1169 | if (filter->type == kTfLiteUInt8 || filter->type == kTfLiteInt8) { |
1170 | if (data->is_hybrid_per_channel || |
1171 | // TODO(b/162870360): Fallback to PerChannel implementation |
1172 | // before we have grouped hybrid convolution. |
1173 | data->groups != 1) { |
1174 | TF_LITE_ENSURE_OK(context, EvalHybridPerChannel<kernel_type>( |
1175 | context, node, params, data, input, |
1176 | filter, bias, im2col, output)); |
1177 | } else { |
1178 | TfLiteTensor* accum_scratch = |
1179 | &context->tensors[node->temporaries |
1180 | ->data[data->accum_scratch_index]]; |
1181 | TF_LITE_ENSURE_OK(context, |
1182 | EvalHybrid<kernel_type>(context, node, params, data, |
1183 | input, filter, bias, im2col, |
1184 | accum_scratch, output)); |
1185 | } |
1186 | } else { |
1187 | EvalFloat<kernel_type>(context, node, params, data, input, filter, bias, |
1188 | im2col, hwcn_weights, output); |
1189 | } |
1190 | break; |
1191 | case kTfLiteUInt8: |
1192 | EvalQuantized<kernel_type>(context, node, params, data, input, filter, |
1193 | bias, im2col, output); |
1194 | break; |
1195 | case kTfLiteInt8: |
1196 | EvalQuantizedPerChannel<kernel_type>(context, node, params, data, input, |
1197 | filter, bias, output, im2col); |
1198 | break; |
1199 | case kTfLiteInt16: |
1200 | EvalQuantizedPerChannel16x8<kernel_type>( |
1201 | context, node, params, data, input, filter, bias, output, im2col); |
1202 | break; |
1203 | default: |
1204 | TF_LITE_KERNEL_LOG(context, "Type %s currently not supported." , |
1205 | TfLiteTypeGetName(input->type)); |
1206 | return kTfLiteError; |
1207 | } |
1208 | return kTfLiteOk; |
1209 | } |
1210 | |
1211 | template <KernelType kernel_type> |
1212 | TfLiteStatus Eval(TfLiteContext* context, TfLiteNode* node) { |
1213 | const TfLiteTensor* input; |
1214 | TF_LITE_ENSURE_OK(context, GetInputSafe(context, node, 0, &input)); |
1215 | |
1216 | switch (input->type) { |
1217 | case kTfLiteFloat32: |
1218 | return EvalImpl<kernel_type, kTfLiteFloat32>(context, node); |
1219 | case kTfLiteUInt8: |
1220 | return EvalImpl<kernel_type, kTfLiteUInt8>(context, node); |
1221 | case kTfLiteInt8: |
1222 | return EvalImpl<kernel_type, kTfLiteInt8>(context, node); |
1223 | case kTfLiteInt16: |
1224 | return EvalImpl<kernel_type, kTfLiteInt16>(context, node); |
1225 | default: |
1226 | TF_LITE_KERNEL_LOG(context, "Type %s not currently supported." , |
1227 | TfLiteTypeGetName(input->type)); |
1228 | return kTfLiteError; |
1229 | } |
1230 | } |
1231 | |
1232 | } // namespace conv |
1233 | |
1234 | TfLiteRegistration* Register_CONVOLUTION_REF() { |
1235 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1236 | conv::Prepare<conv::kReference>, |
1237 | conv::Eval<conv::kReference>}; |
1238 | return &r; |
1239 | } |
1240 | |
1241 | TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT() { |
1242 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1243 | conv::Prepare<conv::kGenericOptimized>, |
1244 | conv::Eval<conv::kGenericOptimized>}; |
1245 | return &r; |
1246 | } |
1247 | |
1248 | TfLiteRegistration* Register_CONVOLUTION_GENERIC_OPT_UINT8() { |
1249 | static TfLiteRegistration r = { |
1250 | conv::Init, conv::Free, conv::Prepare<conv::kGenericOptimized>, |
1251 | conv::EvalImpl<conv::kGenericOptimized, kTfLiteUInt8>}; |
1252 | return &r; |
1253 | } |
1254 | |
1255 | TfLiteRegistration* Register_CONVOLUTION_MULTITHREADED_OPT() { |
1256 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1257 | conv::Prepare<conv::kMultithreadOptimized>, |
1258 | conv::Eval<conv::kMultithreadOptimized>}; |
1259 | return &r; |
1260 | } |
1261 | |
1262 | TfLiteRegistration* Register_CONVOLUTION_CBLAS_OPT() { |
1263 | static TfLiteRegistration r = {conv::Init, conv::Free, |
1264 | conv::Prepare<conv::kCblasOptimized>, |
1265 | conv::Eval<conv::kCblasOptimized>}; |
1266 | return &r; |
1267 | } |
1268 | |
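// Selects the Conv implementation at build time: the CBLAS kernel when the
// Apple Accelerate framework is requested, otherwise the multithreaded Eigen
// kernel when ruy is disabled, and the generic optimized kernel in all other
// cases.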
1269 | TfLiteRegistration* Register_CONV_2D() { |
1270 | #if defined TFLITE_USE_APPLE_ACCELERATE_FOR_CONV |
1271 | return Register_CONVOLUTION_CBLAS_OPT(); |
1272 | #elif defined TFLITE_WITH_MULTITHREADED_EIGEN |
1273 | return Register_CONVOLUTION_MULTITHREADED_OPT(); |
1274 | #else |
1275 | return Register_CONVOLUTION_GENERIC_OPT(); |
1276 | #endif |
1277 | } |
1278 | |
1279 | // Warning: Clients using this variant are responsible for ensuring that their |
1280 | // models only need the UINT8 type. TFLite's op registration mechanism doesn't |
1281 | // yet allow for more nuanced registration mechanisms. |
1282 | TfLiteRegistration* Register_CONV_2D_UINT8() { |
1283 | #if defined TFLITE_WITH_RUY |
1284 | // TFLITE_WITH_RUY optimizes the generic kernel type. |
1285 | return Register_CONVOLUTION_GENERIC_OPT_UINT8(); |
1286 | #else |
1287 | return Register_CONV_2D(); |
1288 | #endif |
1289 | } |
1290 | |
1291 | } // namespace builtin |
1292 | } // namespace ops |
1293 | } // namespace tflite |
1294 | |