/*
 * Copyright (c) Facebook, Inc. and its affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#include <assert.h>
#include <inttypes.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <stdlib.h>
#include <string.h>

#include <fxdiv.h>

#include <qnnpack.h>
#include <qnnpack/operator.h>
#include <qnnpack/log.h>
#include <qnnpack/common.h>
#include <qnnpack/math.h>
#include <qnnpack/pack.h>
#include <qnnpack/params.h>
#include <qnnpack/indirection.h>


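/*
 * Computes one spatial output dimension from the padded input size. A K-tap
 * kernel with dilation d spans (K - 1) * d + 1 input elements, and the
 * division counts how many such windows fit at the given stride. For example,
 * a 224-pixel input with one pixel of padding on each side (padded size 226),
 * a 3-tap kernel, dilation 1, and stride 2 gives (226 - 3) / 2 + 1 = 112
 * outputs.
 */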
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t subsampling_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return (padded_input_dimension - effective_kernel_dimension) / subsampling_dimension + 1;
}

enum qnnp_status qnnp_create_convolution2d_nhwc_q8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    qnnp_operator_t* convolution_out)
{
  qnnp_operator_t convolution = NULL;
  enum qnnp_status status = qnnp_status_uninitialized;

  if (!qnnp_params.initialized) {
    qnnp_log_error("qnnp_create_convolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    goto error;
  }

  status = qnnp_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    qnnp_log_error(
        "failed to create convolution with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
        kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    qnnp_log_error(
        "failed to create convolution with %" PRIu32 "x%" PRIu32 " subsampling: "
        "subsampling dimensions must be non-zero",
        subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    qnnp_log_error(
        "failed to create convolution with %" PRIu32 "x%" PRIu32 " dilation: "
        "dilation dimensions must be non-zero",
        dilation_width, dilation_height);
    goto error;
  }

  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    qnnp_log_error(
        "failed to create convolution with %.7g input scale: scale must be finite and positive", input_scale);
    goto error;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    qnnp_log_error(
        "failed to create convolution with %.7g kernel scale: scale must be finite and positive", kernel_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    qnnp_log_error(
        "failed to create convolution with %.7g output scale: scale must be finite and positive", output_scale);
    goto error;
  }

  status = qnnp_status_unsupported_parameter;

  if (subsampling_height > kernel_height) {
    qnnp_log_info(
        "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "x%" PRIu32 " subsampling: "
        "height subsampling is greater than kernel height; subsampling should be performed before the convolution",
        kernel_width, kernel_height, subsampling_width, subsampling_height);
  }

  if (subsampling_width > kernel_width) {
    qnnp_log_info(
        "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "x%" PRIu32 " subsampling: "
        "width subsampling is greater than kernel width; subsampling should be performed before the convolution",
        kernel_width, kernel_height, subsampling_width, subsampling_height);
  }

  if (input_padding_top >= kernel_height) {
    qnnp_log_info(
        "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " height padding: "
        "input top padding is greater than or equal to kernel height",
        kernel_width, kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_bottom >= kernel_height) {
    qnnp_log_info(
        "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " height padding: "
        "input bottom padding is greater than or equal to kernel height",
        kernel_width, kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_right >= kernel_width) {
    qnnp_log_info(
        "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " width padding: "
        "input right padding is greater than or equal to kernel width",
        kernel_width, kernel_height, input_padding_left, input_padding_right);
  }

  if (input_padding_left >= kernel_width) {
    qnnp_log_info(
        "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " width padding: "
        "input left padding is greater than or equal to kernel width",
        kernel_width, kernel_height, input_padding_left, input_padding_right);
  }

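  /*
   * The combined requantization scale ties the input, kernel, and output
   * quantization together. QNNPACK's fixed-point requantization represents
   * this scale as a multiplier strictly below 1.0, which is why a combined
   * scale of 1.0 or more is rejected below.
   */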
  const float convolution_scale = input_scale * kernel_scale / output_scale;
  if (convolution_scale >= 1.0f) {
    qnnp_log_error(
        "failed to create convolution with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
        "convolution scale %.7g is greater than or equal to 1.0",
        input_scale, kernel_scale, output_scale, convolution_scale);
    goto error;
  }

  status = qnnp_status_out_of_memory;

  convolution = calloc(1, sizeof(struct qnnp_operator));
  if (convolution == NULL) {
    qnnp_log_error("failed to allocate %zu bytes for qnnp_operator structure", sizeof(struct qnnp_operator));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

  enum qnnp_ukernel_type ukernel_type = qnnp_ukernel_type_none;
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
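  /*
   * Select the micro-kernel: 3x3 and 5x5 depthwise convolutions (one input
   * and one output channel per group) take the specialized dwconv path; an
   * unpadded 1x1 convolution with unit stride is a plain GEMM (the XZP
   * variant once the per-group input channel count reaches its threshold);
   * everything else goes through the generic conv path.
   */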
  if ((kernel_size == 9 || kernel_size == 25) && group_input_channels == 1 && group_output_channels == 1 && groups > 1) {
    ukernel_type = qnnp_ukernel_type_dwconv;
  } else if (kernel_size == 1 && subsampling_height == 1 && subsampling_width == 1 && !any_padding) {
    ukernel_type = group_input_channels >= qnnp_params.q8conv_xzp.kthreshold ?
        qnnp_ukernel_type_xzp_gemm : qnnp_ukernel_type_gemm;
  } else {
    ukernel_type = qnnp_ukernel_type_conv;
  }
  size_t zero_size = 0, zero_offset = 0;

  switch (ukernel_type) {
    case qnnp_ukernel_type_dwconv:
    {
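      /*
       * Round the channel count up to a multiple of cr; the bit trick below
       * assumes cr is a power of two. Each packed channel occupies an int32
       * bias plus kernel_size weight bytes, hence the size formula.
       */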
      const uint32_t cr = qnnp_params.q8dw9.cr;
      const uint32_t c_stride = (groups + (cr - 1)) & -cr;
      convolution->group_stride = c_stride;
      const size_t packed_weights_size = (sizeof(uint8_t) * kernel_size + sizeof(int32_t)) * c_stride;
      convolution->packed_weights = malloc(packed_weights_size);
      if (convolution->packed_weights == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      switch (kernel_size) {
        case 9:
          pack_q8dw_w(
              kernel_height, kernel_width,
              groups, cr,
              input_zero_point, kernel_zero_point,
              kernel, bias, convolution->packed_weights);
          break;
        case 25:
          /*
           * 5x5 kernels are packed in three column slices, [0, 2), [2, 4),
           * and [4, 5), via the dilation-aware packing routine; only the
           * first slice carries the biases, so the later slices start at
           * offsets of 10 and 20 weight bytes plus one int32 bias per
           * packed channel.
           */
          pack_q8dw_w_dilation(
              kernel_height, kernel_width,
              groups, cr,
              0, kernel_height, 0, 2,
              kernel, bias, convolution->packed_weights, true);
          pack_q8dw_w_dilation(
              kernel_height, kernel_width,
              groups, cr,
              0, kernel_height, 2, 4,
              kernel, bias, (char*) convolution->packed_weights + (10 + sizeof(int32_t) / sizeof(uint8_t)) * c_stride, false);
          pack_q8dw_w_dilation(
              kernel_height, kernel_width,
              groups, cr,
              0, kernel_height, 4, 5,
              kernel, bias, (char*) convolution->packed_weights + (20 + sizeof(int32_t) / sizeof(uint8_t)) * c_stride, false);
          break;
        default:
          QNNP_UNREACHABLE;
      }

      if (groups >= 8) {
        zero_size = sizeof(uint8_t) * c_stride;
        zero_offset = 0;
      } else {
        zero_size = sizeof(uint8_t) * c_stride + 8;
        zero_offset = sizeof(uint8_t) * 8;
      }
      break;
    }
    case qnnp_ukernel_type_xzp_gemm:
    {
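      /*
       * The XZP GEMM variant handles zero points differently: the cross
       * terms are folded into sums precomputed during packing and into
       * per-pixel input sums (a_sum, computed at setup time). The packing
       * therefore swizzles the weights, and the padding must stay zero
       * rather than being set to the kernel zero point.
       */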
      const uint32_t nr = qnnp_params.q8conv_xzp.nr;
      const uint32_t kr = qnnp_params.q8conv_xzp.kr;
      const uint32_t sr = qnnp_params.q8conv_xzp.kc;
      const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr;
      const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr;

      const size_t packed_group_weights_size =
          (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
      convolution->packed_weights = malloc(packed_group_weights_size * groups);
      if (convolution->packed_weights == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      /* The XZP ukernel needs the padding to be 0 */
      memset(convolution->packed_weights, 0, packed_group_weights_size * groups);

      for (uint32_t group = 0; group < groups; group++) {
        pack_swizzle_q8gemm_b(
            group_output_channels, group_input_channels,
            nr, kr, sr,
            input_zero_point, kernel_zero_point,
            kernel + group * group_output_channels * group_input_channels,
            bias + group * group_output_channels,
            (void*) ((uintptr_t) convolution->packed_weights + group * packed_group_weights_size));
      }
      break;
    }
    case qnnp_ukernel_type_gemm:
    case qnnp_ukernel_type_conv:
    {
      const uint32_t nr = qnnp_params.q8conv.nr;
      const uint32_t kr = qnnp_params.q8conv.kr;
      const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr;
      const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr;

      const size_t packed_group_weights_size =
          (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
      convolution->packed_weights = malloc(packed_group_weights_size * groups);
      if (convolution->packed_weights == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
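      /*
       * Fill the padding with the kernel zero point so that, after
       * zero-point subtraction in the micro-kernel, padded weight entries
       * contribute exactly zero to the accumulators.
       */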
      memset(convolution->packed_weights, kernel_zero_point, packed_group_weights_size * groups);

      switch (ukernel_type) {
        case qnnp_ukernel_type_gemm:
          for (uint32_t group = 0; group < groups; group++) {
            pack_q8gemm_w(
                group_output_channels, group_input_channels,
                nr, nr, kr,
                input_zero_point, kernel_zero_point,
                kernel + group * group_output_channels * group_input_channels,
                bias + group * group_output_channels,
                (void*) ((uintptr_t) convolution->packed_weights + group * packed_group_weights_size));
          }
          break;
        case qnnp_ukernel_type_conv:
          for (uint32_t group = 0; group < groups; group++) {
            pack_q8conv_w(
                group_output_channels, kernel_size, group_input_channels,
                nr, kr,
                input_zero_point, kernel_zero_point,
                kernel + group * group_output_channels * kernel_size * group_input_channels,
                bias + group * group_output_channels,
                (void*) ((uintptr_t) convolution->packed_weights + group * packed_group_weights_size));
          }
          break;
        default:
          QNNP_UNREACHABLE;
      }

      if (group_input_channels >= 8) {
        zero_size = sizeof(uint8_t) * k_stride;
        zero_offset = 0;
      } else {
        zero_size = sizeof(uint8_t) * k_stride + 8;
        zero_offset = 8;
      }
      break;
    }
    default:
      QNNP_UNREACHABLE;
  }

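  /*
   * When any input padding is present, the indirection buffers point the
   * padded taps at this buffer, which is filled with the input zero point so
   * that padding dequantizes to zero. The per-ukernel zero_size/zero_offset
   * values above presumably leave extra headroom for micro-kernels that load
   * eight bytes at a time when the channel count is below eight.
   */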
  if (any_padding) {
    void* zero_buffer = malloc(zero_size);
    if (zero_buffer == NULL) {
      qnnp_log_error("failed to allocate %zu bytes for zero padding", zero_size);
      goto error;
    }
    memset(zero_buffer, input_zero_point, zero_size);
    convolution->zero_buffer = zero_buffer;
    convolution->zero_pointer = (void*) ((uintptr_t) zero_buffer + zero_offset);
  }

  convolution->input_padding_top = input_padding_top;
  convolution->input_padding_right = input_padding_right;
  convolution->input_padding_bottom = input_padding_bottom;
  convolution->input_padding_left = input_padding_left;

  convolution->kernel_height = kernel_height;
  convolution->kernel_width = kernel_width;
  convolution->stride_height = subsampling_height;
  convolution->stride_width = subsampling_width;
  convolution->dilation_height = dilation_height;
  convolution->dilation_width = dilation_width;
  convolution->groups = groups;
  convolution->group_input_channels = group_input_channels;
  convolution->group_output_channels = group_output_channels;

  convolution->kernel_zero_point = kernel_zero_point;

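  /*
   * The XZP path uses plain requantization parameters: its zero-point cross
   * terms are already folded into the packed weights and the precomputed
   * input sums. The other paths carry both zero points in their convolution
   * quantization parameters.
   */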
  if (ukernel_type == qnnp_ukernel_type_xzp_gemm) {
    convolution->requantization_params =
        qnnp_compute_requantization_params(
            convolution_scale, output_zero_point, output_min, output_max);
  } else {
    convolution->conv_quantization_params =
        qnnp_compute_conv_quantization_params(
            input_zero_point, kernel_zero_point,
            convolution_scale, output_zero_point, output_min, output_max);
  }

  convolution->ukernel_type = ukernel_type;
  convolution->format = qnnp_format_quint8;

  *convolution_out = convolution;
  return qnnp_status_success;

error:
  qnnp_delete_operator(convolution);
  return status;
}

enum qnnp_status qnnp_setup_convolution2d_nhwc_q8(
    qnnp_operator_t convolution,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    size_t input_pixel_stride,
    uint8_t* output,
    size_t output_pixel_stride,
    pthreadpool_t threadpool)
{
  if (!qnnp_params.initialized) {
    qnnp_log_error("qnnp_setup_convolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    return qnnp_status_uninitialized;
  }

  if (batch_size == 0) {
    convolution->batch_size = 0;
    return qnnp_status_success;
  }

  if (input_width == 0 || input_height == 0) {
    qnnp_log_error(
        "failed to setup convolution with %zux%zu input: input dimensions must be non-zero",
        input_width,
        input_height);
    return qnnp_status_invalid_parameter;
  }

  convolution->batch_size = batch_size;
  convolution->input_height = input_height;
  convolution->input_width = input_width;
  convolution->input = input;
  convolution->input_pixel_stride = input_pixel_stride;

  convolution->output_height = compute_output_dimension(
      convolution->input_padding_top + input_height + convolution->input_padding_bottom,
      convolution->kernel_height,
      convolution->dilation_height,
      convolution->stride_height);
  convolution->output_width = compute_output_dimension(
      convolution->input_padding_left + input_width + convolution->input_padding_right,
      convolution->kernel_width,
      convolution->dilation_width,
      convolution->stride_width);
  convolution->output = output;
  convolution->output_pixel_stride = output_pixel_stride;

  switch (convolution->ukernel_type) {
    case qnnp_ukernel_type_gemm:
      /* A 1x1 convolution maps directly to GEMM and needs no indirection buffer */
      return qnnp_status_success;
    case qnnp_ukernel_type_xzp_gemm:
    {
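      /*
       * a_sum holds one int32 per image, group, and input pixel: the sum of
       * that pixel's input channels, which the XZP micro-kernels use to fold
       * the zero-point cross terms into the accumulation.
       */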
      const size_t groups = convolution->groups;
      void* a_sum = (void*) realloc(convolution->a_sum, sizeof(int32_t) * batch_size * groups * input_height * input_width);
      if (a_sum == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for row sum data",
            sizeof(int32_t) * batch_size * groups * input_height * input_width);
        return qnnp_status_out_of_memory;
      }
      convolution->a_sum = a_sum;
      return qnnp_status_success;
    }
    case qnnp_ukernel_type_conv:
    {
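      /*
       * The generic conv path avoids an explicit im2col: the indirection
       * buffer stores, for every (image, group, output pixel, kernel tap),
       * a pointer to the input row that tap reads, with padded taps pointing
       * at the zero buffer. The output size is rounded up to the mr tile so
       * that partially filled tiles remain safe to read.
       */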
      const size_t groups = convolution->groups;
      const size_t kernel_height = convolution->kernel_height;
      const size_t kernel_width = convolution->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution->output_height;
      const size_t output_width = convolution->output_width;
      const size_t output_size = output_height * output_width;
      const size_t output_tile_size = qnnp_params.q8conv.mr;
      const size_t tiled_output_size = round_up(output_size, output_tile_size);
      const size_t indirection_buffer_size = sizeof(void*) * batch_size * groups * tiled_output_size * kernel_size;

      const void** indirection_buffer = (const void**) realloc(convolution->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return qnnp_status_out_of_memory;
      }
      convolution->indirection_buffer = indirection_buffer;

      qnnp_indirection_init_conv2d(convolution, output_tile_size, tiled_output_size);
      return qnnp_status_success;
    }
    case qnnp_ukernel_type_dwconv:
    {
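      /*
       * The depthwise path reuses input-row pointers between neighboring
       * output pixels: step_width is the number of fresh pointer columns
       * each output pixel adds within a row, and step_height is the total
       * number of pointer entries per output row; together they size the
       * indirection buffer below.
       */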
      const size_t kernel_height = convolution->kernel_height;
      const size_t kernel_width = convolution->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution->output_height;
      const size_t output_width = convolution->output_width;
      const size_t step_width = convolution->dilation_width == 1 ? convolution->stride_width : kernel_width;
      const size_t step_height = kernel_size + (output_width * step_width - 1) * kernel_height;
      const size_t indirection_buffer_size = sizeof(void*) * batch_size * output_height * step_height;

      const void** indirection_buffer =
          (const void**) realloc(convolution->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return qnnp_status_out_of_memory;
      }
      convolution->indirection_buffer = indirection_buffer;

      qnnp_indirection_init_dwconv2d(convolution, 0, step_height, step_width);
      return qnnp_status_success;
    }
    default:
      QNNP_UNREACHABLE;
  }
}
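
/*
 * Typical call sequence (a minimal sketch with error handling omitted; all
 * lowercase arguments below stand in for caller-provided values):
 *
 *   qnnp_initialize();
 *   qnnp_operator_t op = NULL;
 *   qnnp_create_convolution2d_nhwc_q8(
 *       1, 1, 1, 1,              // input padding: top, right, bottom, left
 *       3, 3,                    // 3x3 kernel
 *       1, 1,                    // unit subsampling (stride)
 *       1, 1,                    // no dilation
 *       groups, group_input_channels, group_output_channels,
 *       input_zero_point, input_scale,
 *       kernel_zero_point, kernel_scale,
 *       kernel, bias,
 *       output_zero_point, output_scale,
 *       0, 255,                  // output clamping range
 *       0,                       // flags
 *       &op);
 *   qnnp_setup_convolution2d_nhwc_q8(
 *       op, batch_size, input_height, input_width,
 *       input, input_pixel_stride,
 *       output, output_pixel_stride,
 *       threadpool);
 *   qnnp_run_operator(op, threadpool);
 *   qnnp_delete_operator(op);
 */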