1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under the BSD-style license found in the |
6 | * LICENSE file in the root directory of this source tree. |
7 | */ |
8 | |
9 | #include <assert.h> |
10 | #include <math.h> |
11 | #include <stdbool.h> |
12 | #include <stddef.h> |
13 | #include <stdint.h> |
14 | #include <stdlib.h> |
15 | #include <string.h> |
16 | |
17 | #include <fxdiv.h> |
18 | |
19 | #include <qnnpack.h> |
20 | #include <qnnpack/operator.h> |
21 | #include <qnnpack/log.h> |
22 | #include <qnnpack/common.h> |
23 | #include <qnnpack/math.h> |
24 | #include <qnnpack/pack.h> |
25 | #include <qnnpack/params.h> |
26 | #include <qnnpack/indirection.h> |
27 | |
28 | |
/*
 * Computes the size of one spatial output dimension of a convolution.
 *
 * padded_input_dimension  - input extent including both edge paddings.
 * kernel_dimension        - kernel extent along this axis (must be non-zero).
 * dilation_dimension      - dilation factor (1 = dense kernel).
 * subsampling_dimension   - stride along this axis (must be non-zero).
 *
 * Assumes the padded input is at least as large as the dilated kernel;
 * otherwise the unsigned subtraction below wraps around.
 */
static inline size_t compute_output_dimension(
    size_t padded_input_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t subsampling_dimension)
{
  /* Extent of the kernel footprint once dilation gaps are counted. */
  const size_t dilated_kernel_dimension =
      dilation_dimension * (kernel_dimension - 1) + 1;
  /* Number of valid positions beyond the first, divided by the stride. */
  const size_t slack = padded_input_dimension - dilated_kernel_dimension;
  return slack / subsampling_dimension + 1;
}
38 | |
/*
 * Creates a quantized-uint8 2D convolution operator for NHWC-layout tensors.
 *
 * Validates all hyper-parameters, chooses a microkernel strategy
 * (depthwise, GEMM, XZP GEMM, or generic indirect convolution), packs the
 * kernel weights and biases into that microkernel's expected layout, and
 * allocates the zero-point padding buffer when any spatial padding is used.
 *
 * On success, stores the new operator in *convolution_out and returns
 * qnnp_status_success; on failure, releases any partially built state and
 * returns the matching error status.
 *
 * NOTE(review): the format strings below use PRIu32 — presumably
 * <inttypes.h> is pulled in through one of the qnnpack headers; confirm.
 */
enum qnnp_status qnnp_create_convolution2d_nhwc_q8(
    uint32_t input_padding_top,
    uint32_t input_padding_right,
    uint32_t input_padding_bottom,
    uint32_t input_padding_left,
    uint32_t kernel_height,
    uint32_t kernel_width,
    uint32_t subsampling_height,
    uint32_t subsampling_width,
    uint32_t dilation_height,
    uint32_t dilation_width,
    uint32_t groups,
    size_t group_input_channels,
    size_t group_output_channels,
    uint8_t input_zero_point,
    float input_scale,
    uint8_t kernel_zero_point,
    float kernel_scale,
    const uint8_t* kernel,
    const int32_t* bias,
    uint8_t output_zero_point,
    float output_scale,
    uint8_t output_min,
    uint8_t output_max,
    uint32_t flags,
    qnnp_operator_t* convolution_out)
{
  qnnp_operator_t convolution = NULL;
  enum qnnp_status status = qnnp_status_uninitialized;

  if (!qnnp_params.initialized) {
    qnnp_log_error("qnnp_create_convolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    goto error;
  }

  /* Hard parameter errors: any of these aborts creation. */
  status = qnnp_status_invalid_parameter;

  if (kernel_width == 0 || kernel_height == 0) {
    qnnp_log_error(
      "failed to create convolution with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero",
      kernel_width, kernel_height);
    goto error;
  }

  if (subsampling_width == 0 || subsampling_height == 0) {
    qnnp_log_error(
      "failed to create convolution with %" PRIu32 "x%" PRIu32 " subsampling: "
      "subsampling dimensions must be non-zero",
      subsampling_width, subsampling_height);
    goto error;
  }

  if (dilation_width == 0 || dilation_height == 0) {
    qnnp_log_error(
      "failed to create convolution with %" PRIu32 "x%" PRIu32 " dilation: "
      "dilation dimensions must be non-zero",
      dilation_width, dilation_height);
    goto error;
  }

  /* isnormal() also rejects zero, subnormals, infinities and NaN. */
  if (input_scale <= 0.0f || !isnormal(input_scale)) {
    qnnp_log_error(
      "failed to create convolution with %.7g input scale: scale must be finite and positive", input_scale);
    goto error;
  }

  if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) {
    qnnp_log_error(
      "failed to create convolution with %.7g kernel scale: scale must be finite and positive", kernel_scale);
    goto error;
  }

  if (output_scale <= 0.0f || !isnormal(output_scale)) {
    qnnp_log_error(
      "failed to create convolution with %.7g output scale: scale must be finite and positive", output_scale);
    goto error;
  }

  /* Soft inefficiency warnings: these configurations work but waste
   * computation, so they are only logged at info level. */
  status = qnnp_status_unsupported_parameter;

  if (subsampling_height > kernel_height) {
    qnnp_log_info(
      "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "x%" PRIu32 " subsampling: "
      "height subsampling is greater than kernel height; subsampling should be performed before the convolution",
      kernel_width, kernel_height, subsampling_width, subsampling_height);
  }

  if (subsampling_width > kernel_width) {
    qnnp_log_info(
      "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "x%" PRIu32 " subsampling: "
      "width subsampling is greater than kernel width; subsampling should be performed before the convolution",
      kernel_width, kernel_height, subsampling_width, subsampling_height);
  }

  if (input_padding_top >= kernel_height) {
    qnnp_log_info(
      "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " height padding: "
      "input top padding is greater or equal to kernel height",
      kernel_width, kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_bottom >= kernel_height) {
    qnnp_log_info(
      "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " height padding: "
      "input bottom padding is greater or equal to kernel height",
      kernel_width, kernel_height, input_padding_top, input_padding_bottom);
  }

  if (input_padding_right >= kernel_width) {
    qnnp_log_info(
      "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " width padding: "
      "input right padding is greater or equal to kernel width",
      kernel_width, kernel_height, input_padding_left, input_padding_right);
  }

  if (input_padding_left >= kernel_width) {
    qnnp_log_info(
      "inefficiency in convolution with %" PRIu32 "x%" PRIu32 " kernel and %" PRIu32 "+%" PRIu32 " width padding: "
      "input left padding is greater or equal to kernel width",
      kernel_width, kernel_height, input_padding_left, input_padding_right);
  }

  /* The fixed-point requantization scheme represents the combined scale as a
   * multiplier strictly below 1.0, so reject configurations that violate it. */
  const float convolution_scale = input_scale * kernel_scale / output_scale;
  if (convolution_scale >= 1.0f) {
    qnnp_log_error(
      "failed to create convolution with %.7g input scale, %.7g kernel scale, and %.7g output scale: "
      "convolution scale %.7g is greater or equal to 1.0",
      input_scale, kernel_scale, output_scale, convolution_scale);
    goto error;
  }

  status = qnnp_status_out_of_memory;

  /* calloc zero-initializes every field, so the error path can hand a
   * partially built operator to qnnp_delete_operator safely.
   * NOTE(review): presumably qnnp_delete_operator also tolerates NULL
   * (it is called below when this allocation itself fails) — confirm. */
  convolution = calloc(1, sizeof(struct qnnp_operator));
  if (convolution == NULL) {
    qnnp_log_error("failed to allocate %zu bytes for qnnp_operator structure", sizeof(struct qnnp_operator));
    goto error;
  }

  const size_t kernel_size = kernel_height * kernel_width;

  /* Select the microkernel strategy:
   * - depthwise (dwconv) for 3x3 or 5x5 kernels with one input and one
   *   output channel per group and more than one group,
   * - GEMM (or XZP GEMM when the channel count crosses the threshold) for
   *   1x1/stride-1/unpadded convolutions, which map directly to matrix
   *   multiplication,
   * - generic indirect convolution otherwise. */
  enum qnnp_ukernel_type ukernel_type = qnnp_ukernel_type_none;
  const bool any_padding = (input_padding_left | input_padding_top | input_padding_right | input_padding_bottom) != 0;
  if ((kernel_size == 9 || kernel_size == 25) && group_input_channels == 1 && group_output_channels == 1 && groups > 1) {
    ukernel_type = qnnp_ukernel_type_dwconv;
  } else if (kernel_size == 1 && subsampling_height == 1 && subsampling_width == 1 && !any_padding) {
    ukernel_type = group_input_channels >= qnnp_params.q8conv_xzp.kthreshold ?
      qnnp_ukernel_type_xzp_gemm : qnnp_ukernel_type_gemm;
  } else {
    ukernel_type = qnnp_ukernel_type_conv;
  }
  size_t zero_size = 0, zero_offset = 0;

  switch (ukernel_type) {
    case qnnp_ukernel_type_dwconv:
    {
      /* Round the channel count up to a multiple of cr
       * (assumes cr is a power of two — the & -cr trick requires it). */
      const uint32_t cr = qnnp_params.q8dw9.cr;
      const uint32_t c_stride = (groups + (cr - 1)) & -cr;
      convolution->group_stride = c_stride;
      /* Per padded channel: kernel_size weight bytes + one int32 bias. */
      const size_t packed_weights_size = (sizeof(uint8_t) * kernel_size + sizeof(int32_t)) * c_stride;
      convolution->packed_weights = malloc(packed_weights_size);
      if (convolution->packed_weights == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for packed weights", packed_weights_size);
        goto error;
      }

      switch (kernel_size) {
        case 9:
          pack_q8dw_w(
            kernel_height, kernel_width,
            groups, cr,
            input_zero_point, kernel_zero_point,
            kernel, bias, convolution->packed_weights);
          break;
        case 25:
          /* change this later */
          /* NOTE(review): a 5x5 depthwise kernel is packed as three column
           * strips (columns 0-1, 2-3, and 4). The byte offsets below are
           * (strip weight bytes + 4 bias bytes) * c_stride, i.e. 10+4 and
           * 20+4 — confirm against the q8dw25 microkernel's layout. */
          pack_q8dw_w_dilation(
            kernel_height, kernel_width,
            groups, cr,
            0, kernel_height, 0, 2,
            kernel, bias, convolution->packed_weights, true);
          pack_q8dw_w_dilation(
            kernel_height, kernel_width,
            groups, cr,
            0, kernel_height, 2, 4,
            kernel, bias, (char*)convolution->packed_weights + (10 + sizeof(int32_t) / sizeof(uint8_t)) * c_stride, false);
          pack_q8dw_w_dilation(
            kernel_height, kernel_width,
            groups, cr,
            0, kernel_height, 4, 5,
            kernel, bias, (char*)convolution->packed_weights + (20 + sizeof(int32_t) / sizeof(uint8_t)) * c_stride, false);
          break;
        default:
          QNNP_UNREACHABLE;
      }

      /* NOTE(review): for fewer than 8 channels the zero buffer gets 8
       * extra leading bytes and the published pointer is offset past them —
       * presumably because the microkernel reads in 8-byte chunks and may
       * read before the pointer; confirm. */
      if (groups >= 8) {
        zero_size = sizeof(uint8_t) * c_stride;
        zero_offset = 0;
      } else {
        zero_size = sizeof(uint8_t) * c_stride + 8;
        zero_offset = sizeof(uint8_t) * 8;
      }
      break;
    }
    case qnnp_ukernel_type_xzp_gemm:
    {
      const uint32_t nr = qnnp_params.q8conv_xzp.nr;
      const uint32_t kr = qnnp_params.q8conv_xzp.kr;
      /* NOTE(review): the swizzle factor sr is sourced from the .kc field —
       * confirm the field naming in qnnp_params. */
      const uint32_t sr = qnnp_params.q8conv_xzp.kc;
      /* Round output/input channels up to multiples of nr/kr (power-of-two
       * assumption, as above). */
      const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr;
      const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr;

      const size_t packed_group_weights_size =
        (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
      convolution->packed_weights = malloc(packed_group_weights_size * groups);
      if (convolution->packed_weights == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      /* The XZP ukernel needs the padding to be 0 */
      memset(convolution->packed_weights, 0, packed_group_weights_size * groups);

      for (uint32_t group = 0; group < groups; group++) {
        pack_swizzle_q8gemm_b(
          group_output_channels, group_input_channels,
          nr, kr, sr,
          input_zero_point, kernel_zero_point,
          kernel + group * group_output_channels * group_input_channels,
          bias + group * group_output_channels,
          (void*) ((uintptr_t) convolution->packed_weights + group * packed_group_weights_size));
      }
      break;
    }
    case qnnp_ukernel_type_gemm:
    case qnnp_ukernel_type_conv:
    {
      const uint32_t nr = qnnp_params.q8conv.nr;
      const uint32_t kr = qnnp_params.q8conv.kr;
      const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr;
      const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr;

      /* Per group: kernel_size * k_stride weight bytes plus one int32 bias
       * for each of n_stride (padded) output channels. */
      const size_t packed_group_weights_size =
        (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride;
      convolution->packed_weights = malloc(packed_group_weights_size * groups);
      if (convolution->packed_weights == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for packed weights", packed_group_weights_size * groups);
        goto error;
      }
      /* Fill padding with the kernel zero point so padded weight lanes
       * contribute (w - kernel_zero_point) == 0 to the accumulator. */
      memset(convolution->packed_weights, kernel_zero_point, packed_group_weights_size * groups);

      switch (ukernel_type) {
        case qnnp_ukernel_type_gemm:
          for (uint32_t group = 0; group < groups; group++) {
            pack_q8gemm_w(
              group_output_channels, group_input_channels,
              nr, nr, kr,
              input_zero_point, kernel_zero_point,
              kernel + group * group_output_channels * group_input_channels,
              bias + group * group_output_channels,
              (void*) ((uintptr_t) convolution->packed_weights + group * packed_group_weights_size));
          }
          break;
        case qnnp_ukernel_type_conv:
          for (uint32_t group = 0; group < groups; group++) {
            pack_q8conv_w(
              group_output_channels, kernel_size, group_input_channels,
              nr, kr,
              input_zero_point, kernel_zero_point,
              kernel + group * group_output_channels * kernel_size * group_input_channels,
              bias + group * group_output_channels,
              (void*) ((uintptr_t) convolution->packed_weights + group * packed_group_weights_size));
          }
          break;
        default:
          QNNP_UNREACHABLE;
      }

      /* Same 8-byte pre-padding scheme as the dwconv case above, keyed on
       * the per-group input channel count. */
      if (group_input_channels >= 8) {
        zero_size = sizeof(uint8_t) * k_stride;
        zero_offset = 0;
      } else {
        zero_size = sizeof(uint8_t) * k_stride + 8;
        zero_offset = 8;
      }
      break;
    }
    default:
      QNNP_UNREACHABLE;
  }

  /* Padded input positions read from this buffer, which is filled with the
   * input zero point so they contribute zero after zero-point subtraction. */
  if (any_padding) {
    void* zero_buffer = malloc(zero_size);
    if (zero_buffer == NULL) {
      qnnp_log_error("failed to allocate %zu bytes for zero padding", zero_size);
      goto error;
    }
    memset(zero_buffer, input_zero_point, zero_size);
    convolution->zero_buffer = zero_buffer;
    convolution->zero_pointer = (void*) ((uintptr_t) zero_buffer + zero_offset);
  }

  /* Record the geometry for qnnp_setup_convolution2d_nhwc_q8. */
  convolution->input_padding_top = input_padding_top;
  convolution->input_padding_right = input_padding_right;
  convolution->input_padding_bottom = input_padding_bottom;
  convolution->input_padding_left = input_padding_left;

  convolution->kernel_height = kernel_height;
  convolution->kernel_width = kernel_width;
  convolution->stride_height = subsampling_height;
  convolution->stride_width = subsampling_width;
  convolution->dilation_height = dilation_height;
  convolution->dilation_width = dilation_width;
  convolution->groups = groups;
  convolution->group_input_channels = group_input_channels;
  convolution->group_output_channels = group_output_channels;

  convolution->kernel_zero_point = kernel_zero_point;

  /* XZP GEMM subtracts zero points via row sums at run time, so it uses
   * plain requantization params; the other kernels bake both zero points
   * into their quantization params. */
  if (ukernel_type == qnnp_ukernel_type_xzp_gemm) {
    convolution->requantization_params =
      qnnp_compute_requantization_params(
        convolution_scale, output_zero_point, output_min, output_max);
  } else {
    convolution->conv_quantization_params =
      qnnp_compute_conv_quantization_params(
        input_zero_point, kernel_zero_point,
        convolution_scale, output_zero_point, output_min, output_max);
  }

  convolution->ukernel_type = ukernel_type;
  convolution->format = qnnp_format_quint8;

  *convolution_out = convolution;
  return qnnp_status_success;

error:
  qnnp_delete_operator(convolution);
  return status;
}
379 | |
/*
 * Binds a previously created Q8 convolution operator to concrete tensors:
 * batch size, input spatial dimensions, and input/output pointers/strides.
 *
 * Computes the output dimensions and (re)allocates whatever per-shape
 * scratch state the selected microkernel needs: nothing for plain GEMM,
 * a row-sum buffer for XZP GEMM, or an indirection (pointer) buffer for
 * the conv and dwconv paths.
 *
 * Returns qnnp_status_success on success, or an error status without
 * modifying the previously bound shape state on validation failure.
 * The threadpool parameter is not used by this function.
 */
enum qnnp_status qnnp_setup_convolution2d_nhwc_q8(
    qnnp_operator_t convolution,
    size_t batch_size,
    size_t input_height,
    size_t input_width,
    const uint8_t* input,
    size_t input_pixel_stride,
    uint8_t* output,
    size_t output_pixel_stride,
    pthreadpool_t threadpool)
{
  if (!qnnp_params.initialized) {
    qnnp_log_error("qnnp_setup_convolution2d_nhwc_q8 failed because QNNPACK is not properly initialized");
    return qnnp_status_uninitialized;
  }

  /* An empty batch is a valid no-op configuration. */
  if (batch_size == 0) {
    convolution->batch_size = 0;
    return qnnp_status_success;
  }

  if (input_width == 0 || input_height == 0) {
    qnnp_log_error(
      "failed to setup convolution with %zux%zu input: input dimensions must be non-zero",
      input_width,
      input_height);
    return qnnp_status_invalid_parameter;
  }

  convolution->batch_size = batch_size;
  convolution->input_height = input_height;
  convolution->input_width = input_width;
  convolution->input = input;
  convolution->input_pixel_stride = input_pixel_stride;

  /* Output extent is derived from the padded input, kernel, dilation and
   * stride recorded at creation time. */
  convolution->output_height = compute_output_dimension(
    convolution->input_padding_top + input_height + convolution->input_padding_bottom,
    convolution->kernel_height,
    convolution->dilation_height,
    convolution->stride_height);
  convolution->output_width = compute_output_dimension(
    convolution->input_padding_left + input_width + convolution->input_padding_right,
    convolution->kernel_width,
    convolution->dilation_width,
    convolution->stride_width);
  convolution->output = output;
  convolution->output_pixel_stride = output_pixel_stride;

  switch (convolution->ukernel_type) {
    case qnnp_ukernel_type_gemm:
      /* Convolution maps directly to GEMM and doesn't use indirection buffer */
      return qnnp_status_success;
    case qnnp_ukernel_type_xzp_gemm:
    {
      /* XZP GEMM needs one int32 row sum per input pixel per group.
       * realloc keeps the old buffer valid on failure since the result is
       * checked before overwriting convolution->a_sum. */
      const size_t groups = convolution->groups;
      void* a_sum = (void*) realloc(convolution->a_sum, sizeof(int32_t) * batch_size * groups * input_height * input_width);
      if (a_sum == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for row sum data",
          sizeof(int32_t) * batch_size * groups * input_height * input_width);
        return qnnp_status_out_of_memory;
      }
      convolution->a_sum = a_sum;
      return qnnp_status_success;
    }
    case qnnp_ukernel_type_conv:
    {
      const size_t groups = convolution->groups;
      const size_t kernel_height = convolution->kernel_height;
      const size_t kernel_width = convolution->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution->output_height;
      const size_t output_width = convolution->output_width;
      const size_t output_size = output_height * output_width;
      /* Output pixels are processed in tiles of mr rows; the indirection
       * buffer is sized for the tile-rounded output so the last partial
       * tile can be addressed uniformly. */
      const size_t output_tile_size = qnnp_params.q8conv.mr;
      const size_t tiled_output_size = round_up(output_size, output_tile_size);
      /* One input-row pointer per kernel element per (tiled) output pixel. */
      const size_t indirection_buffer_size = sizeof(void*) * batch_size * groups * tiled_output_size * kernel_size;

      const void** indirection_buffer = (const void**) realloc(convolution->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return qnnp_status_out_of_memory;
      }
      convolution->indirection_buffer = indirection_buffer;

      qnnp_indirection_init_conv2d(convolution, output_tile_size, tiled_output_size);
      return qnnp_status_success;
    }
    case qnnp_ukernel_type_dwconv:
    {
      const size_t kernel_height = convolution->kernel_height;
      const size_t kernel_width = convolution->kernel_width;
      const size_t kernel_size = kernel_height * kernel_width;
      const size_t output_height = convolution->output_height;
      const size_t output_width = convolution->output_width;
      /* With dilation, consecutive output pixels cannot share kernel-width
       * pointer columns, so the step degenerates to the full kernel width.
       * NOTE(review): the step_height formula below encodes the pointer
       * layout expected by qnnp_indirection_init_dwconv2d — confirm there. */
      const size_t step_width = convolution->dilation_width == 1 ? convolution->stride_width : kernel_width;
      const size_t step_height = kernel_size + (output_width * step_width - 1) * kernel_height;
      const size_t indirection_buffer_size = sizeof(void*) * batch_size * output_height * step_height;

      const void** indirection_buffer =
        (const void**) realloc(convolution->indirection_buffer, indirection_buffer_size);
      if (indirection_buffer == NULL) {
        qnnp_log_error("failed to allocate %zu bytes for indirection buffer", indirection_buffer_size);
        return qnnp_status_out_of_memory;
      }
      convolution->indirection_buffer = indirection_buffer;

      qnnp_indirection_init_dwconv2d(convolution, 0, step_height, step_width);
      return qnnp_status_success;
    }
    default:
      QNNP_UNREACHABLE;
  }
}
493 | |