1 | /* |
2 | * Copyright (c) Facebook, Inc. and its affiliates. |
3 | * All rights reserved. |
4 | * |
5 | * This source code is licensed under the BSD-style license found in the |
6 | * LICENSE file in the root directory of this source tree. |
7 | */ |
8 | |
9 | #include <assert.h> |
10 | #include <stdbool.h> |
11 | #include <stddef.h> |
12 | #include <stdint.h> |
13 | #include <string.h> |
14 | #include <math.h> |
15 | |
16 | #include <qnnpack.h> |
17 | #include <qnnpack/operator.h> |
18 | #include <qnnpack/requantization.h> |
19 | #include <qnnpack/log.h> |
20 | #include <qnnpack/math.h> |
21 | #include <qnnpack/pack.h> |
22 | #include <qnnpack/params.h> |
23 | #include <qnnpack/indirection.h> |
24 | |
25 | |
/*
 * Computes the deconvolution output extent along one spatial axis.
 *
 * input_dimension          - input extent along the axis
 * input_padding_dimension  - total padding along the axis, subtracted from the result
 * adjustment_dimension     - extra extent added to the result
 * kernel_dimension         - kernel extent along the axis
 * dilation_dimension       - spacing between neighbouring kernel taps
 * stride_dimension         - spacing between neighbouring input elements in the output
 *
 * All arithmetic is unsigned: the result wraps around if the padding is
 * larger than the sum of the other terms, or if input_dimension or
 * kernel_dimension is zero.
 */
static inline size_t compute_output_dimension(
    size_t input_dimension,
    size_t input_padding_dimension,
    size_t adjustment_dimension,
    size_t kernel_dimension,
    size_t dilation_dimension,
    size_t stride_dimension)
{
  /* Span from the first to the last kernel tap, inclusive, once dilation is applied. */
  const size_t dilated_kernel_extent = 1 + dilation_dimension * (kernel_dimension - 1);
  /* Offset of the last input element relative to the first one after striding. */
  const size_t strided_input_extent = (input_dimension - 1) * stride_dimension;
  const size_t unpadded_extent = strided_input_extent + dilated_kernel_extent + adjustment_dimension;
  return unpadded_extent - input_padding_dimension;
}
37 | |
38 | enum qnnp_status qnnp_create_deconvolution2d_nhwc_q8( |
39 | uint32_t input_padding_top, |
40 | uint32_t input_padding_right, |
41 | uint32_t input_padding_bottom, |
42 | uint32_t input_padding_left, |
43 | uint32_t adjustment_height, |
44 | uint32_t adjustment_width, |
45 | uint32_t kernel_height, |
46 | uint32_t kernel_width, |
47 | uint32_t stride_height, |
48 | uint32_t stride_width, |
49 | uint32_t dilation_height, |
50 | uint32_t dilation_width, |
51 | uint32_t groups, |
52 | size_t group_input_channels, |
53 | size_t group_output_channels, |
54 | uint8_t input_zero_point, |
55 | float input_scale, |
56 | uint8_t kernel_zero_point, |
57 | float kernel_scale, |
58 | const uint8_t* kernel, |
59 | const int32_t* bias, |
60 | uint8_t output_zero_point, |
61 | float output_scale, |
62 | uint8_t output_min, |
63 | uint8_t output_max, |
64 | uint32_t flags, |
65 | qnnp_operator_t* deconvolution_out) |
66 | { |
67 | qnnp_operator_t deconvolution = NULL; |
68 | enum qnnp_status status = qnnp_status_uninitialized; |
69 | |
70 | if (!qnnp_params.initialized) { |
71 | qnnp_log_error("qnnp_create_deconvolution2d_nhwc_q8 failed because QNNPACK is not properly initialized" ); |
72 | goto error; |
73 | } |
74 | |
75 | status = qnnp_status_invalid_parameter; |
76 | |
77 | if (kernel_width == 0 || kernel_height == 0) { |
78 | qnnp_log_error( |
79 | "failed to create deconvolution with %" PRIu32 "x%" PRIu32 " kernel: kernel dimensions must be non-zero" , |
80 | kernel_width, kernel_height); |
81 | goto error; |
82 | } |
83 | |
84 | if (stride_width == 0 || stride_height == 0) { |
85 | qnnp_log_error( |
86 | "failed to create deconvolution with %" PRIu32 "x%" PRIu32 " stride: " |
87 | "stride dimensions must be non-zero" , |
88 | stride_width, stride_height); |
89 | goto error; |
90 | } |
91 | |
92 | if (dilation_width == 0 || dilation_height == 0) { |
93 | qnnp_log_error( |
94 | "failed to create deconvolution with %" PRIu32 "x%" PRIu32 " dilation: " |
95 | "dilation dimensions must be non-zero" , |
96 | dilation_width, dilation_height); |
97 | goto error; |
98 | } |
99 | |
100 | if (input_scale <= 0.0f || !isnormal(input_scale)) { |
101 | qnnp_log_error( |
102 | "failed to create deconvolution with %.7g input scale: scale must be finite and positive" , input_scale); |
103 | goto error; |
104 | } |
105 | |
106 | if (kernel_scale <= 0.0f || !isnormal(kernel_scale)) { |
107 | qnnp_log_error( |
108 | "failed to create deconvolution with %.7g kernel scale: scale must be finite and positive" , kernel_scale); |
109 | goto error; |
110 | } |
111 | |
112 | if (output_scale <= 0.0f || !isnormal(output_scale)) { |
113 | qnnp_log_error( |
114 | "failed to create deconvolution with %.7g output scale: scale must be finite and positive" , output_scale); |
115 | goto error; |
116 | } |
117 | |
118 | status = qnnp_status_unsupported_parameter; |
119 | |
120 | const float deconvolution_scale = input_scale * kernel_scale / output_scale; |
121 | if (deconvolution_scale >= 1.0f) { |
122 | qnnp_log_error( |
123 | "failed to create deconvolution with %.7g input scale, %.7g kernel scale, and %.7g output scale: " |
124 | "deconvolution scale %.7g is greater or equal to 1.0" , |
125 | input_scale, kernel_scale, output_scale, deconvolution_scale); |
126 | goto error; |
127 | } |
128 | |
129 | status = qnnp_status_out_of_memory; |
130 | |
131 | deconvolution = calloc(1, sizeof(struct qnnp_operator)); |
132 | if (deconvolution == NULL) { |
133 | qnnp_log_error("failed to allocate %zu bytes for qnnp_operator structure" , sizeof(struct qnnp_operator)); |
134 | goto error; |
135 | } |
136 | |
137 | const uint32_t nr = qnnp_params.q8conv.nr; |
138 | const uint32_t kr = qnnp_params.q8conv.kr; |
139 | |
140 | const uint32_t n_stride = (group_output_channels + (nr - 1)) & -nr; |
141 | const uint32_t k_stride = (group_input_channels + (kr - 1)) & -kr; |
142 | const uint32_t kernel_size = kernel_height * kernel_width; |
143 | const size_t packed_group_weights_size = (sizeof(uint8_t) * kernel_size * k_stride + sizeof(int32_t)) * n_stride; |
144 | deconvolution->packed_weights = malloc(packed_group_weights_size * groups); |
145 | if (deconvolution->packed_weights == NULL) { |
146 | qnnp_log_error("failed to allocate %zu bytes for packed weights" , packed_group_weights_size * groups); |
147 | goto error; |
148 | } |
149 | memset(deconvolution->packed_weights, kernel_zero_point, packed_group_weights_size * groups); |
150 | |
151 | for (uint32_t group = 0; group < groups; group++) { |
152 | pack_q8deconv_w( |
153 | group_output_channels, kernel_size, group_input_channels, |
154 | nr, kr, |
155 | input_zero_point, kernel_zero_point, |
156 | kernel + group * group_output_channels * kernel_size * group_input_channels, |
157 | bias + group * group_output_channels, |
158 | (void*) ((uintptr_t) deconvolution->packed_weights + group * packed_group_weights_size)); |
159 | } |
160 | |
161 | size_t zero_size = sizeof(uint8_t) * k_stride; |
162 | size_t zero_offset = 0; |
163 | if (group_input_channels < 8) { |
164 | zero_size += 8; |
165 | zero_offset = 8; |
166 | } |
167 | |
168 | void* zero_buffer = malloc(zero_size); |
169 | if (zero_buffer == NULL) { |
170 | qnnp_log_error("failed to allocate %zu bytes for zero padding" , zero_size); |
171 | goto error; |
172 | } |
173 | memset(zero_buffer, input_zero_point, zero_size); |
174 | deconvolution->zero_buffer = zero_buffer; |
175 | deconvolution->zero_pointer = (void*) ((uintptr_t) zero_buffer + zero_offset); |
176 | |
177 | deconvolution->input_padding_top = input_padding_top; |
178 | deconvolution->input_padding_right = input_padding_right; |
179 | deconvolution->input_padding_bottom = input_padding_bottom; |
180 | deconvolution->input_padding_left = input_padding_left; |
181 | deconvolution->adjustment_height = adjustment_height; |
182 | deconvolution->adjustment_width = adjustment_width; |
183 | |
184 | deconvolution->kernel_height = kernel_height; |
185 | deconvolution->kernel_width = kernel_width; |
186 | deconvolution->stride_height = stride_height; |
187 | deconvolution->stride_width = stride_width; |
188 | deconvolution->dilation_height = dilation_height; |
189 | deconvolution->dilation_width = dilation_width; |
190 | deconvolution->groups = groups; |
191 | deconvolution->group_input_channels = group_input_channels; |
192 | deconvolution->group_output_channels = group_output_channels; |
193 | |
194 | deconvolution->kernel_zero_point = kernel_zero_point; |
195 | |
196 | deconvolution->conv_quantization_params = |
197 | qnnp_compute_conv_quantization_params( |
198 | input_zero_point, kernel_zero_point, |
199 | deconvolution_scale, output_zero_point, output_min, output_max); |
200 | |
201 | deconvolution->ukernel_type = qnnp_ukernel_type_conv; |
202 | deconvolution->format = qnnp_format_quint8; |
203 | |
204 | *deconvolution_out = deconvolution; |
205 | return qnnp_status_success; |
206 | |
207 | error: |
208 | qnnp_delete_operator(deconvolution); |
209 | return status; |
210 | } |
211 | |
212 | enum qnnp_status qnnp_setup_deconvolution2d_nhwc_q8( |
213 | qnnp_operator_t deconvolution, |
214 | size_t batch_size, |
215 | size_t input_height, |
216 | size_t input_width, |
217 | const uint8_t* input, |
218 | size_t input_pixel_stride, |
219 | uint8_t* output, |
220 | size_t output_pixel_stride, |
221 | pthreadpool_t threadpool) |
222 | { |
223 | if (!qnnp_params.initialized) { |
224 | qnnp_log_error("qnnp_setup_deconvolution2d_nhwc_q8 failed because QNNPACK is not properly initialized" ); |
225 | return qnnp_status_uninitialized; |
226 | } |
227 | |
228 | if (batch_size == 0) { |
229 | deconvolution->batch_size = 0; |
230 | return qnnp_status_success; |
231 | } |
232 | |
233 | if (input_width == 0 || input_height == 0) { |
234 | qnnp_log_error( |
235 | "failed to setup deconvolution with %zux%zu input: input dimensions must be non-zero" , |
236 | input_width, |
237 | input_height); |
238 | return qnnp_status_invalid_parameter; |
239 | } |
240 | |
241 | deconvolution->batch_size = batch_size; |
242 | deconvolution->input_height = input_height; |
243 | deconvolution->input_width = input_width; |
244 | deconvolution->input = input; |
245 | deconvolution->input_pixel_stride = input_pixel_stride; |
246 | deconvolution->output = output; |
247 | deconvolution->output_pixel_stride = output_pixel_stride; |
248 | |
249 | const size_t kernel_height = deconvolution->kernel_height; |
250 | const size_t kernel_width = deconvolution->kernel_width; |
251 | const size_t kernel_size = kernel_height * kernel_width; |
252 | const size_t stride_height = deconvolution->stride_height; |
253 | const size_t stride_width = deconvolution->stride_width; |
254 | const size_t output_height = deconvolution->output_height = compute_output_dimension( |
255 | input_height, deconvolution->input_padding_top + deconvolution->input_padding_bottom, |
256 | deconvolution->adjustment_height, kernel_height, deconvolution->dilation_height, stride_height); |
257 | const size_t output_width = deconvolution->output_width = compute_output_dimension( |
258 | input_width, deconvolution->input_padding_left + deconvolution->input_padding_right, |
259 | deconvolution->adjustment_width, kernel_width, deconvolution->dilation_width, stride_width); |
260 | |
261 | const size_t groups = deconvolution->groups; |
262 | const size_t output_size = output_height * output_width; |
263 | const size_t output_tile_size = qnnp_params.q8conv.mr; |
264 | const size_t tiled_output_size = round_up(output_size, output_tile_size); |
265 | const size_t indirection_buffer_size = sizeof(void*) * batch_size * groups * tiled_output_size * kernel_size; |
266 | |
267 | const void** indirection_buffer = (const void**) realloc(deconvolution->indirection_buffer, indirection_buffer_size); |
268 | if (indirection_buffer == NULL) { |
269 | qnnp_log_error("failed to allocate %zu bytes for indirection buffer" , indirection_buffer_size); |
270 | return qnnp_status_out_of_memory; |
271 | } |
272 | deconvolution->indirection_buffer = indirection_buffer; |
273 | |
274 | qnnp_indirection_init_deconv2d(deconvolution, output_tile_size, tiled_output_size); |
275 | |
276 | return qnnp_status_success; |
277 | } |
278 | |