// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fp16.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/math.h>

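// Populates the indirection buffer for a 2D convolution: for every output pixel, processed in
// tiles of output_tile_size, and every kernel element, stores a pointer to the corresponding
// input pixel, or to the operator's zero buffer when the element falls into the padding region.
// Entry layout: output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset.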
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);

  for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
    for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
      const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_x = output_y_x.remainder;
      const size_t output_y = output_y_x.quotient;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
        if (input_y < input_height) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
            const size_t kernel_index = kernel_y * kernel_width + kernel_x;
            const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
            if (input_x < input_width) {
              indirection_buffer[index] = (const void*)
                ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
            } else {
              indirection_buffer[index] = zero;
            }
          }
        } else {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t kernel_index = kernel_y * kernel_width + kernel_x;
            const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

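// Populates the indirection buffer for a 2D deconvolution (transposed convolution): each output
// position is mapped back to an input position by dividing by the stride; entries whose division
// is inexact, or whose input coordinates fall outside the input, point to the zero buffer.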
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
  const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
  const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);

  for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
    for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
      const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_x = output_y_x.remainder;
      const size_t output_y = output_y_x.quotient;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        const size_t y = output_y + padding_top - kernel_y * dilation_height;
        const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor);
        for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
          const size_t x = output_x + padding_left - kernel_x * dilation_width;
          const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor);
          const size_t kernel_index = kernel_y * kernel_width + kernel_x;
          const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
          if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) {
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          } else {
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

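// Populates the indirection buffer for a 2D deconvolution decomposed into subconvolutions: the
// kernel is split into stride_height x stride_width subkernels, each applied to its own slice of
// the output grid. Also fills indirection_buffer and indirection_y_stride in every
// subconvolution_params entry; indirection_x_stride is expected to be initialized before this call.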
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            const size_t y = output_y + padding_top - kernel_y;
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}

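// Populates the indirection buffer for a 2D depthwise convolution. Entry layout:
// output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y.
// When primary_tile exceeds the kernel size, the entries past the last regular pointer are filled
// with a copy of it, presumably so that micro-kernels with an oversized primary tile only read
// valid addresses.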
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  size_t primary_tile,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  for (size_t output_y = 0; output_y < output_height; output_y++) {
    for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
      const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
      if (input_y < input_height) {
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
            const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
            if (input_x < input_width) {
              indirection_buffer[index] =
                (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
            } else {
              indirection_buffer[index] = zero;
            }
          }
        }
      } else {
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }

  const void* last_output_pixel = indirection_buffer[output_height * step_height - 1];
  const size_t last_kernel_index = output_height * step_height - (kernel_height * kernel_width);
  for (size_t tile_index = kernel_height * kernel_width; tile_index < primary_tile; tile_index++) {
    indirection_buffer[last_kernel_index + tile_index] = last_output_pixel;
  }
}

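// Populates the indirection buffer for 2D max pooling. Without dilation, out-of-bounds pooling
// elements are clamped to the nearest input pixel on the border; with dilation, border clamping is
// not applicable, so out-of-bounds elements are redirected to a safe in-bounds pixel instead.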
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamping to the input border does not work for pooling with dilation.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}

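// Populates the indirection buffer and packed weights for bilinear 2D resize in HWC layout with
// half-precision weights: for each output pixel, writes 4 pointers (top-left, top-right,
// bottom-left, bottom-right input pixels) to the indirection buffer and the FP16 interpolation
// weights (alpha_x, alpha_y) to packed_weights. Coordinates follow either the
// align-corners/TensorFlow-legacy mapping or the half-pixel-centers mapping.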
void xnn_indirection_init_resize_bilinear2d_hwc_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  }
}

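// Same as the F16 variant above, but the interpolation weights are stored as single-precision floats.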
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}

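// Same as the F16/F32 variants above, but the interpolation weights are quantized to Q11 fixed
// point: alpha * 2^11, rounded to the nearest int16_t.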
void xnn_indirection_init_resize_bilinear2d_hwc_q11(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  int16_t* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}

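// Populates the indirection buffer and packed weights for bilinear 2D resize in CHW layout with
// half-precision weights: only the top-left and bottom-left pointers are stored per output pixel,
// since CHW kernels read the right neighbors from the contiguous row. When the left pixel is the
// last column, the pointer is shifted one pixel left and alpha_x is set to 1.0f so that a valid
// right neighbor exists.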
void xnn_indirection_init_resize_bilinear2d_chw_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        uint32_t input_x_left = (uint32_t) (int32_t) input_x;

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        w += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }

        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        w += 2;
      }
    }
  }
}

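// Same as the CHW F16 variant above, but the interpolation weights are stored as single-precision floats.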
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        uint32_t input_x_left = (uint32_t) (int32_t) input_x;

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }

        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  }
}

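// Populates the indirection buffer for 2D max unpooling: for every input pixel and pooling
// element, stores a pointer to the output pixel it scatters into, clamped to the output bounds.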
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* output = op->output;
  const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
  const size_t batch_size = op->batch_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t output_padding_top = op->padding_top;
  const size_t output_padding_left = op->padding_left;

  for (size_t image = batch_start; image < batch_size; image++) {
    for (size_t input_y = 0; input_y < input_height; input_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1);
        for (size_t input_x = 0; input_x < input_width; input_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1);
            indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] =
              (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
          }
        }
      }
    }
  }
}

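// Fills the pixelwise buffer for padded average pooling: for each output pixel, stores the
// reciprocal of the number of valid (non-padding) input pixels in its pooling window, as FP16.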
void xnn_indirection_init_pavgpool2d_f16(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  uint16_t* pixelwise_buffer)
{
  for (size_t output_y = 0; output_y < output_height; output_y++) {
    const size_t input_y_start = doz(output_y * stride_height, padding_top);
    const size_t input_y_end = min(doz(output_y * stride_height + pooling_height, padding_top), input_height);
    const uint32_t input_y_range = (uint32_t) (input_y_end - input_y_start);
    for (size_t output_x = 0; output_x < output_width; output_x++) {
      const size_t input_x_start = doz(output_x * stride_width, padding_left);
      const size_t input_x_end = min(doz(output_x * stride_width + pooling_width, padding_left), input_width);
      const uint32_t input_x_range = (uint32_t) (input_x_end - input_x_start);
      *pixelwise_buffer++ = fp16_ieee_from_fp32_value(1.0f / ((float) (int32_t) (input_y_range * input_x_range)));
    }
  }
}

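// Same as the F16 variant above, but the reciprocals are stored as single-precision floats.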
void xnn_indirection_init_pavgpool2d_f32(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  float* pixelwise_buffer)
{
  for (size_t output_y = 0; output_y < output_height; output_y++) {
    const size_t input_y_start = doz(output_y * stride_height, padding_top);
    const size_t input_y_end = min(doz(output_y * stride_height + pooling_height, padding_top), input_height);
    const uint32_t input_y_range = (uint32_t) (input_y_end - input_y_start);
    for (size_t output_x = 0; output_x < output_width; output_x++) {
      const size_t input_x_start = doz(output_x * stride_width, padding_left);
      const size_t input_x_end = min(doz(output_x * stride_width + pooling_width, padding_left), input_width);
      const uint32_t input_x_range = (uint32_t) (input_x_end - input_x_start);
      *pixelwise_buffer++ = 1.0f / ((float) (int32_t) (input_y_range * input_x_range));
    }
  }
}