1// Copyright (c) Facebook, Inc. and its affiliates.
2// All rights reserved.
3//
4// Copyright 2019 Google LLC
5//
6// This source code is licensed under the BSD-style license found in the
7// LICENSE file in the root directory of this source tree.
8
#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fp16.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/math.h>
#include <xnnpack/operator.h>
19
20
21void xnn_indirection_init_conv2d(
22 xnn_operator_t op,
23 size_t output_tile_size,
24 uint32_t log2_element_size)
25{
26 const void** indirection_buffer = op->indirection_buffer;
27 const void* input = op->input;
28 const void* zero = op->zero_buffer;
29 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
30 const size_t input_height = op->input_height;
31 const size_t input_width = op->input_width;
32 const size_t output_height = op->output_height;
33 const size_t output_width = op->output_width;
34 const size_t kernel_height = op->kernel_height;
35 const size_t kernel_width = op->kernel_width;
36 const size_t stride_height = op->stride_height;
37 const size_t stride_width = op->stride_width;
38 const size_t dilation_height = op->dilation_height;
39 const size_t dilation_width = op->dilation_width;
40 const size_t input_padding_top = op->padding_top;
41 const size_t input_padding_left = op->padding_left;
42
43 const size_t output_size = output_height * output_width;
44 const size_t tiled_output_size = round_up(output_size, output_tile_size);
45 const size_t kernel_size = kernel_height * kernel_width;
46
47 const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
48
49 for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
50 for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
51 const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
52 const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
53 const size_t output_x = output_y_x.remainder;
54 const size_t output_y = output_y_x.quotient;
55 for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
56 const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
57 if (input_y < input_height) {
58 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
59 const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
60 const size_t kernel_index = kernel_y * kernel_width + kernel_x;
61 const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
62 if (input_x < input_width) {
63 indirection_buffer[index] = (const void*)
64 ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
65 } else {
66 indirection_buffer[index] = zero;
67 }
68 }
69 } else {
70 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
71 const size_t kernel_index = kernel_y * kernel_width + kernel_x;
72 const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
73 indirection_buffer[index] = zero;
74 }
75 }
76 }
77 }
78 }
79}
80
81void xnn_indirection_init_deconv2d(
82 xnn_operator_t op,
83 size_t output_tile_size,
84 uint32_t log2_element_size)
85{
86 const void** indirection_buffer = op->indirection_buffer;
87 const void* input = op->input;
88 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
89 const void* zero = op->zero_buffer;
90 const size_t input_height = op->input_height;
91 const size_t input_width = op->input_width;
92 const size_t output_height = op->output_height;
93 const size_t output_width = op->output_width;
94 const size_t kernel_height = op->kernel_height;
95 const size_t kernel_width = op->kernel_width;
96 const size_t stride_height = op->stride_height;
97 const size_t stride_width = op->stride_width;
98 const size_t dilation_height = op->dilation_height;
99 const size_t dilation_width = op->dilation_width;
100 const size_t padding_top = op->padding_top;
101 const size_t padding_left = op->padding_left;
102
103 const size_t output_size = output_height * output_width;
104 const size_t tiled_output_size = round_up(output_size, output_tile_size);
105 const size_t kernel_size = kernel_height * kernel_width;
106
107 const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
108 const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
109 const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);
110
111 for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
112 for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
113 const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
114 const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
115 const size_t output_x = output_y_x.remainder;
116 const size_t output_y = output_y_x.quotient;
117 for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
118 const size_t y = output_y + padding_top - kernel_y * dilation_height;
119 const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor);
120 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
121 const size_t x = output_x + padding_left - kernel_x * dilation_width;
122 const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor);
123 const size_t kernel_index = kernel_y * kernel_width + kernel_x;
124 const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
125 if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) {
126 indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
127 } else {
128 indirection_buffer[index] = zero;
129 }
130 }
131 }
132 }
133 }
134}
135
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  // Initializes the indirection buffer and per-subconvolution parameters for a
  // transposed convolution decomposed into stride_height * stride_width
  // subconvolutions, one per (offset_y, offset_x) phase of the stride.
  // Entries are written sequentially through the advancing pointer below.
  const void** indirection_buffer = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input = op->input;
  // Convert the pixel stride from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    // First output row that belongs to this vertical phase.
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      // First output column of this horizontal phase, and how many output
      // columns this subconvolution produces.
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      // Record where this subconvolution's entries start. The y-stride spans a
      // full row of tiles. NOTE(review): indirection_x_stride is read but not
      // written here — presumably initialized by the caller; verify.
      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          // Only kernel rows on this phase contribute to these output rows.
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            const size_t y = output_y + padding_top - kernel_y;
            // Unsigned wraparound: a "negative" y yields a huge input_y that
            // fails the bounds check below and selects the zero buffer.
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                // Clamp so a trailing partial tile replicates the last column.
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}
200
201void xnn_indirection_init_dwconv2d(
202 xnn_operator_t op,
203 size_t step_height,
204 size_t step_width,
205 size_t primary_tile,
206 uint32_t log2_element_size)
207{
208 const void** indirection_buffer = op->indirection_buffer;
209 const void* input = op->input;
210 const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
211 const void* zero = op->zero_buffer;
212 const size_t input_height = op->input_height;
213 const size_t input_width = op->input_width;
214 const size_t output_height = op->output_height;
215 const size_t output_width = op->output_width;
216 const size_t kernel_height = op->kernel_height;
217 const size_t kernel_width = op->kernel_width;
218 const size_t stride_height = op->stride_height;
219 const size_t stride_width = op->stride_width;
220 const size_t dilation_height = op->dilation_height;
221 const size_t dilation_width = op->dilation_width;
222 const size_t input_padding_top = op->padding_top;
223 const size_t input_padding_left = op->padding_left;
224
225 for (size_t output_y = 0; output_y < output_height; output_y++) {
226 for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
227 const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
228 if (input_y < input_height) {
229 for (size_t output_x = 0; output_x < output_width; output_x++) {
230 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
231 const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
232 const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
233 if (input_x < input_width) {
234 indirection_buffer[index] =
235 (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
236 } else {
237 indirection_buffer[index] = zero;
238 }
239 }
240 }
241 } else {
242 for (size_t output_x = 0; output_x < output_width; output_x++) {
243 for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
244 const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
245 indirection_buffer[index] = zero;
246 }
247 }
248 }
249 }
250 }
251
252 const void* last_output_pixel = indirection_buffer[output_height * step_height - 1];
253 const size_t last_kernel_index = output_height * step_height - (kernel_height * kernel_width);
254 for (size_t tile_index = kernel_height * kernel_width; tile_index < primary_tile; tile_index++) {
255 indirection_buffer[last_kernel_index + tile_index] = last_output_pixel;
256 }
257}
258
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  // Initializes the indirection buffer for 2D max pooling. Out-of-bounds taps
  // are redirected to an in-bounds pixel instead of a zero buffer: repeating a
  // pixel cannot change the maximum, so this is safe for max pooling.
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  // Convert the pixel stride from elements to bytes.
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamp to the border doesn't work for pooling with dilation.
    // Instead, replace each out-of-bounds tap with a "safe" in-bounds pixel
    // derived from the output position alone.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        // Fallback row used when the tap's row is outside the input.
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        // Unsigned wraparound: negative coordinates compare >= input_height.
        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            // Fallback column, mirroring the row logic above.
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    // No dilation: clamp every tap to the input borders (doz() saturates the
    // padding subtraction at zero, min() clamps the far edge).
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}
336
void xnn_indirection_init_resize_bilinear2d_hwc_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // For every output pixel, writes 4 input pointers (top-left, top-right,
  // bottom-left, bottom-right) into indirection_buffer and the two bilinear
  // weights (alpha_x, alpha_y) as IEEE FP16 into packed_weights.
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // align_corners maps the corner pixels exactly by shrinking both grids by 1.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Origin-aligned sampling: input coordinate is output * scale, no offset.
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      // Hoist the per-row byte offsets out of the inner loop.
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t x0 = (uint32_t) (int32_t) input_x;
        const uint32_t x1 = math_min_u32(x0 + 1, input_x_max);
        const float alpha_x = input_x - (float) x0;
        const size_t left = (size_t) x0 * input_pixel_stride;
        const size_t right = (size_t) x1 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (top_row + right);
        indirection_buffer[2] = (void*) (bottom_row + left);
        indirection_buffer[3] = (void*) (bottom_row + right);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  } else {
    // Half-pixel-center sampling, clamped to the input borders.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      assert((int32_t) y0 >= 0);
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t x0 = (uint32_t) (int32_t) input_x;
        assert((int32_t) x0 >= 0);
        const uint32_t x1 = math_min_u32(x0 + 1, input_x_max);
        const float alpha_x = input_x - (float) x0;
        const size_t left = (size_t) x0 * input_pixel_stride;
        const size_t right = (size_t) x1 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (top_row + right);
        indirection_buffer[2] = (void*) (bottom_row + left);
        indirection_buffer[3] = (void*) (bottom_row + right);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  }
}
432
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // For every output pixel, writes 4 input pointers (top-left, top-right,
  // bottom-left, bottom-right) into indirection_buffer and the two bilinear
  // weights (alpha_x, alpha_y) as FP32 into packed_weights.
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // align_corners maps the corner pixels exactly by shrinking both grids by 1.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Origin-aligned sampling: input coordinate is output * scale, no offset.
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      // Hoist the per-row byte offsets out of the inner loop.
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t x0 = (uint32_t) (int32_t) input_x;
        const uint32_t x1 = math_min_u32(x0 + 1, input_x_max);
        const float alpha_x = input_x - (float) x0;
        const size_t left = (size_t) x0 * input_pixel_stride;
        const size_t right = (size_t) x1 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (top_row + right);
        indirection_buffer[2] = (void*) (bottom_row + left);
        indirection_buffer[3] = (void*) (bottom_row + right);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel-center sampling, clamped to the input borders.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      assert((int32_t) y0 >= 0);
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t x0 = (uint32_t) (int32_t) input_x;
        assert((int32_t) x0 >= 0);
        const uint32_t x1 = math_min_u32(x0 + 1, input_x_max);
        const float alpha_x = input_x - (float) x0;
        const size_t left = (size_t) x0 * input_pixel_stride;
        const size_t right = (size_t) x1 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (top_row + right);
        indirection_buffer[2] = (void*) (bottom_row + left);
        indirection_buffer[3] = (void*) (bottom_row + right);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}
527
void xnn_indirection_init_resize_bilinear2d_hwc_q11(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  int16_t* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // For every output pixel, writes 4 input pointers (top-left, top-right,
  // bottom-left, bottom-right) into indirection_buffer and the two bilinear
  // weights (alpha_x, alpha_y) as Q1.11 fixed-point into packed_weights.
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // align_corners maps the corner pixels exactly by shrinking both grids by 1.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Origin-aligned sampling: input coordinate is output * scale, no offset.
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      // Hoist the per-row byte offsets out of the inner loop.
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t x0 = (uint32_t) (int32_t) input_x;
        const uint32_t x1 = math_min_u32(x0 + 1, input_x_max);
        const float alpha_x = input_x - (float) x0;
        const size_t left = (size_t) x0 * input_pixel_stride;
        const size_t right = (size_t) x1 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (top_row + right);
        indirection_buffer[2] = (void*) (bottom_row + left);
        indirection_buffer[3] = (void*) (bottom_row + right);
        // Round-to-nearest conversion to Q1.11 (scale by 2**11).
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    // Half-pixel-center sampling, clamped to the input borders.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      assert((int32_t) y0 >= 0);
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t x0 = (uint32_t) (int32_t) input_x;
        assert((int32_t) x0 >= 0);
        const uint32_t x1 = math_min_u32(x0 + 1, input_x_max);
        const float alpha_x = input_x - (float) x0;
        const size_t left = (size_t) x0 * input_pixel_stride;
        const size_t right = (size_t) x1 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (top_row + right);
        indirection_buffer[2] = (void*) (bottom_row + left);
        indirection_buffer[3] = (void*) (bottom_row + right);
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}
622
void xnn_indirection_init_resize_bilinear2d_chw_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // CHW variant: writes only 2 pointers per output pixel (top-left and
  // bottom-left); the kernels read the right-hand neighbours themselves.
  // Weights (alpha_x, alpha_y) are stored as IEEE FP16.
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // align_corners maps the corner pixels exactly by shrinking both grids by 1.
  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Origin-aligned sampling: input coordinate is output * scale, no offset.
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      // Hoist the per-row byte offsets out of the inner loop.
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        uint32_t x0 = (uint32_t) (int32_t) input_x;

        float alpha_x = input_x - (float) x0;
        if (x0 == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels: shift the window left one pixel
          // and give the right pixel full weight.
          --x0;
          alpha_x = 1.0f;
        }
        const size_t left = (size_t) x0 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (bottom_row + left);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        w += 2;
      }
    }
  } else {
    // Half-pixel-center sampling, clamped to the input borders.
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t y0 = (uint32_t) (int32_t) input_y;
      assert((int32_t) y0 >= 0);
      const uint32_t y1 = math_min_u32(y0 + 1, input_y_max);
      const float alpha_y = input_y - (float) y0;
      const uintptr_t top_row = (uintptr_t) input + (size_t) y0 * input_width * input_pixel_stride;
      const uintptr_t bottom_row = (uintptr_t) input + (size_t) y1 * input_width * input_pixel_stride;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        uint32_t x0 = (uint32_t) (int32_t) input_x;
        assert((int32_t) x0 >= 0);

        float alpha_x = input_x - (float) x0;
        if (x0 == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels: shift the window left one pixel
          // and give the right pixel full weight.
          --x0;
          alpha_x = 1.0f;
        }

        const size_t left = (size_t) x0 * input_pixel_stride;
        indirection_buffer[0] = (void*) (top_row + left);
        indirection_buffer[1] = (void*) (bottom_row + left);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        w += 2;
      }
    }
  }
}
723
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  // Precomputes, for every output pixel, two input row pointers (top/bottom
  // neighbours at the left column) and the two interpolation weights
  // (horizontal alpha, vertical alpha) used by CHW bilinear resize kernels.
  // Dimensions are kept below 2**24 so coordinate math is exact in float.
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  // With align_corners the first/last pixels map exactly onto each other, so
  // scales are computed over (size - 1) spans; otherwise over the full size.
  const int32_t adjust_x = (int32_t) (align_corners && output_width != 1);
  const int32_t adjust_y = (int32_t) (align_corners && output_height != 1);
  const float scale_x =
    (float) ((int32_t) input_width - adjust_x) / (float) ((int32_t) output_width - adjust_x);
  const float scale_y =
    (float) ((int32_t) input_height - adjust_y) / (float) ((int32_t) output_height - adjust_y);

  const uint32_t last_y = (uint32_t) input_height - 1;
  const uint32_t last_x = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    // Legacy / align-corners mapping: input coordinate is output * scale.
    for (size_t oy = 0; oy < output_height; oy++) {
      const float fy = (float) (int32_t) oy * scale_y;
      assert(fy >= 0.0f);
      assert(fy < (float) input_height);

      const uint32_t y_top = (uint32_t) (int32_t) fy;
      const uint32_t y_bottom = (y_top + 1 < last_y) ? (y_top + 1) : last_y;
      const float alpha_y = fy - (float) y_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        const float fx = (float) (int32_t) ox * scale_x;
        assert(fx >= 0.0f);
        assert(fx < (float) input_width);

        uint32_t x_left = (uint32_t) (int32_t) fx;

        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Step one column back so the pixel to the right of the pointed-at
          // one stays in bounds, as required by some CHW kernels.
          x_left -= 1;
          alpha_x = 1.0f;
        }
        *indirection_buffer++ =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        *indirection_buffer++ =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        *packed_weights++ = alpha_x;
        *packed_weights++ = alpha_y;
      }
    }
  } else {
    // Half-pixel-centers mapping: shift by half a pixel in both spaces, then
    // clamp the coordinate into the valid input range.
    const float offset_y = 0.5f * scale_y - 0.5f;
    const float offset_x = 0.5f * scale_x - 0.5f;
    for (size_t oy = 0; oy < output_height; oy++) {
      float fy = (float) (int32_t) oy * scale_y + offset_y;
      if (fy < 0.0f) {
        fy = 0.0f;
      }
      if (fy > (float) last_y) {
        fy = (float) last_y;
      }
      const uint32_t y_top = (uint32_t) (int32_t) fy;
      assert((int32_t) y_top >= 0);
      const uint32_t y_bottom = (y_top + 1 < last_y) ? (y_top + 1) : last_y;
      const float alpha_y = fy - (float) y_top;
      for (size_t ox = 0; ox < output_width; ox++) {
        float fx = (float) (int32_t) ox * scale_x + offset_x;
        if (fx < 0.0f) {
          fx = 0.0f;
        }
        if (fx > (float) last_x) {
          fx = (float) last_x;
        }
        uint32_t x_left = (uint32_t) (int32_t) fx;
        assert((int32_t) x_left >= 0);

        float alpha_x = fx - (float) x_left;
        if (x_left == last_x) {
          // Step one column back so the pixel to the right of the pointed-at
          // one stays in bounds, as required by some CHW kernels.
          x_left -= 1;
          alpha_x = 1.0f;
        }

        *indirection_buffer++ =
          (void*) ((uintptr_t) input + (y_top * input_width + x_left) * input_pixel_stride);
        *indirection_buffer++ =
          (void*) ((uintptr_t) input + (y_bottom * input_width + x_left) * input_pixel_stride);
        *packed_weights++ = alpha_x;
        *packed_weights++ = alpha_y;
      }
    }
  }
}
823
824void xnn_indirection_init_unpool2d(
825 xnn_operator_t op,
826 size_t batch_start,
827 uint32_t log2_element_size)
828{
829 const void** indirection_buffer = op->indirection_buffer;
830 const void* output = op->output;
831 const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
832 const size_t batch_size = op->batch_size;
833 const size_t input_height = op->input_height;
834 const size_t input_width = op->input_width;
835 const size_t output_height = op->output_height;
836 const size_t output_width = op->output_width;
837 const size_t pooling_height = op->kernel_height;
838 const size_t pooling_width = op->kernel_width;
839 const size_t output_padding_top = op->padding_top;
840 const size_t output_padding_left = op->padding_left;
841
842 for (size_t image = batch_start; image < batch_size; image++) {
843 for (size_t input_y = 0; input_y < input_height; input_y++) {
844 for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
845 const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1);
846 for (size_t input_x = 0; input_x < input_width; input_x++) {
847 for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
848 const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1);
849 indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] =
850 (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
851 }
852 }
853 }
854 }
855 }
856}
857
void xnn_indirection_init_pavgpool2d_f16(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  uint16_t* pixelwise_buffer)
{
  // For each output pixel, store 1 / (number of valid input pixels covered
  // by its pooling window), converted to IEEE half precision. Windows that
  // overhang the padded border cover fewer pixels, hence per-pixel scales.
  for (size_t oy = 0; oy < output_height; oy++) {
    const size_t top = oy * stride_height;
    const size_t y_begin = top > padding_top ? top - padding_top : 0;
    size_t y_end = top + pooling_height > padding_top ? top + pooling_height - padding_top : 0;
    if (y_end > input_height) {
      y_end = input_height;
    }
    const uint32_t rows = (uint32_t) (y_end - y_begin);
    for (size_t ox = 0; ox < output_width; ox++) {
      const size_t left = ox * stride_width;
      const size_t x_begin = left > padding_left ? left - padding_left : 0;
      size_t x_end = left + pooling_width > padding_left ? left + pooling_width - padding_left : 0;
      if (x_end > input_width) {
        x_end = input_width;
      }
      const uint32_t cols = (uint32_t) (x_end - x_begin);
      *pixelwise_buffer++ = fp16_ieee_from_fp32_value(1.0f / ((float) (int32_t) (rows * cols)));
    }
  }
}
883
void xnn_indirection_init_pavgpool2d_f32(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  float* pixelwise_buffer)
{
  // For each output pixel, store 1 / (number of valid input pixels covered
  // by its pooling window). Windows that overhang the padded border cover
  // fewer pixels, hence the per-pixel scales.
  for (size_t oy = 0; oy < output_height; oy++) {
    const size_t top = oy * stride_height;
    const size_t y_begin = top > padding_top ? top - padding_top : 0;
    size_t y_end = top + pooling_height > padding_top ? top + pooling_height - padding_top : 0;
    if (y_end > input_height) {
      y_end = input_height;
    }
    const uint32_t rows = (uint32_t) (y_end - y_begin);
    for (size_t ox = 0; ox < output_width; ox++) {
      const size_t left = ox * stride_width;
      const size_t x_begin = left > padding_left ? left - padding_left : 0;
      size_t x_end = left + pooling_width > padding_left ? left + pooling_width - padding_left : 0;
      if (x_end > input_width) {
        x_end = input_width;
      }
      const uint32_t cols = (uint32_t) (x_end - x_begin);
      *pixelwise_buffer++ = 1.0f / ((float) (int32_t) (rows * cols));
    }
  }
}
909