// Copyright (c) Facebook, Inc. and its affiliates.
// All rights reserved.
//
// Copyright 2019 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <math.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#include <fp16.h>

#include <fxdiv.h>

#include <xnnpack/indirection.h>
#include <xnnpack/operator.h>
#include <xnnpack/math.h>

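// Populates the indirection buffer for a 2D convolution: for every output pixel, processed in
// tiles of output_tile_size, and every kernel element, stores a pointer to the corresponding
// input pixel, or to the operator's zero buffer when the element falls into the padding region.
// Entry layout: output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset.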
void xnn_indirection_init_conv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const void* zero = op->zero_buffer;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);

  for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
    for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
      const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_x = output_y_x.remainder;
      const size_t output_y = output_y_x.quotient;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
        if (input_y < input_height) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
            const size_t kernel_index = kernel_y * kernel_width + kernel_x;
            const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
            if (input_x < input_width) {
              indirection_buffer[index] = (const void*)
                ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
            } else {
              indirection_buffer[index] = zero;
            }
          }
        } else {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t kernel_index = kernel_y * kernel_width + kernel_x;
            const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

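// Populates the indirection buffer for a 2D deconvolution (transposed convolution): each output
// position is mapped back to an input position by dividing by the stride; entries whose division
// is inexact, or whose input coordinates fall outside the input, point to the zero buffer.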
void xnn_indirection_init_deconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t output_size = output_height * output_width;
  const size_t tiled_output_size = round_up(output_size, output_tile_size);
  const size_t kernel_size = kernel_height * kernel_width;

  const struct fxdiv_divisor_size_t output_width_divisor = fxdiv_init_size_t(output_width);
  const struct fxdiv_divisor_size_t stride_height_divisor = fxdiv_init_size_t(stride_height);
  const struct fxdiv_divisor_size_t stride_width_divisor = fxdiv_init_size_t(stride_width);

  for (size_t output_tile_start = 0; output_tile_start < tiled_output_size; output_tile_start += output_tile_size) {
    for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
      const size_t output_index = min(output_tile_start + output_tile_offset, output_size - 1);
      const struct fxdiv_result_size_t output_y_x = fxdiv_divide_size_t(output_index, output_width_divisor);
      const size_t output_x = output_y_x.remainder;
      const size_t output_y = output_y_x.quotient;
      for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
        const size_t y = output_y + padding_top - kernel_y * dilation_height;
        const size_t input_y = fxdiv_quotient_size_t(y, stride_height_divisor);
        for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
          const size_t x = output_x + padding_left - kernel_x * dilation_width;
          const size_t input_x = fxdiv_quotient_size_t(x, stride_width_divisor);
          const size_t kernel_index = kernel_y * kernel_width + kernel_x;
          const size_t index = output_tile_start * kernel_size + kernel_index * output_tile_size + output_tile_offset;
          if (input_y * stride_height == y && input_y < input_height && input_x * stride_width == x && input_x < input_width) {
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          } else {
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }
}

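// Populates the indirection buffer for a 2D deconvolution decomposed into subconvolutions: the
// kernel is split into stride_height x stride_width subkernels, each applied to its own slice of
// the output grid. Also fills indirection_buffer and indirection_y_stride in every
// subconvolution_params entry; indirection_x_stride is expected to be initialized before this call.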
void xnn_indirection_init_subconv2d(
  xnn_operator_t op,
  size_t output_tile_size,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  struct subconvolution_params* subconvolution_params = op->subconvolution_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t padding_top = op->padding_top;
  const size_t padding_left = op->padding_left;

  const size_t modulo_padding_top = padding_top % stride_height;
  const size_t modulo_padding_left = padding_left % stride_width;
  for (size_t offset_y = 0; offset_y < stride_height; offset_y++) {
    const size_t output_y_start = subtract_modulo(offset_y, modulo_padding_top, stride_height);
    for (size_t offset_x = 0; offset_x < stride_width; offset_x++) {
      const size_t output_x_start = subtract_modulo(offset_x, modulo_padding_left, stride_width);
      const size_t sliced_output_width = divide_round_up(output_width - output_x_start, stride_width);

      subconvolution_params->indirection_buffer = indirection_buffer;
      subconvolution_params->indirection_y_stride =
        subconvolution_params->indirection_x_stride * round_up(sliced_output_width, output_tile_size);
      ++subconvolution_params;

      for (size_t output_y = output_y_start; output_y < output_height; output_y += stride_height) {
        for (size_t output_tile_start = 0; output_tile_start < sliced_output_width; output_tile_start += output_tile_size) {
          for (size_t kernel_y = offset_y; kernel_y < kernel_height; kernel_y += stride_height) {
            assert(doz(output_y + padding_top, kernel_y) % stride_height == 0);
            const size_t y = output_y + padding_top - kernel_y;
            const size_t input_y = y / stride_height;

            for (size_t kernel_x = offset_x; kernel_x < kernel_width; kernel_x += stride_width) {
              for (size_t output_tile_offset = 0; output_tile_offset < output_tile_size; output_tile_offset++) {
                const size_t sliced_output_x = min(output_tile_start + output_tile_offset, sliced_output_width - 1);
                const size_t output_x = output_x_start + sliced_output_x * stride_width;

                assert(doz(output_x + padding_left, kernel_x) % stride_width == 0);
                const size_t x = output_x + padding_left - kernel_x;
                const size_t input_x = x / stride_width;

                if (input_y < input_height && input_x < input_width) {
                  *indirection_buffer++ =
                    (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
                } else {
                  *indirection_buffer++ = zero;
                }
              }
            }
          }
        }
      }
    }
  }
}

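// Populates the indirection buffer for a 2D depthwise convolution. Entry layout:
// output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y.
// When primary_tile exceeds the kernel size, the entries past the last regular pointer are filled
// with a copy of it, presumably so that micro-kernels with an oversized primary tile only read
// valid addresses.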
void xnn_indirection_init_dwconv2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  size_t primary_tile,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const void* zero = op->zero_buffer;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t kernel_height = op->kernel_height;
  const size_t kernel_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  for (size_t output_y = 0; output_y < output_height; output_y++) {
    for (size_t kernel_y = 0; kernel_y < kernel_height; kernel_y++) {
      const size_t input_y = output_y * stride_height + kernel_y * dilation_height - input_padding_top;
      if (input_y < input_height) {
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t input_x = output_x * stride_width + kernel_x * dilation_width - input_padding_left;
            const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
            if (input_x < input_width) {
              indirection_buffer[index] =
                (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
            } else {
              indirection_buffer[index] = zero;
            }
          }
        }
      } else {
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t kernel_x = 0; kernel_x < kernel_width; kernel_x++) {
            const size_t index = output_y * step_height + output_x * step_width * kernel_height + kernel_x * kernel_height + kernel_y;
            indirection_buffer[index] = zero;
          }
        }
      }
    }
  }

  const void* last_output_pixel = indirection_buffer[output_height * step_height - 1];
  const size_t last_kernel_index = output_height * step_height - (kernel_height * kernel_width);
  for (size_t tile_index = kernel_height * kernel_width; tile_index < primary_tile; tile_index++) {
    indirection_buffer[last_kernel_index + tile_index] = last_output_pixel;
  }
}

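// Populates the indirection buffer for 2D max pooling. Without dilation, out-of-bounds pooling
// elements are clamped to the nearest input pixel on the border; with dilation, border clamping is
// not applicable, so out-of-bounds elements are redirected to a safe in-bounds pixel instead.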
void xnn_indirection_init_maxpool2d(
  xnn_operator_t op,
  size_t step_height,
  size_t step_width,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* input = op->input;
  const size_t input_pixel_stride = op->input_pixel_stride << log2_element_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t stride_height = op->stride_height;
  const size_t stride_width = op->stride_width;
  const size_t dilation_height = op->dilation_height;
  const size_t dilation_width = op->dilation_width;
  const size_t input_padding_top = op->padding_top;
  const size_t input_padding_left = op->padding_left;

  const bool any_dilation = (dilation_height | dilation_width) > 1;

  if (any_dilation) {
    // Clamping to the input border does not work for pooling with dilation.
    const size_t adjusted_padding_top = input_padding_top % dilation_height;
    const size_t adjusted_padding_left = input_padding_left % dilation_width;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        size_t safe_input_y = output_y * stride_height;
        if XNN_UNPREDICTABLE(safe_input_y < adjusted_padding_top) {
          safe_input_y += dilation_height;
        }
        safe_input_y -= adjusted_padding_top;

        size_t input_y = output_y * stride_height + pooling_y * dilation_height - input_padding_top;
        if XNN_UNPREDICTABLE(input_y >= input_height) {
          input_y = safe_input_y;
        }

        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            size_t safe_input_x = output_x * stride_width;
            if XNN_UNPREDICTABLE(safe_input_x < adjusted_padding_left) {
              safe_input_x += dilation_width;
            }
            safe_input_x -= adjusted_padding_left;

            size_t input_x = output_x * stride_width + pooling_x * dilation_width - input_padding_left;
            if XNN_UNPREDICTABLE(input_x >= input_width) {
              input_x = safe_input_x;
            }

            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  } else {
    const size_t input_x_max = input_width - 1;
    const size_t input_y_max = input_height - 1;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t input_y = min(doz(output_y * stride_height + pooling_y * dilation_height, input_padding_top), input_y_max);
        for (size_t output_x = 0; output_x < output_width; output_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t input_x = min(doz(output_x * stride_width + pooling_x * dilation_width, input_padding_left), input_x_max);
            const size_t index = output_y * step_height + output_x * step_width * pooling_height + pooling_x * pooling_height + pooling_y;
            indirection_buffer[index] = (const void*) ((uintptr_t) input + (input_y * input_width + input_x) * input_pixel_stride);
          }
        }
      }
    }
  }
}

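// Populates the indirection buffer and packed weights for bilinear 2D resize in HWC layout with
// half-precision weights: for each output pixel, writes 4 pointers (top-left, top-right,
// bottom-left, bottom-right input pixels) to the indirection buffer and the FP16 interpolation
// weights (alpha_x, alpha_y) to packed_weights. Coordinates follow either the
// align-corners/TensorFlow-legacy mapping or the half-pixel-centers mapping.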
void xnn_indirection_init_resize_bilinear2d_hwc_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 4;
        w += 2;
      }
    }
  }
}

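// Same as the F16 variant above, but the interpolation weights are stored as single-precision floats.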
void xnn_indirection_init_resize_bilinear2d_hwc_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}

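// Same as the F16/F32 variants above, but the interpolation weights are quantized to Q11 fixed
// point: alpha * 2^11, rounded to the nearest int16_t.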
void xnn_indirection_init_resize_bilinear2d_hwc_q11(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  int16_t* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height != 0);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width != 0);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        const uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);
        const uint32_t input_x_right = math_min_u32(input_x_left + 1, input_x_max);
        const float alpha_x = input_x - (float) input_x_left;
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_right) * input_pixel_stride);
        indirection_buffer[2] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[3] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_right) * input_pixel_stride);
        packed_weights[0] = (int16_t) lrintf(alpha_x * 0x1.0p+11f);
        packed_weights[1] = (int16_t) lrintf(alpha_y * 0x1.0p+11f);
        indirection_buffer += 4;
        packed_weights += 2;
      }
    }
  }
}

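// Populates the indirection buffer and packed weights for bilinear 2D resize in CHW layout with
// half-precision weights: only the top-left and bottom-left pointers are stored per output pixel,
// since CHW kernels read the right neighbors from the contiguous row. When the left pixel is the
// last column, the pointer is shifted one pixel left and alpha_x is set to 1.0f so that a valid
// right neighbor exists.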
void xnn_indirection_init_resize_bilinear2d_chw_f16(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  void* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  uint16_t* w = (uint16_t*) packed_weights;
  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        uint32_t input_x_left = (uint32_t) (int32_t) input_x;

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        w += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }

        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        w[0] = fp16_ieee_from_fp32_value(alpha_x);
        w[1] = fp16_ieee_from_fp32_value(alpha_y);
        indirection_buffer += 2;
        w += 2;
      }
    }
  }
}

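// Same as the CHW F16 variant above, but the interpolation weights are stored as single-precision floats.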
void xnn_indirection_init_resize_bilinear2d_chw_f32(
  size_t input_pixel_stride,
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  const void* input,
  const void** indirection_buffer,
  float* packed_weights,
  bool align_corners,
  bool tensorflow_legacy)
{
  assert(input_height > 1);
  assert(input_height < 16777216 /* 2**24 */);
  assert(input_width > 1);
  assert(input_width < 16777216 /* 2**24 */);
  assert(output_height != 0);
  assert(output_height < 16777216 /* 2**24 */);
  assert(output_width != 0);
  assert(output_width < 16777216 /* 2**24 */);

  const int32_t width_adjustment = (int32_t) (align_corners && output_width != 1);
  const int32_t height_adjustment = (int32_t) (align_corners && output_height != 1);
  const float width_scale =
    (float) ((int32_t) input_width - width_adjustment) / (float) ((int32_t) output_width - width_adjustment);
  const float height_scale =
    (float) ((int32_t) input_height - height_adjustment) / (float) ((int32_t) output_height - height_adjustment);

  const uint32_t input_y_max = (uint32_t) input_height - 1;
  const uint32_t input_x_max = (uint32_t) input_width - 1;
  if (tensorflow_legacy || align_corners) {
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      const float input_y = (float) (int32_t) output_y * height_scale;
      assert(input_y >= 0.0f);
      assert(input_y < (float) input_height);

      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        const float input_x = (float) (int32_t) output_x * width_scale;
        assert(input_x >= 0.0f);
        assert(input_x < (float) input_width);

        uint32_t input_x_left = (uint32_t) (int32_t) input_x;

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }
        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  } else {
    const float height_offset = 0.5f * height_scale - 0.5f;
    const float width_offset = 0.5f * width_scale - 0.5f;
    for (size_t output_y = 0; output_y < output_height; output_y++) {
      float input_y = (float) (int32_t) output_y * height_scale + height_offset;
      input_y = math_min_f32(math_max_f32(input_y, 0.0f), (float) input_y_max);
      const uint32_t input_y_top = (uint32_t) (int32_t) input_y;
      assert((int32_t) input_y_top >= 0);
      const uint32_t input_y_bottom = math_min_u32(input_y_top + 1, input_y_max);
      const float alpha_y = input_y - (float) input_y_top;
      for (size_t output_x = 0; output_x < output_width; output_x++) {
        float input_x = (float) (int32_t) output_x * width_scale + width_offset;
        input_x = math_min_f32(math_max_f32(input_x, 0.0f), (float) input_x_max);
        uint32_t input_x_left = (uint32_t) (int32_t) input_x;
        assert((int32_t) input_x_left >= 0);

        float alpha_x = input_x - (float) input_x_left;
        if (input_x_left == input_x_max) {
          // Ensure that there is a pixel to the right of the one pointed at,
          // as required by some CHW kernels.
          --input_x_left;
          alpha_x = 1.0f;
        }

        indirection_buffer[0] =
          (void*) ((uintptr_t) input + (input_y_top * input_width + input_x_left) * input_pixel_stride);
        indirection_buffer[1] =
          (void*) ((uintptr_t) input + (input_y_bottom * input_width + input_x_left) * input_pixel_stride);
        packed_weights[0] = alpha_x;
        packed_weights[1] = alpha_y;
        indirection_buffer += 2;
        packed_weights += 2;
      }
    }
  }
}

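// Populates the indirection buffer for 2D max unpooling: for every input pixel and pooling
// element, stores a pointer to the output pixel it scatters into, clamped to the output bounds.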
void xnn_indirection_init_unpool2d(
  xnn_operator_t op,
  size_t batch_start,
  uint32_t log2_element_size)
{
  const void** indirection_buffer = op->indirection_buffer;
  const void* output = op->output;
  const size_t output_pixel_stride = op->output_pixel_stride << log2_element_size;
  const size_t batch_size = op->batch_size;
  const size_t input_height = op->input_height;
  const size_t input_width = op->input_width;
  const size_t output_height = op->output_height;
  const size_t output_width = op->output_width;
  const size_t pooling_height = op->kernel_height;
  const size_t pooling_width = op->kernel_width;
  const size_t output_padding_top = op->padding_top;
  const size_t output_padding_left = op->padding_left;

  for (size_t image = batch_start; image < batch_size; image++) {
    for (size_t input_y = 0; input_y < input_height; input_y++) {
      for (size_t pooling_y = 0; pooling_y < pooling_height; pooling_y++) {
        const size_t output_y = min(doz(input_y * pooling_height + pooling_y, output_padding_top), output_height - 1);
        for (size_t input_x = 0; input_x < input_width; input_x++) {
          for (size_t pooling_x = 0; pooling_x < pooling_width; pooling_x++) {
            const size_t output_x = min(doz(input_x * pooling_width + pooling_x, output_padding_left), output_width - 1);
            indirection_buffer[(((image * input_height + input_y) * input_width + input_x) * pooling_width + pooling_x) * pooling_height + pooling_y] =
              (const void*) ((uintptr_t) output + ((image * output_height + output_y) * output_width + output_x) * output_pixel_stride);
          }
        }
      }
    }
  }
}

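// Fills the pixelwise buffer for padded average pooling: for each output pixel, stores the
// reciprocal of the number of valid (non-padding) input pixels in its pooling window, as FP16.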
void xnn_indirection_init_pavgpool2d_f16(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  uint16_t* pixelwise_buffer)
{
  for (size_t output_y = 0; output_y < output_height; output_y++) {
    const size_t input_y_start = doz(output_y * stride_height, padding_top);
    const size_t input_y_end = min(doz(output_y * stride_height + pooling_height, padding_top), input_height);
    const uint32_t input_y_range = (uint32_t) (input_y_end - input_y_start);
    for (size_t output_x = 0; output_x < output_width; output_x++) {
      const size_t input_x_start = doz(output_x * stride_width, padding_left);
      const size_t input_x_end = min(doz(output_x * stride_width + pooling_width, padding_left), input_width);
      const uint32_t input_x_range = (uint32_t) (input_x_end - input_x_start);
      *pixelwise_buffer++ = fp16_ieee_from_fp32_value(1.0f / ((float) (int32_t) (input_y_range * input_x_range)));
    }
  }
}

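// Same as the F16 variant above, but the reciprocals are stored as single-precision floats.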
void xnn_indirection_init_pavgpool2d_f32(
  size_t input_height,
  size_t input_width,
  size_t output_height,
  size_t output_width,
  size_t pooling_height,
  size_t pooling_width,
  size_t stride_height,
  size_t stride_width,
  size_t padding_top,
  size_t padding_left,
  float* pixelwise_buffer)
{
  for (size_t output_y = 0; output_y < output_height; output_y++) {
    const size_t input_y_start = doz(output_y * stride_height, padding_top);
    const size_t input_y_end = min(doz(output_y * stride_height + pooling_height, padding_top), input_height);
    const uint32_t input_y_range = (uint32_t) (input_y_end - input_y_start);
    for (size_t output_x = 0; output_x < output_width; output_x++) {
      const size_t input_x_start = doz(output_x * stride_width, padding_left);
      const size_t input_x_end = min(doz(output_x * stride_width + pooling_width, padding_left), input_width);
      const uint32_t input_x_range = (uint32_t) (input_x_end - input_x_start);
      *pixelwise_buffer++ = 1.0f / ((float) (int32_t) (input_y_range * input_x_range));
    }
  }
}