// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>  // For assert.
#include <stddef.h>  // For size_t.
#include <stdint.h>  // For uint32_t, SIZE_MAX.
#include <string.h>  // For memset.

#include <xnnpack.h>                   // For xnn_caches_t, xnn_operator_t.
#include <xnnpack/allocator.h>         // For XNN_ALLOCATION_ALIGNMENT, xnn_allocate_simd_memory.
#include <xnnpack/cache.h>             // For xnn_caches, xnn_reserve_space_in_weights_cache.
#include <xnnpack/math.h>              // For doz, divide_round_up.
#include <xnnpack/operator.h>          // For xnn_operator definition.
#include <xnnpack/operator-utils.h>    // For use_weights_cache.

// Returns a pointer to a buffer of aligned_weights_size bytes, pre-filled
// with padding_byte, into which packed weights can be written: space reserved
// in the operator's weights cache when one is in use, or freshly allocated
// SIMD-aligned memory otherwise. Returns NULL on allocation failure.
void* xnn_get_pointer_to_write_weights(
  xnn_operator_t op,
  size_t aligned_weights_size,
  int padding_byte)
{
  assert(aligned_weights_size % XNN_ALLOCATION_ALIGNMENT == 0);
  void* weights_ptr = NULL;
  if (use_weights_cache(op)) {
    weights_ptr = xnn_reserve_space_in_weights_cache(op->weights_cache, aligned_weights_size);
    if (weights_ptr == NULL) {
      return NULL;
    }
  } else {
    op->packed_weights.pointer = xnn_allocate_simd_memory(aligned_weights_size);
    if (op->packed_weights.pointer == NULL) {
      return NULL;
    }
    weights_ptr = op->packed_weights.pointer;
  }
  memset(weights_ptr, padding_byte, aligned_weights_size);
  return weights_ptr;
}

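// Illustrative sketch of a caller during operator creation (not part of this
// file; weights_size and the packing step are hypothetical):
//
//   const size_t packed_size = round_up_po2(weights_size, XNN_ALLOCATION_ALIGNMENT);
//   void* packed_weights = xnn_get_pointer_to_write_weights(op, packed_size, /*padding_byte=*/0);
//   if (packed_weights == NULL) {
//     return xnn_status_out_of_memory;
//   }
//   // ... pack the operator's weights into packed_weights ...
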
size_t xnn_compute_convolution_output_dimension(
  size_t padded_input_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t subsampling_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  // doz(a, b) is "difference or zero", i.e. max(a - b, 0).
  return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
}

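// Example: a padded input of 10 with a 3-tap kernel at dilation 2 has an
// effective kernel of (3 - 1) * 2 + 1 = 5; with subsampling (stride) 2,
// the output dimension is doz(10, 5) / 2 + 1 = 5 / 2 + 1 = 3.
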
size_t xnn_compute_deconvolution_output_dimension(
  size_t input_dimension,
  size_t output_padding_dimension,
  size_t adjustment_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}

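// Example: an input of 4 upsampled with stride 2 by a 3-tap kernel at
// dilation 1 (effective kernel 3), with no adjustment and no output padding,
// yields 2 * (4 - 1) + 0 + 3 - 0 = 9.
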
size_t xnn_compute_unpooling_output_dimension(
  size_t input_dimension,
  size_t input_padding_dimension,
  size_t kernel_dimension)
{
  return xnn_compute_deconvolution_output_dimension(
    input_dimension, input_padding_dimension, /*adjustment_dimension=*/0,
    kernel_dimension, /*dilation_dimension=*/1, /*stride_dimension=*/kernel_dimension);
}

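// Example: unpooling an input of 8 with a 2x2 kernel and no input padding is
// a deconvolution with kernel 2 and stride 2, giving 2 * (8 - 1) + 0 + 2 - 0 = 16:
// each input element expands to kernel_dimension outputs.
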
// Estimate how much work a microkernel does.
// An MxN microkernel performs M+N (scalar) loads and M*N (scalar) FMAs per tile of output.
// So, given batch_size, the microkernel performs:
//   divide_round_up(batch_size, mr) * (mr + nr) loads, and
//   divide_round_up(batch_size, mr) * (mr * nr) FMAs.
// The total cost is a linear combination of these two operations. Based on experimental data, loads are weighted by a
// factor of 3 to favor larger tile sizes, which have better computational intensity.
static size_t calculate_microkernel_cost(size_t batch_size, uint32_t mr, uint32_t nr)
{
  return divide_round_up(batch_size, mr) * (3 * (mr + nr) + mr * nr);
}

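// Worked example: for batch_size = 5 and nr = 8, the candidate costs are:
//   mr = 1: ceil(5/1) * (3 * (1 + 8) + 1 * 8) = 5 * 35 = 175
//   mr = 2: ceil(5/2) * (3 * (2 + 8) + 2 * 8) = 3 * 46 = 138
//   mr = 3: ceil(5/3) * (3 * (3 + 8) + 3 * 8) = 2 * 57 = 114
//   mr = 4: ceil(5/4) * (3 * (4 + 8) + 4 * 8) = 2 * 68 = 136
// so mr = 3 wins: it covers the batch in two passes with the least padding.
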
uint32_t xnn_get_heuristic_mr_gemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_gemm_ukernel *gemm_cases)
{
  assert(gemm_cases[max_mr-1].function[XNN_UARCH_DEFAULT] != NULL);
  if (batch_size <= max_mr && gemm_cases[batch_size-1].function[XNN_UARCH_DEFAULT] != NULL) {
    // We have a microkernel whose MR exactly matches batch_size.
    return batch_size;
  }

  // Try to find the best-fitting mr:
  // - use a cost heuristic to estimate how much work the microkernel does (see calculate_microkernel_cost)
  // - smaller cost is better
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] == NULL) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    // On ties, prefer the larger mr.
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  assert(gemm_cases[best_mr-1].function[XNN_UARCH_DEFAULT] != NULL);
  return best_mr;
}

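// Illustrative sketch of a caller (hypothetical `gemm_config` and shapes, not
// part of this file):
//
//   const uint32_t mr = xnn_get_heuristic_mr_gemm(
//     /*batch_size=*/5, gemm_config->mr, gemm_config->nr, gemm_config->minmax.gemm);
//
// With the costs worked out above (nr = 8), this selects mr = 3, provided
// gemm_cases[2].function[XNN_UARCH_DEFAULT] is non-NULL.
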
uint32_t xnn_get_heuristic_mr_igemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_igemm_ukernel *igemm_cases)
{
  assert(igemm_cases[max_mr-1].function[XNN_UARCH_DEFAULT] != NULL);
  if (batch_size <= max_mr && igemm_cases[batch_size-1].function[XNN_UARCH_DEFAULT] != NULL) {
    // We have a microkernel whose MR exactly matches batch_size.
    return batch_size;
  }

  // Try to find the best-fitting mr:
  // - use a cost heuristic to estimate how much work the microkernel does (see calculate_microkernel_cost)
  // - smaller cost is better
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] == NULL) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    // On ties, prefer the larger mr.
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  assert(igemm_cases[best_mr-1].function[XNN_UARCH_DEFAULT] != NULL);
  return best_mr;
}
