1 | // Copyright 2022 Google LLC |
2 | // |
3 | // This source code is licensed under the BSD-style license found in the |
4 | // LICENSE file in the root directory of this source tree. |
5 | |
#include <assert.h>
#include <stdbool.h>  // For bool.
#include <stddef.h>   // For size_t.
#include <stdint.h>   // For uint32_t, SIZE_MAX.
#include <string.h>   // For memset.

#include <xnnpack.h>  // For xnn_caches_t, xnn_operator_t.
#include <xnnpack/cache.h>  // For xnn_caches.
#include <xnnpack/common.h>  // For XNN_ALLOCATION_ALIGNMENT.
#include <xnnpack/math.h>
#include <xnnpack/operator.h>  // For xnn_operator definition.
#include <xnnpack/operator-utils.h>
14 | |
15 | void* xnn_get_pointer_to_write_weights( |
16 | xnn_operator_t op, |
17 | size_t aligned_weights_size, |
18 | int padding_byte) |
19 | { |
20 | assert(aligned_weights_size % XNN_ALLOCATION_ALIGNMENT == 0); |
21 | void* weights_ptr = NULL; |
22 | if (use_weights_cache(op)) { |
23 | weights_ptr = xnn_reserve_space_in_weights_cache(op->weights_cache, aligned_weights_size); |
24 | if (weights_ptr == NULL) { |
25 | return NULL; |
26 | } |
27 | } else { |
28 | op->packed_weights.pointer = xnn_allocate_simd_memory(aligned_weights_size); |
29 | if (op->packed_weights.pointer == NULL) { |
30 | return NULL; |
31 | } |
32 | weights_ptr = op->packed_weights.pointer; |
33 | } |
34 | memset(weights_ptr, padding_byte, aligned_weights_size); |
35 | return weights_ptr; |
36 | } |
37 | |
// Computes the output size of a convolution along one dimension.
// A kernel of K taps with dilation D spans (K - 1) * D + 1 input elements;
// the remaining padded input is strided by subsampling_dimension.
size_t xnn_compute_convolution_output_dimension(
  size_t padded_input_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t subsampling_dimension)
{
  const size_t dilated_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  // Saturating subtraction: zero when the dilated kernel does not fit.
  size_t remainder = 0;
  if (padded_input_dimension >= dilated_kernel_dimension) {
    remainder = padded_input_dimension - dilated_kernel_dimension;
  }
  return remainder / subsampling_dimension + 1;
}
47 | |
// Computes the output size of a deconvolution (transposed convolution) along
// one dimension: stride * (input - 1) + adjustment + dilated kernel, reduced
// by the output padding (saturating at zero).
size_t xnn_compute_deconvolution_output_dimension(
  size_t input_dimension,
  size_t output_padding_dimension,
  size_t adjustment_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  const size_t dilated_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  const size_t unpadded_dimension =
    stride_dimension * (input_dimension - 1) + adjustment_dimension + dilated_kernel_dimension;
  return unpadded_dimension > output_padding_dimension ?
    unpadded_dimension - output_padding_dimension : 0;
}
61 | |
// Computes the output size of an unpooling operation along one dimension.
// Unpooling is the deconvolution special case with stride == kernel,
// dilation == 1, and no adjustment, so the deconvolution formula
// kernel * (input - 1) + kernel collapses to input * kernel, reduced by the
// input padding (saturating at zero).
size_t xnn_compute_unpooling_output_dimension(
  size_t input_dimension,
  size_t input_padding_dimension,
  size_t kernel_dimension)
{
  const size_t unpadded_dimension = input_dimension * kernel_dimension;
  return unpadded_dimension > input_padding_dimension ?
    unpadded_dimension - input_padding_dimension : 0;
}
71 | |
// Calculate how much work a microkernel does.
// A MxN microkernel does M+N (scalar) loads and M*N (scalar) FMAs.
// So, given batch_size, the microkernel does:
//   divide_round_up(batch_size, mr) * (mr + nr) loads, and
//   divide_round_up(batch_size, mr) * (mr * nr) FMAs.
// The total cost is then a linear combination of these 2 operations. From experimental data, use a multiplier of 3 for
// loads, to prefer higher tile sizes which have better computation intensity.
static size_t calculate_microkernel_cost(size_t batch_size, uint32_t mr, uint32_t nr)
{
  // Number of microkernel invocations needed to cover batch_size rows.
  const size_t tile_invocations = batch_size / mr + (batch_size % mr != 0 ? 1 : 0);
  // Per-invocation cost: 3x weight on loads plus the FMA count.
  const uint32_t cost_per_invocation = 3 * (mr + nr) + mr * nr;
  return tile_invocations * cost_per_invocation;
}
83 | |
84 | static bool mr_is_available_gemm(size_t mr, struct xnn_hmp_gemm_ukernel *gemm_cases, bool code_cache_available) |
85 | { |
86 | #if XNN_PLATFORM_JIT |
87 | if (code_cache_available) { |
88 | return gemm_cases[mr-1].generated_code_offset[XNN_UARCH_DEFAULT] != XNN_CACHE_NOT_FOUND || |
89 | gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
90 | } |
91 | #endif |
92 | return gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
93 | } |
94 | |
// Picks the best MR (row tile size) for a GEMM, given batch_size.
// If a microkernel whose MR exactly matches batch_size is available, it is
// used. Otherwise the available MR minimizing the load/FMA cost heuristic
// (see calculate_microkernel_cost) is chosen; ties go to the larger MR.
// Returns max_mr if no microkernel is available at all.
uint32_t xnn_get_heuristic_mr_gemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_gemm_ukernel *gemm_cases, bool code_cache_available)
{
  // mr_is_available_gemm indexes gemm_cases[mr - 1], so batch_size == 0 would
  // read out of bounds (gemm_cases[-1]); a zero max_mr would be equally bogus.
  assert(batch_size != 0);
  assert(max_mr != 0);
  if (batch_size <= max_mr && mr_is_available_gemm(batch_size, gemm_cases, code_cache_available)) {
    // We have a microkernel with MR that is the exact match with batch_size.
    // The cast cannot truncate: batch_size <= max_mr fits in uint32_t.
    return (uint32_t) batch_size;
  }

  // Try to find the best fitting mr.
  // - use a cost heuristic to calculate how much work is done by the microkernel (see calculate_microkernel_cost)
  // - smaller cost is better
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_gemm(mr, gemm_cases, code_cache_available)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    // <= (not <) so that on a cost tie the larger MR wins.
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}
120 | |
121 | static bool mr_is_available_igemm(size_t mr, struct xnn_hmp_igemm_ukernel *igemm_cases, bool code_cache_available) |
122 | { |
123 | #if XNN_PLATFORM_JIT |
124 | if (code_cache_available) { |
125 | return igemm_cases[mr-1].generated_code_offset[XNN_UARCH_DEFAULT] != XNN_CACHE_NOT_FOUND || |
126 | igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
127 | } |
128 | #endif |
129 | return igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
130 | } |
131 | |
// Picks the best MR (row tile size) for an IGEMM, given batch_size.
// If a microkernel whose MR exactly matches batch_size is available, it is
// used. Otherwise the available MR minimizing the load/FMA cost heuristic
// (see calculate_microkernel_cost) is chosen; ties go to the larger MR.
// Returns max_mr if no microkernel is available at all.
uint32_t xnn_get_heuristic_mr_igemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_igemm_ukernel *igemm_cases,
  bool code_cache_available)
{
  // mr_is_available_igemm indexes igemm_cases[mr - 1], so batch_size == 0
  // would read out of bounds (igemm_cases[-1]); a zero max_mr is equally bogus.
  assert(batch_size != 0);
  assert(max_mr != 0);
  if (batch_size <= max_mr && mr_is_available_igemm(batch_size, igemm_cases, code_cache_available)) {
    // We have a microkernel with MR that is the exact match with batch_size.
    // The cast cannot truncate: batch_size <= max_mr fits in uint32_t.
    return (uint32_t) batch_size;
  }

  // Try to find the best fitting mr.
  // - use a cost heuristic to calculate how much work is done by the microkernel (see calculate_microkernel_cost)
  // - smaller cost is better
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_igemm(mr, igemm_cases, code_cache_available)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    // <= (not <) so that on a cost tie the larger MR wins.
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}
158 | |