1 | // Copyright 2022 Google LLC |
2 | // |
3 | // This source code is licensed under the BSD-style license found in the |
4 | // LICENSE file in the root directory of this source tree. |
5 | |
#include <assert.h>
#include <stdbool.h>  // For bool.
#include <stddef.h>   // For size_t.
#include <stdint.h>   // For uint32_t, SIZE_MAX.
#include <string.h>   // For memset.

#include <xnnpack.h>  // For xnn_caches_t, xnn_operator_t.
#include <xnnpack/cache.h>  // For xnn_caches.
#include <xnnpack/common.h>  // For XNN_ALLOCATION_ALIGNMENT.
#include <xnnpack/math.h>
#include <xnnpack/operator.h>  // For xnn_operator definition.
#include <xnnpack/operator-utils.h>
14 | |
15 | void* xnn_get_pointer_to_write_weights( |
16 | xnn_operator_t op, |
17 | size_t aligned_weights_size, |
18 | int padding_byte) |
19 | { |
20 | assert(aligned_weights_size % XNN_ALLOCATION_ALIGNMENT == 0); |
21 | void* weights_ptr = NULL; |
22 | if (use_weights_cache(op)) { |
23 | weights_ptr = xnn_reserve_space_in_weights_cache(op->weights_cache, aligned_weights_size); |
24 | if (weights_ptr == NULL) { |
25 | return NULL; |
26 | } |
27 | } else { |
28 | op->packed_weights.pointer = xnn_allocate_simd_memory(aligned_weights_size); |
29 | if (op->packed_weights.pointer == NULL) { |
30 | return NULL; |
31 | } |
32 | weights_ptr = op->packed_weights.pointer; |
33 | } |
34 | memset(weights_ptr, padding_byte, aligned_weights_size); |
35 | return weights_ptr; |
36 | } |
37 | |
// Computes the output size of a convolution along one dimension.
// A kernel of K taps with dilation D spans (K - 1) * D + 1 input elements;
// the remaining padded input is strided by subsampling_dimension.
size_t xnn_compute_convolution_output_dimension(
  size_t padded_input_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t subsampling_dimension)
{
  const size_t dilated_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  // Saturating subtraction: zero when the dilated kernel does not fit.
  size_t remainder = 0;
  if (padded_input_dimension >= dilated_kernel_dimension) {
    remainder = padded_input_dimension - dilated_kernel_dimension;
  }
  return remainder / subsampling_dimension + 1;
}
47 | |
// Computes the output size of a deconvolution (transposed convolution) along
// one dimension: stride * (input - 1) + adjustment + dilated kernel, reduced
// by the output padding (saturating at zero).
size_t xnn_compute_deconvolution_output_dimension(
  size_t input_dimension,
  size_t output_padding_dimension,
  size_t adjustment_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  const size_t dilated_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  const size_t unpadded_dimension =
    stride_dimension * (input_dimension - 1) + adjustment_dimension + dilated_kernel_dimension;
  return unpadded_dimension > output_padding_dimension ?
    unpadded_dimension - output_padding_dimension : 0;
}
61 | |
// Computes the output size of an unpooling operation along one dimension.
// Unpooling is the deconvolution special case with stride == kernel,
// dilation == 1, and no adjustment, so the deconvolution formula
// kernel * (input - 1) + kernel collapses to input * kernel, reduced by the
// input padding (saturating at zero).
size_t xnn_compute_unpooling_output_dimension(
  size_t input_dimension,
  size_t input_padding_dimension,
  size_t kernel_dimension)
{
  const size_t unpadded_dimension = input_dimension * kernel_dimension;
  return unpadded_dimension > input_padding_dimension ?
    unpadded_dimension - input_padding_dimension : 0;
}
71 | |
// Calculate how much work a microkernel does.
// A MxN microkernel does M+N (scalar) loads and M*N (scalar) FMAs.
// So, given batch_size, the microkernel does:
//   divide_round_up(batch_size, mr) * (mr + nr) loads, and
//   divide_round_up(batch_size, mr) * (mr * nr) FMAs.
// The total cost is then a linear combination of these 2 operations. From experimental data, use a multiplier of 3 for
// loads, to prefer higher tile sizes which have better computation intensity.
static size_t calculate_microkernel_cost(size_t batch_size, uint32_t mr, uint32_t nr)
{
  // Number of microkernel invocations needed to cover batch_size rows.
  const size_t tile_invocations = batch_size / mr + (batch_size % mr != 0 ? 1 : 0);
  // Per-invocation cost: 3x weight on loads plus the FMA count.
  const uint32_t cost_per_invocation = 3 * (mr + nr) + mr * nr;
  return tile_invocations * cost_per_invocation;
}
83 | |
84 | static bool mr_is_available_gemm(size_t mr, struct xnn_hmp_gemm_ukernel *gemm_cases, bool code_cache_available) |
85 | { |
86 | #if XNN_PLATFORM_JIT |
87 | if (code_cache_available) { |
88 | return gemm_cases[mr-1].generated_code_offset[XNN_UARCH_DEFAULT] != XNN_CACHE_NOT_FOUND || |
89 | gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
90 | } |
91 | #endif |
92 | return gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
93 | } |
94 | |
// Picks the best MR (row tile size) for a GEMM, given batch_size.
// If a microkernel whose MR exactly matches batch_size is available, it is
// used. Otherwise the available MR minimizing the load/FMA cost heuristic
// (see calculate_microkernel_cost) is chosen; ties go to the larger MR.
// Returns max_mr if no microkernel is available at all.
uint32_t xnn_get_heuristic_mr_gemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_gemm_ukernel *gemm_cases, bool code_cache_available)
{
  // mr_is_available_gemm indexes gemm_cases[mr - 1], so batch_size == 0 would
  // read out of bounds (gemm_cases[-1]); a zero max_mr would be equally bogus.
  assert(batch_size != 0);
  assert(max_mr != 0);
  if (batch_size <= max_mr && mr_is_available_gemm(batch_size, gemm_cases, code_cache_available)) {
    // We have a microkernel with MR that is the exact match with batch_size.
    // The cast cannot truncate: batch_size <= max_mr fits in uint32_t.
    return (uint32_t) batch_size;
  }

  // Try to find the best fitting mr.
  // - use a cost heuristic to calculate how much work is done by the microkernel (see calculate_microkernel_cost)
  // - smaller cost is better
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_gemm(mr, gemm_cases, code_cache_available)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    // <= (not <) so that on a cost tie the larger MR wins.
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}
120 | |
121 | static bool mr_is_available_igemm(size_t mr, struct xnn_hmp_igemm_ukernel *igemm_cases, bool code_cache_available) |
122 | { |
123 | #if XNN_PLATFORM_JIT |
124 | if (code_cache_available) { |
125 | return igemm_cases[mr-1].generated_code_offset[XNN_UARCH_DEFAULT] != XNN_CACHE_NOT_FOUND || |
126 | igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
127 | } |
128 | #endif |
129 | return igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL; |
130 | } |
131 | |
// Picks the best MR (row tile size) for an IGEMM, given batch_size.
// If a microkernel whose MR exactly matches batch_size is available, it is
// used. Otherwise the available MR minimizing the load/FMA cost heuristic
// (see calculate_microkernel_cost) is chosen; ties go to the larger MR.
// Returns max_mr if no microkernel is available at all.
uint32_t xnn_get_heuristic_mr_igemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_igemm_ukernel *igemm_cases,
  bool code_cache_available)
{
  // mr_is_available_igemm indexes igemm_cases[mr - 1], so batch_size == 0
  // would read out of bounds (igemm_cases[-1]); a zero max_mr is equally bogus.
  assert(batch_size != 0);
  assert(max_mr != 0);
  if (batch_size <= max_mr && mr_is_available_igemm(batch_size, igemm_cases, code_cache_available)) {
    // We have a microkernel with MR that is the exact match with batch_size.
    // The cast cannot truncate: batch_size <= max_mr fits in uint32_t.
    return (uint32_t) batch_size;
  }

  // Try to find the best fitting mr.
  // - use a cost heuristic to calculate how much work is done by the microkernel (see calculate_microkernel_cost)
  // - smaller cost is better
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_igemm(mr, igemm_cases, code_cache_available)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    // <= (not <) so that on a cost tie the larger MR wins.
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}
158 | |