// Copyright 2022 Google LLC
//
// This source code is licensed under the BSD-style license found in the
// LICENSE file in the root directory of this source tree.

#include <assert.h>
#include <stdbool.h>  // For bool.
#include <stddef.h>   // For size_t, NULL.
#include <stdint.h>   // For uint32_t, SIZE_MAX.
#include <string.h>   // For memset.

#include <xnnpack.h>                 // For xnn_caches_t, xnn_operator_t.
#include <xnnpack/allocator.h>       // For XNN_ALLOCATION_ALIGNMENT, xnn_allocate_simd_memory.
#include <xnnpack/cache.h>           // For xnn_caches.
#include <xnnpack/math.h>
#include <xnnpack/operator.h>        // For xnn_operator definition.
#include <xnnpack/operator-utils.h>

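// Returns a buffer of aligned_weights_size bytes for packed weights: reserved from the operator's weights cache when
// one is in use, otherwise freshly allocated as SIMD-aligned memory and recorded in op->packed_weights. The buffer is
// pre-filled with padding_byte so that any bytes the packing code does not overwrite have a deterministic value.
// Returns NULL on allocation or reservation failure.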
void* xnn_get_pointer_to_write_weights(
  xnn_operator_t op,
  size_t aligned_weights_size,
  int padding_byte)
{
  assert(aligned_weights_size % XNN_ALLOCATION_ALIGNMENT == 0);
  void* weights_ptr = NULL;
  if (use_weights_cache(op)) {
    weights_ptr = xnn_reserve_space_in_weights_cache(op->weights_cache, aligned_weights_size);
    if (weights_ptr == NULL) {
      return NULL;
    }
  } else {
    op->packed_weights.pointer = xnn_allocate_simd_memory(aligned_weights_size);
    if (op->packed_weights.pointer == NULL) {
      return NULL;
    }
    weights_ptr = op->packed_weights.pointer;
  }
  memset(weights_ptr, padding_byte, aligned_weights_size);
  return weights_ptr;
}

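// Output size of a convolution along one dimension:
//   effective_kernel = (kernel - 1) * dilation + 1
//   output = (padded_input - effective_kernel) / subsampling + 1
// doz() (difference-or-zero) clamps the subtraction at zero, so an input smaller than the effective kernel still
// yields an output dimension of 1.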
size_t xnn_compute_convolution_output_dimension(
  size_t padded_input_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t subsampling_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(padded_input_dimension, effective_kernel_dimension) / subsampling_dimension + 1;
}

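// Output size of a deconvolution (transposed convolution) along one dimension:
//   effective_kernel = (kernel - 1) * dilation + 1
//   output = stride * (input - 1) + adjustment + effective_kernel - output_padding
// clamped at zero via doz().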
size_t xnn_compute_deconvolution_output_dimension(
  size_t input_dimension,
  size_t output_padding_dimension,
  size_t adjustment_dimension,
  size_t kernel_dimension,
  size_t dilation_dimension,
  size_t stride_dimension)
{
  const size_t effective_kernel_dimension = (kernel_dimension - 1) * dilation_dimension + 1;
  return doz(
    stride_dimension * (input_dimension - 1) + adjustment_dimension + effective_kernel_dimension,
    output_padding_dimension);
}

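// Output size of an unpooling operator along one dimension, expressed as a deconvolution with stride equal to the
// kernel size, no dilation, and no adjustment:
//   output = kernel * (input - 1) + kernel - input_padding = kernel * input - input_padding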
size_t xnn_compute_unpooling_output_dimension(
  size_t input_dimension,
  size_t input_padding_dimension,
  size_t kernel_dimension)
{
  return xnn_compute_deconvolution_output_dimension(
    input_dimension, input_padding_dimension, /*adjustment_dimension=*/0,
    kernel_dimension, /*dilation_dimension=*/1, /*stride_dimension=*/kernel_dimension);
}

// Estimate how much work a microkernel does.
// An MxN microkernel does M+N (scalar) loads and M*N (scalar) FMAs per step of the reduction dimension; the reduction
// dimension is the same for every tile size, so it is dropped from the estimate. Thus, given batch_size, the
// microkernel does:
//   divide_round_up(batch_size, mr) * (mr + nr) loads, and
//   divide_round_up(batch_size, mr) * (mr * nr) FMAs.
// The total cost is a linear combination of these two operations. Based on experimental data, loads are weighted by a
// multiplier of 3, which prefers larger tile sizes with their better computational intensity.
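// For example, with batch_size = 12 and nr = 8:
//   mr = 4: divide_round_up(12, 4) = 3 tiles, cost = 3 * (3 * (4 + 8) + 4 * 8) = 3 * 68 = 204
//   mr = 6: divide_round_up(12, 6) = 2 tiles, cost = 2 * (3 * (6 + 8) + 6 * 8) = 2 * 90 = 180
// so the mr = 6 microkernel is preferred when it is available.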
static size_t calculate_microkernel_cost(size_t batch_size, uint32_t mr, uint32_t nr)
{
  return divide_round_up(batch_size, mr) * (3 * (mr + nr) + mr * nr);
}

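// Returns true if a GEMM microkernel with the given MR is usable: either a function pointer is installed for the
// default microarchitecture, or (when JIT is enabled and a code cache is available) generated code has been recorded
// for it.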
static bool mr_is_available_gemm(size_t mr, struct xnn_hmp_gemm_ukernel *gemm_cases, bool code_cache_available)
{
  #if XNN_PLATFORM_JIT
    if (code_cache_available) {
      return gemm_cases[mr-1].generated_code_offset[XNN_UARCH_DEFAULT] != XNN_CACHE_NOT_FOUND ||
        gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL;
    }
  #endif
  return gemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL;
}

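// Chooses the MR for a GEMM microkernel: if batch_size itself is a valid and available MR, use it directly;
// otherwise pick the available MR in [1, max_mr] with the lowest estimated cost (see calculate_microkernel_cost).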
uint32_t xnn_get_heuristic_mr_gemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_gemm_ukernel *gemm_cases, bool code_cache_available)
{
  if (batch_size <= max_mr && mr_is_available_gemm(batch_size, gemm_cases, code_cache_available)) {
    // We have a microkernel whose MR exactly matches batch_size.
    return batch_size;
  }

  // Otherwise, find the best-fitting mr:
  // - use a cost heuristic to estimate how much work the microkernel does (see calculate_microkernel_cost),
  // - a smaller cost is better.
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_gemm(mr, gemm_cases, code_cache_available)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}

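// IGEMM counterpart of mr_is_available_gemm.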
static bool mr_is_available_igemm(size_t mr, struct xnn_hmp_igemm_ukernel *igemm_cases, bool code_cache_available)
{
  #if XNN_PLATFORM_JIT
    if (code_cache_available) {
      return igemm_cases[mr-1].generated_code_offset[XNN_UARCH_DEFAULT] != XNN_CACHE_NOT_FOUND ||
        igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL;
    }
  #endif
  return igemm_cases[mr-1].function[XNN_UARCH_DEFAULT] != NULL;
}

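// IGEMM counterpart of xnn_get_heuristic_mr_gemm: the same heuristic, applied to the IGEMM microkernel table.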
uint32_t xnn_get_heuristic_mr_igemm(
  size_t batch_size, uint32_t max_mr, uint32_t nr, struct xnn_hmp_igemm_ukernel *igemm_cases,
  bool code_cache_available)
{
  if (batch_size <= max_mr && mr_is_available_igemm(batch_size, igemm_cases, code_cache_available)) {
    // We have a microkernel whose MR exactly matches batch_size.
    return batch_size;
  }

  // Otherwise, find the best-fitting mr:
  // - use a cost heuristic to estimate how much work the microkernel does (see calculate_microkernel_cost),
  // - a smaller cost is better.
  uint32_t best_mr = max_mr;
  size_t best_cost = SIZE_MAX;
  for (uint32_t mr = 1; mr <= max_mr; mr++) {
    if (!mr_is_available_igemm(mr, igemm_cases, code_cache_available)) {
      continue;
    }
    const size_t current_cost = calculate_microkernel_cost(batch_size, mr, nr);
    if (current_cost <= best_cost) {
      best_mr = mr;
      best_cost = current_cost;
    }
  }
  return best_mr;
}