1 | // Copyright 2020 Google LLC |
2 | // |
3 | // This source code is licensed under the BSD-style license found in the |
4 | // LICENSE file in the root directory of this source tree. |
5 | |
6 | #include <assert.h> |
7 | #include <stdbool.h> |
8 | #include <stdint.h> |
9 | #include <stdlib.h> |
10 | |
#include <xnnpack/allocator.h>
#include <xnnpack/memory-planner.h>
12 | #include <xnnpack/subgraph.h> |
13 | |
// Checks whether the lifecycles (live ranges) of two xnn_values overlap.
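// For example, usages [first_node=2, last_node=5] and [5, 7] overlap (both values are live at node 5), while
// [2, 4] and [5, 7] do not.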
static inline bool value_lifecycle_overlap(const struct xnn_value_usage* a, const struct xnn_value_usage* b) {
16 | assert(a->last_node >= a->first_node); |
17 | assert(b->last_node >= b->first_node); |
18 | if (a->first_node < b->first_node) { |
19 | return a->last_node >= b->first_node; |
20 | } else { |
21 | return b->last_node >= a->first_node; |
22 | } |
23 | } |
24 | |
// Comparison function for qsort: sorts an array of xnn_value_usage pointers by tensor_size in decreasing order.
27 | static inline int cmp_value_usage_tensor_size(const void* a, const void* b) { |
28 | const size_t tensor_size_a = (*(struct xnn_value_usage *const*)a)->tensor_size; |
29 | const size_t tensor_size_b = (*(struct xnn_value_usage *const*)b)->tensor_size; |
30 | return (tensor_size_b > tensor_size_a) - (tensor_size_b < tensor_size_a); |
31 | } |
32 | |
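// Computes the live range of every value in 'subgraph' (the first and the last node that reads or writes it), and
// resets the reuse_value_id/alloc_offset of every usage record.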
33 | static void populate_value_lifecycle(const xnn_subgraph_t subgraph, struct xnn_value_usage* usage) { |
34 | assert(subgraph != NULL); |
35 | if (subgraph->num_nodes == 0) { |
36 | return; |
37 | } |
  // first_node/last_node in each xnn_value_usage were zero-initialized in 'xnn_init_value_allocation_tracker', so a
  // value of 0 cannot distinguish "not yet set" from "used by node 0". We therefore scan from the second node
  // onwards, using 0 to mean "unset", and fix up the values used by the first node afterwards.
40 | for (uint32_t nid = 1; nid < subgraph->num_nodes; ++nid) { |
41 | const struct xnn_node* node = subgraph->nodes + nid; |
42 | for (uint32_t i = 0; i < node->num_inputs; ++i) { |
43 | if (usage[node->inputs[i]].first_node == 0) { |
44 | usage[node->inputs[i]].first_node = nid; |
45 | } |
46 | usage[node->inputs[i]].last_node = nid; |
47 | } |
48 | for (uint32_t i = 0; i < node->num_outputs; ++i) { |
49 | if (usage[node->outputs[i]].first_node == 0) { |
50 | usage[node->outputs[i]].first_node = nid; |
51 | } |
52 | usage[node->outputs[i]].last_node = nid; |
53 | } |
54 | } |
55 | const struct xnn_node* first_node = subgraph->nodes; |
56 | for (uint32_t i = 0; i < first_node->num_inputs; ++i) { |
57 | usage[first_node->inputs[i]].first_node = 0; |
58 | } |
59 | for (uint32_t i = 0; i < first_node->num_outputs; ++i) { |
60 | usage[first_node->outputs[i]].first_node = 0; |
61 | } |
  // Initialize every usage record with an invalid reuse_value_id and alloc_offset in a separate loop over all values:
  // some usage records are not associated with any node and are never visited by the loops over nodes above.
64 | for (uint32_t i = 0; i < subgraph->num_values; i++) { |
65 | usage[i].reuse_value_id = XNN_INVALID_VALUE_ID; |
66 | usage[i].alloc_offset = SIZE_MAX; |
67 | } |
68 | } |
69 | |
// Represents a memory block [start, end).
71 | struct memory_block { |
72 | size_t start; |
73 | size_t end; |
74 | }; |
75 | |
// Comparison function for qsort: sorts memory_block entries by 'start' in increasing order.
78 | static inline int cmp_memory_block(const void* a, const void* b) { |
79 | const size_t start_a = ((const struct memory_block*)a)->start; |
80 | const size_t start_b = ((const struct memory_block*)b)->start; |
81 | return (start_a > start_b) - (start_a < start_b); |
82 | } |
83 | |
// Given the currently live memory blocks, returns the offset in the memory arena at which a to-be-allocated value of
// size 'to_alloc_size' should be placed.
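// For example (illustrative sizes): live blocks {[16, 64), [0, 32), [96, 128)} are sorted and coalesced into
// {[0, 64), [96, 128)}; a value of size 24 fits into the gap [64, 96) and is placed at offset 64, while a value of
// size 48 does not fit into any gap and is placed at offset 128, growing the arena.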
86 | static size_t find_value_alloc_offset(struct memory_block* live_mem_blocks, |
87 | size_t num_mem_blocks, |
88 | size_t to_alloc_size) { |
89 | if (num_mem_blocks == 0) { |
90 | return 0; |
91 | } |
92 | |
93 | if (num_mem_blocks == 1) { |
94 | return live_mem_blocks[0].end; |
95 | } |
96 | |
97 | // Sort memory blocks according to 'start' in increasing order. |
98 | qsort(live_mem_blocks, num_mem_blocks, sizeof(struct memory_block), cmp_memory_block); |
99 | |
  // Coalesce overlapping or immediately adjacent memory blocks into a list of disjoint memory blocks, so that the
  // gaps between them can be searched for the smallest fit.
102 | size_t num_coalesced_mem_blocks = 1; |
103 | for (size_t i = 1; i < num_mem_blocks; ++i) { |
104 | const size_t current_coalesced_end = |
105 | live_mem_blocks[num_coalesced_mem_blocks - 1].end; |
106 | if (live_mem_blocks[i].start > current_coalesced_end) { |
107 | assert(num_coalesced_mem_blocks <= i); |
108 | live_mem_blocks[num_coalesced_mem_blocks] = live_mem_blocks[i]; |
109 | num_coalesced_mem_blocks++; |
110 | continue; |
111 | } |
112 | if (live_mem_blocks[i].end > current_coalesced_end) { |
113 | live_mem_blocks[num_coalesced_mem_blocks - 1].end = live_mem_blocks[i].end; |
114 | } |
115 | } |
116 | |
117 | size_t smallest_gap_size = SIZE_MAX; |
  // Index of the coalesced block after whose end the value should be placed; defaults to the last block, i.e. the
  // value is appended at the current end of the arena if no gap is large enough.
119 | size_t smallest_gap_index = num_coalesced_mem_blocks - 1; |
120 | for (size_t i = 0; i < num_coalesced_mem_blocks - 1; ++i) { |
121 | assert(live_mem_blocks[i + 1].start > live_mem_blocks[i].end); |
122 | const size_t gap = live_mem_blocks[i + 1].start - live_mem_blocks[i].end; |
123 | if (gap >= to_alloc_size && gap < smallest_gap_size) { |
124 | smallest_gap_index = i; |
125 | smallest_gap_size = gap; |
126 | } |
127 | } |
128 | return live_mem_blocks[smallest_gap_index].end; |
129 | } |
130 | |
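// Typical use of the tracker (a sketch: the value ids and tensor sizes below are illustrative):
//
//   struct xnn_value_allocation_tracker tracker;
//   xnn_init_value_allocation_tracker(&tracker, subgraph);
//   // Register every internal value that needs arena-backed storage, in increasing value-id order.
//   xnn_add_value_allocation_tracker(&tracker, /*value_id=*/3, /*tensor_size=*/1024);
//   xnn_add_value_allocation_tracker(&tracker, /*value_id=*/5, /*tensor_size=*/4096);
//   xnn_plan_value_allocation_tracker(&tracker);
//   // tracker.mem_arena_size is the arena size to allocate; value i is placed at tracker.usage[i].alloc_offset.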
131 | void xnn_init_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker, const xnn_subgraph_t subgraph) { |
132 | tracker->subgraph = subgraph; |
133 | tracker->mem_arena_size = 0; |
134 | tracker->usage = xnn_allocate_zero_memory(sizeof(struct xnn_value_usage) * subgraph->num_values); |
135 | #if XNN_ENABLE_MEMOPT |
136 | populate_value_lifecycle(tracker->subgraph, tracker->usage); |
137 | #endif |
138 | tracker->min_value_id = XNN_INVALID_VALUE_ID; |
139 | tracker->max_value_id = XNN_INVALID_VALUE_ID; |
140 | } |
141 | |
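// Marks value 'value_id' as reusing the memory of value 'reuse_value_id' (e.g. for an in-place operation), and
// extends the live range of the reused value up to 'new_last_node'.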
142 | void xnn_mark_tensor_as_reuse(struct xnn_value_allocation_tracker* tracker, |
143 | uint32_t value_id, |
144 | uint32_t reuse_value_id, |
145 | uint32_t new_last_node) { |
  // Set tensor_size to 0 so that the memory planner does not try to allocate separate memory for this tensor.
147 | tracker->usage[value_id].tensor_size = 0; |
148 | tracker->usage[value_id].reuse_value_id = reuse_value_id; |
  // Extend the live range of the reused tensor to 'new_last_node'.
150 | tracker->usage[reuse_value_id].last_node = new_last_node; |
151 | } |
152 | |
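// Records that value 'value_id' needs 'tensor_size' bytes of arena-backed storage. Values must be added in
// increasing order of value id.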
153 | void xnn_add_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker, |
154 | uint32_t value_id, |
155 | size_t tensor_size) { |
156 | tracker->usage[value_id].tensor_size = tensor_size; |
157 | if (tracker->min_value_id == XNN_INVALID_VALUE_ID) { |
158 | tracker->min_value_id = value_id; |
159 | } else { |
160 | // Note that values are expected to be added in increasing order. |
161 | assert(value_id > tracker->min_value_id); |
162 | assert(value_id > tracker->max_value_id); |
163 | } |
164 | |
165 | tracker->max_value_id = value_id; |
166 | } |
167 | |
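// Assigns an arena offset to every tracked value. With XNN_ENABLE_MEMOPT, values are placed greedily in decreasing
// size order and may share arena space with values whose live ranges do not overlap; otherwise every value gets its
// own non-overlapping region of the arena.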
168 | void xnn_plan_value_allocation_tracker(struct xnn_value_allocation_tracker* tracker) { |
169 | #if XNN_ENABLE_MEMOPT |
170 | if (tracker->min_value_id == XNN_INVALID_VALUE_ID) { |
171 | assert(tracker->max_value_id == XNN_INVALID_VALUE_ID); |
172 | return; |
173 | } |
174 | |
175 | const uint32_t num_values = tracker->max_value_id - tracker->min_value_id + 1; |
176 | struct xnn_value_usage** sorted_usage = xnn_allocate_zero_memory(sizeof(struct xnn_value_usage*) * num_values); |
177 | size_t num_values_to_alloc = 0; |
178 | for (size_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) { |
179 | struct xnn_value_usage* info = tracker->usage + i; |
180 | if (info->tensor_size != 0) { |
181 | sorted_usage[num_values_to_alloc++] = info; |
182 | } |
183 | } |
184 | qsort(sorted_usage, num_values_to_alloc, sizeof(struct xnn_value_usage*), cmp_value_usage_tensor_size); |
185 | |
  // Greedy planning: visit values in decreasing size order; for each value, collect the memory blocks of
  // already-placed values whose live ranges overlap it, and best-fit it into the smallest adequate gap between them
  // (or append it at the end of the arena).
187 | struct memory_block* current_live_mem_blocks = xnn_allocate_zero_memory( |
188 | sizeof(struct memory_block) * num_values_to_alloc); |
189 | size_t mem_arena_size = 0; |
190 | for (size_t i = 0; i < num_values_to_alloc; ++i) { |
191 | size_t num_live_mem_blocks = 0; |
192 | struct xnn_value_usage* current = sorted_usage[i]; |
193 | for (size_t j = 0; j < i; ++j) { |
194 | const struct xnn_value_usage* allocated = sorted_usage[j]; |
195 | if (value_lifecycle_overlap(current, allocated)) { |
196 | current_live_mem_blocks[num_live_mem_blocks++] = (struct memory_block){ |
197 | .start = allocated->alloc_offset, |
198 | .end = allocated->alloc_offset + allocated->tensor_size, |
199 | }; |
200 | } |
201 | } |
202 | current->alloc_offset = find_value_alloc_offset(current_live_mem_blocks, num_live_mem_blocks, current->tensor_size); |
203 | if (mem_arena_size < current->alloc_offset + current->tensor_size) { |
204 | mem_arena_size = current->alloc_offset + current->tensor_size; |
205 | } |
206 | } |
207 | |
  // Walk through all tensors that reuse another tensor's memory and copy the reused tensor's offset into their usage
  // records.
209 | for (size_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) { |
210 | struct xnn_value_usage* usage = &tracker->usage[i]; |
211 | uint32_t reuse_id = usage->reuse_value_id; |
212 | if (reuse_id == XNN_INVALID_VALUE_ID) { |
213 | continue; |
214 | } |
215 | assert(tracker->usage[reuse_id].alloc_offset != SIZE_MAX); |
216 | usage->alloc_offset = tracker->usage[reuse_id].alloc_offset; |
217 | } |
218 | |
219 | tracker->mem_arena_size = mem_arena_size; |
220 | xnn_release_memory(sorted_usage); |
221 | xnn_release_memory(current_live_mem_blocks); |
222 | #else |
  tracker->mem_arena_size = 0;
  if (tracker->min_value_id == XNN_INVALID_VALUE_ID) {
    // No values were added to the tracker; nothing to plan.
    assert(tracker->max_value_id == XNN_INVALID_VALUE_ID);
    return;
  }
  for (uint32_t i = tracker->min_value_id; i <= tracker->max_value_id; ++i) {
225 | if (tracker->usage[i].tensor_size > 0) { |
226 | tracker->usage[i].alloc_offset = tracker->mem_arena_size; |
227 | tracker->mem_arena_size += tracker->usage[i].tensor_size; |
228 | } |
229 | } |
230 | #endif |
231 | } |
232 | |