1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #include "oneapi/dnnl/dnnl_config.h" |
18 | |
19 | #include "ngen_register_allocator.hpp" |
20 | #include "ngen_utils.hpp" |
21 | #include <iomanip> |
22 | #include <iostream> |
23 | |
24 | namespace ngen { |
25 | |
26 | int Bundle::first_reg(HW hw) const |
27 | { |
28 | int bundle0 = (bundle_id == any) ? 0 : bundle_id; |
29 | int bank0 = (bank_id == any) ? 0 : bank_id; |
30 | |
31 | switch (hw) { |
32 | case HW::Gen9: |
33 | case HW::Gen10: |
34 | return (bundle0 << 8) | bank0; |
35 | case HW::Gen11: |
36 | return (bundle0 << 8) | (bank0 << 1); |
37 | case HW::Gen12LP: |
38 | case HW::XeHPC: |
39 | return (bundle0 << 1) | bank0; |
40 | case HW::XeHP: |
41 | case HW::XeHPG: |
42 | return (bundle0 << 2) | (bank0 << 1); |
43 | default: |
44 | return 0; |
45 | } |
46 | } |
47 | |
48 | int Bundle::group_size(HW hw) const |
49 | { |
50 | if (bundle_id == any && bank_id == any) |
51 | return 128; |
52 | else switch (hw) { |
53 | case HW::Gen11: |
54 | case HW::XeHP: |
55 | case HW::XeHPG: |
56 | return 2; |
57 | default: |
58 | return 1; |
59 | } |
60 | } |
61 | |
62 | int Bundle::stride(HW hw) const |
63 | { |
64 | if (bundle_id == any && bank_id == any) |
65 | return 128; |
66 | else switch (hw) { |
67 | case HW::Gen9: |
68 | case HW::Gen10: |
69 | return 2; |
70 | case HW::Gen11: |
71 | return 4; |
72 | case HW::Gen12LP: |
73 | return 16; |
74 | case HW::XeHP: |
75 | case HW::XeHPG: |
76 | return 64; |
77 | case HW::XeHPC: |
78 | return 32; |
79 | default: |
80 | return 128; |
81 | } |
82 | } |
83 | |
int64_t Bundle::reg_mask(HW hw, int offset) const
{
    // Build a 64-bit occupancy mask for the 64-GRF chunk with index 'offset':
    // bit i is set iff register (offset*64 + i) can belong to this bundle/bank.
    // A wildcard ('any') bundle or bank matches everything in that dimension.
    int64_t bundle_mask = -1, bank_mask = -1, base_mask = -1;
    int bundle0 = (bundle_id == any) ? 0 : bundle_id;
    int bank0 = (bank_id == any) ? 0 : bank_id;

    switch (hw) {
        case HW::Gen9:
        case HW::Gen10:
            // Bundle = an entire 64-GRF chunk; banks alternate every register.
            if (bundle_id != any && bundle_id != offset) bundle_mask = 0;
            if (bank_id != any) bank_mask = 0x5555555555555555 << bank_id;
            return bundle_mask & bank_mask;
        case HW::Gen11:
            // Bundle = an entire 64-GRF chunk; banks alternate every 2 registers.
            if (bundle_id != any && bundle_id != offset) bundle_mask = 0;
            if (bank_id != any) bank_mask = 0x3333333333333333 << (bank_id << 1);
            return bundle_mask & bank_mask;
        case HW::Gen12LP:
            // 8 bundles of 2 registers, repeating every 16 GRFs; banks alternate per register.
            if (bundle_id != any) base_mask = 0x0003000300030003;
            if (bank_id != any) base_mask &= 0x5555555555555555;
            return base_mask << (bank0 + (bundle0 << 1));
        case HW::XeHP:
        case HW::XeHPG:
            // 16 bundles of 4 registers per 64-GRF chunk; banks alternate every 2 registers.
            if (bundle_id != any) base_mask = 0x000000000000000F;
            if (bank_id != any) base_mask &= 0x3333333333333333;
            return base_mask << ((bank0 << 1) + (bundle0 << 2));
        case HW::XeHPC:
            // 16 bundles of 2 registers, repeating every 32 GRFs; banks alternate per register.
            if (bundle_id != any) base_mask = 0x0000000300000003;
            if (bank_id != any) base_mask &= 0x5555555555555555;
            return base_mask << (bank0 + (bundle0 << 1));
        default:
            return -1;
    }
}
117 | |
118 | Bundle Bundle::locate(HW hw, RegData reg) |
119 | { |
120 | int base = reg.getBase(); |
121 | |
122 | switch (hw) { |
123 | case HW::Gen9: |
124 | case HW::Gen10: |
125 | return Bundle(base & 1, base >> 6); |
126 | case HW::Gen11: |
127 | return Bundle((base >> 1) & 1, base >> 6); |
128 | case HW::Gen12LP: |
129 | return Bundle(base & 1, (base >> 1) & 7); |
130 | case HW::XeHP: |
131 | case HW::XeHPG: |
132 | return Bundle((base >> 1) & 1, (base >> 2) & 0xF); |
133 | case HW::XeHPC: |
134 | return Bundle(base & 1, (base >> 1) & 0xF); |
135 | default: |
136 | return Bundle(); |
137 | } |
138 | } |
139 | |
140 | // ----------------------------------------- |
141 | // Low-level register allocator functions. |
142 | // ----------------------------------------- |
143 | |
144 | void RegisterAllocator::init() |
145 | { |
146 | fullSubMask = (1u << (GRF::bytes(hw) >> 2)) - 1; |
147 | for (int r = 0; r < max_regs; r++) |
148 | free_sub[r] = fullSubMask; |
149 | for (int r_whole = 0; r_whole < (max_regs >> 3); r_whole++) |
150 | free_whole[r_whole] = 0xFF; |
151 | |
152 | free_flag = (1u << FlagRegister::subcount(hw)) - 1; |
153 | reg_count = max_regs; |
154 | |
155 | if (hw < HW::XeHP) |
156 | setRegisterCount(128); |
157 | } |
158 | |
159 | void RegisterAllocator::claim(GRF reg) |
160 | { |
161 | int r = reg.getBase(); |
162 | |
163 | free_sub[r] = 0x00; |
164 | free_whole[r >> 3] &= ~(1 << (r & 7)); |
165 | } |
166 | |
167 | void RegisterAllocator::claim(GRFRange range) |
168 | { |
169 | for (int i = 0; i < range.getLen(); i++) |
170 | claim(range[i]); |
171 | } |
172 | |
173 | void RegisterAllocator::claim(Subregister subreg) |
174 | { |
175 | int r = subreg.getBase(); |
176 | int dw = subreg.getDwords(); |
177 | int o = (subreg.getByteOffset()) >> 2; |
178 | |
179 | claim_sub(r, o, dw); |
180 | } |
181 | |
182 | void RegisterAllocator::claim_sub(int r, int o, int dw) |
183 | { |
184 | free_sub[r] &= ~((1 << (o + dw)) - (1 << o)); |
185 | free_whole[r >> 3] &= ~(1 << (r & 7)); |
186 | } |
187 | |
188 | void RegisterAllocator::claim(FlagRegister flag) |
189 | { |
190 | free_flag &= ~(1 << flag.index()); |
191 | } |
192 | |
193 | void RegisterAllocator::setRegisterCount(int rcount) |
194 | { |
195 | if (rcount < reg_count) { |
196 | for (int r = rcount; r < max_regs; r++) |
197 | free_sub[r] = 0x00; |
198 | for (int rr = (rcount + 7) >> 3; rr < (max_regs >> 3); rr++) |
199 | free_whole[rr] = 0x00; |
200 | if ((rcount & 7) && (rcount < max_regs)) |
201 | free_whole[rcount >> 3] &= ~((1 << (rcount & 7)) - 1); |
202 | } else if (rcount > reg_count) { |
203 | for (int r = reg_count; r < std::min(rcount, max_regs); r++) |
204 | release(GRF(r)); |
205 | } |
206 | reg_count = rcount; |
207 | } |
208 | |
209 | int RegisterAllocator::countAllocedRegisters() const { |
210 | |
211 | int register_count = 0; |
212 | int group_size = 8 * sizeof(this->free_whole[0]); |
213 | int register_groups = this->reg_count / group_size; |
214 | for (int group = 0; group < register_groups; group++) { |
215 | for (int subgroup = 0; subgroup < group_size; subgroup++) { |
216 | if ((this->free_whole[group] & (1 << subgroup)) == 0) |
217 | register_count++; |
218 | } |
219 | } |
220 | return register_count; |
221 | } |
222 | |
223 | void RegisterAllocator::release(GRF reg) |
224 | { |
225 | if (reg.isInvalid()) return; |
226 | int r = reg.getBase(); |
227 | |
228 | free_sub[r] = fullSubMask; |
229 | free_whole[r >> 3] |= (1 << (r & 7)); |
230 | } |
231 | |
232 | void RegisterAllocator::release(GRFRange range) |
233 | { |
234 | if (range.isInvalid()) return; |
235 | for (int i = 0; i < range.getLen(); i++) |
236 | release(range[i]); |
237 | } |
238 | |
239 | void RegisterAllocator::release(Subregister subreg) |
240 | { |
241 | if (subreg.isInvalid()) return; |
242 | int r = subreg.getBase(); |
243 | int dw = subreg.getDwords(); |
244 | int o = (subreg.getByteOffset()) >> 2; |
245 | |
246 | free_sub[r] |= (1 << (o + dw)) - (1 << o); |
247 | if (free_sub[r] == fullSubMask) |
248 | free_whole[r >> 3] |= (1 << (r & 7)); |
249 | } |
250 | |
251 | void RegisterAllocator::release(FlagRegister flag) |
252 | { |
253 | if (flag.isInvalid()) return; |
254 | free_flag |= (1 << flag.index()); |
255 | } |
256 | |
257 | // ------------------------------------------- |
258 | // High-level register allocation functions. |
259 | // ------------------------------------------- |
260 | |
261 | GRFRange RegisterAllocator::alloc_range(int nregs, Bundle base_bundle, BundleGroup bundle_mask) |
262 | { |
263 | auto result = try_alloc_range(nregs, base_bundle, bundle_mask); |
264 | if (result.isInvalid()) |
265 | throw out_of_registers_exception(); |
266 | return result; |
267 | } |
268 | |
269 | Subregister RegisterAllocator::alloc_sub(DataType type, Bundle bundle) |
270 | { |
271 | auto result = try_alloc_sub(type, bundle); |
272 | if (result.isInvalid()) |
273 | throw out_of_registers_exception(); |
274 | return result; |
275 | } |
276 | |
277 | FlagRegister RegisterAllocator::alloc_flag() |
278 | { |
279 | auto result = try_alloc_flag(); |
280 | if (result.isInvalid()) |
281 | throw out_of_registers_exception(); |
282 | return result; |
283 | } |
284 | |
GRFRange RegisterAllocator::try_alloc_range(int nregs, Bundle base_bundle, BundleGroup bundle_mask)
{
    // Try to allocate nregs consecutive registers whose base register lies in
    // base_bundle and which are all permitted by bundle_mask.
    // Returns an invalid GRFRange (no exception) on failure.
    int64_t *free_whole64 = (int64_t *) free_whole;
    bool ok = false;
    int r_base = -1;

    // Scan the register file in 64-GRF chunks.
    for (int rchunk = 0; rchunk < (max_regs >> 6); rchunk++) {
        // free: fully-free registers in this chunk allowed by bundle_mask.
        // free_base: subset also usable as the base register of the range.
        int64_t free = free_whole64[rchunk] & bundle_mask.reg_mask(rchunk);
        int64_t free_base = free & base_bundle.reg_mask(hw, rchunk);

        while (free_base) {
            // Find the first free base register.
            int first_bit = utils::bsf(free_base);
            r_base = first_bit + (rchunk << 6);

            // Check if required # of registers are available.
            int last_bit = first_bit + nregs;
            if (last_bit <= 64) {
                // Range to check doesn't cross 64-GRF boundary. Fast check using bitmasks.
                // (The double shift avoids shifting by 64 when last_bit == 64.)
                uint64_t mask = ((uint64_t(1) << (last_bit - 1)) << 1) - (uint64_t(1) << first_bit);
                ok = !(mask & ~free);
            } else {
                // Range to check crosses 64-GRF boundary. Check first part using bitmasks,
                // Check the rest using a loop (ho hum).
                uint64_t mask = ~uint64_t(0) << first_bit;
                ok = !(mask & ~free);
                if (ok) for (int rr = 64 - first_bit; rr < nregs; rr++) {
                    if (free_sub[r_base + rr] != fullSubMask) {
                        ok = false;
                        break;
                    }
                }
            }

            if (ok) {
                // Create and claim GRF range.
                GRFRange result(r_base, nregs);
                claim(result);

                return result;
            }

            // Not enough consecutive registers. Save time when looking for next base
            // register by clearing the entire range of registers we just considered.
            // (Adding 1 << first_bit carries through the run of set bits starting at
            // first_bit, clearing that whole run in one step.)
            int64_t clear_mask = free + (uint64_t(1) << first_bit);
            free &= clear_mask;
            free_base &= clear_mask;
        }
    }

    return GRFRange();
}
337 | |
338 | GRF RegisterAllocator::try_alloc(Bundle bundle) |
339 | { |
340 | auto range = try_alloc_range(1, bundle); |
341 | return range.isInvalid() ? GRF() : range[0]; |
342 | } |
343 | |
344 | Subregister RegisterAllocator::try_alloc_sub(DataType type, Bundle bundle) |
345 | { |
346 | int dwords = getDwords(type); |
347 | int r_alloc = 0, o_alloc = 0; |
348 | |
349 | auto find_alloc_sub = [&,bundle,dwords](bool search_full_grf) -> bool { |
350 | static const uint16_t alloc_patterns[4] = {0b1111111111111111, 0b0101010101010101, 0, 0b0001000100010001}; |
351 | auto alloc_pattern = alloc_patterns[(dwords - 1) & 3]; |
352 | int64_t *free_whole64 = (int64_t *) free_whole; |
353 | |
354 | for (int rchunk = 0; rchunk < (max_regs >> 6); rchunk++) { |
355 | int64_t free = search_full_grf ? free_whole64[rchunk] : -1; |
356 | free &= bundle.reg_mask(hw, rchunk); |
357 | |
358 | while (free) { |
359 | int rr = utils::bsf(free); |
360 | int r = rr + (rchunk << 6); |
361 | free &= ~(int64_t(1) << rr); |
362 | |
363 | if (search_full_grf || free_sub[r] != fullSubMask) { |
364 | int subfree = free_sub[r]; |
365 | for (int dw = 1; dw < dwords; dw++) |
366 | subfree &= (subfree >> dw); |
367 | subfree &= alloc_pattern; |
368 | |
369 | if (subfree) { |
370 | r_alloc = r; |
371 | o_alloc = utils::bsf(subfree); |
372 | return true; |
373 | } |
374 | } |
375 | } |
376 | } |
377 | |
378 | return false; |
379 | }; |
380 | |
381 | // First try to find room in a partially allocated register; fall back to |
382 | // completely empty registers if unsuccessful. |
383 | bool success = find_alloc_sub(false) |
384 | || find_alloc_sub(true); |
385 | |
386 | if (!success) |
387 | return Subregister(); |
388 | |
389 | claim_sub(r_alloc, o_alloc, dwords); |
390 | |
391 | return Subregister(GRF(r_alloc), (o_alloc << 2) / getBytes(type), type); |
392 | } |
393 | |
394 | FlagRegister RegisterAllocator::try_alloc_flag() |
395 | { |
396 | if (!free_flag) return FlagRegister(); |
397 | |
398 | int idx = utils::bsf(free_flag); |
399 | free_flag &= (free_flag - 1); // clear lowest bit. |
400 | |
401 | return FlagRegister::createFromIndex(idx); |
402 | } |
403 | |
void RegisterAllocator::dump(std::ostream &str)
{
    // Pretty-print allocator state as commented text:
    //   '.' = free, 'x' = fully allocated, '/' = partially allocated.
    str << "\n// Flag registers: " ;
    for (int r = 0; r < FlagRegister::subcount(hw); r++)
        str << char((free_flag & (1 << r)) ? '.' : 'x');

    for (int r = 0; r < reg_count; r++) {
        // New header row every 32 registers.
        if (!(r & 0x1F)) {
            str << "\n//\n// " << std::left;
            str << 'r' << std::setw(3) << r;
            str << " - r" << std::setw(3) << r+0x1F;
            str << "  " ;
        }
        // Extra spacing every 16 and every 4 registers for readability.
        if (!(r & 0xF)) str << ' ';
        if (!(r & 0x3)) str << ' ';

        if (free_sub[r] == 0x00) str << 'x';
        else if (free_sub[r] == fullSubMask) str << '.';
        else str << '/';
    }

    str << "\n//\n" ;

    // Cross-check free_sub against free_whole, and show per-dword detail for
    // partially-allocated registers.
    for (int r = 0; r < max_regs; r++) {
        int rr = r >> 3, rb = 1 << (r & 7);
        if ((free_sub[r] == fullSubMask) != bool(free_whole[rr] & rb))
            str << "// Inconsistent bitmaps at r" << r << std::endl;
        if (free_sub[r] != 0x00 && free_sub[r] != fullSubMask) {
            str << "// r" << std::setw(3) << r << "   " ;
            for (int s = 0; s < (GRF::bytes(hw) >> 2); s++)
                str << char((free_sub[r] & (1 << s)) ? '.' : 'x');
            str << std::endl;
        }
    }

    str << std::endl;
}
441 | |
// Out-of-line definitions for static constexpr data members. Required for
// ODR-use prior to C++17 (where such members became implicitly inline).
constexpr int BundleGroup::max_regs;
constexpr int BundleGroup::nmasks;
constexpr int RegisterAllocator::max_regs;
445 | |
446 | } /* namespace ngen */ |
447 | |