1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#include "oneapi/dnnl/dnnl_config.h"
18
19#include "ngen_register_allocator.hpp"
20#include "ngen_utils.hpp"
21#include <iomanip>
22#include <iostream>
23
24namespace ngen {
25
26int Bundle::first_reg(HW hw) const
27{
28 int bundle0 = (bundle_id == any) ? 0 : bundle_id;
29 int bank0 = (bank_id == any) ? 0 : bank_id;
30
31 switch (hw) {
32 case HW::Gen9:
33 case HW::Gen10:
34 return (bundle0 << 8) | bank0;
35 case HW::Gen11:
36 return (bundle0 << 8) | (bank0 << 1);
37 case HW::Gen12LP:
38 case HW::XeHPC:
39 return (bundle0 << 1) | bank0;
40 case HW::XeHP:
41 case HW::XeHPG:
42 return (bundle0 << 2) | (bank0 << 1);
43 default:
44 return 0;
45 }
46}
47
48int Bundle::group_size(HW hw) const
49{
50 if (bundle_id == any && bank_id == any)
51 return 128;
52 else switch (hw) {
53 case HW::Gen11:
54 case HW::XeHP:
55 case HW::XeHPG:
56 return 2;
57 default:
58 return 1;
59 }
60}
61
62int Bundle::stride(HW hw) const
63{
64 if (bundle_id == any && bank_id == any)
65 return 128;
66 else switch (hw) {
67 case HW::Gen9:
68 case HW::Gen10:
69 return 2;
70 case HW::Gen11:
71 return 4;
72 case HW::Gen12LP:
73 return 16;
74 case HW::XeHP:
75 case HW::XeHPG:
76 return 64;
77 case HW::XeHPC:
78 return 32;
79 default:
80 return 128;
81 }
82}
83
84int64_t Bundle::reg_mask(HW hw, int offset) const
85{
86 int64_t bundle_mask = -1, bank_mask = -1, base_mask = -1;
87 int bundle0 = (bundle_id == any) ? 0 : bundle_id;
88 int bank0 = (bank_id == any) ? 0 : bank_id;
89
90 switch (hw) {
91 case HW::Gen9:
92 case HW::Gen10:
93 if (bundle_id != any && bundle_id != offset) bundle_mask = 0;
94 if (bank_id != any) bank_mask = 0x5555555555555555 << bank_id;
95 return bundle_mask & bank_mask;
96 case HW::Gen11:
97 if (bundle_id != any && bundle_id != offset) bundle_mask = 0;
98 if (bank_id != any) bank_mask = 0x3333333333333333 << (bank_id << 1);
99 return bundle_mask & bank_mask;
100 case HW::Gen12LP:
101 if (bundle_id != any) base_mask = 0x0003000300030003;
102 if (bank_id != any) base_mask &= 0x5555555555555555;
103 return base_mask << (bank0 + (bundle0 << 1));
104 case HW::XeHP:
105 case HW::XeHPG:
106 if (bundle_id != any) base_mask = 0x000000000000000F;
107 if (bank_id != any) base_mask &= 0x3333333333333333;
108 return base_mask << ((bank0 << 1) + (bundle0 << 2));
109 case HW::XeHPC:
110 if (bundle_id != any) base_mask = 0x0000000300000003;
111 if (bank_id != any) base_mask &= 0x5555555555555555;
112 return base_mask << (bank0 + (bundle0 << 1));
113 default:
114 return -1;
115 }
116}
117
118Bundle Bundle::locate(HW hw, RegData reg)
119{
120 int base = reg.getBase();
121
122 switch (hw) {
123 case HW::Gen9:
124 case HW::Gen10:
125 return Bundle(base & 1, base >> 6);
126 case HW::Gen11:
127 return Bundle((base >> 1) & 1, base >> 6);
128 case HW::Gen12LP:
129 return Bundle(base & 1, (base >> 1) & 7);
130 case HW::XeHP:
131 case HW::XeHPG:
132 return Bundle((base >> 1) & 1, (base >> 2) & 0xF);
133 case HW::XeHPC:
134 return Bundle(base & 1, (base >> 1) & 0xF);
135 default:
136 return Bundle();
137 }
138}
139
140// -----------------------------------------
141// Low-level register allocator functions.
142// -----------------------------------------
143
144void RegisterAllocator::init()
145{
146 fullSubMask = (1u << (GRF::bytes(hw) >> 2)) - 1;
147 for (int r = 0; r < max_regs; r++)
148 free_sub[r] = fullSubMask;
149 for (int r_whole = 0; r_whole < (max_regs >> 3); r_whole++)
150 free_whole[r_whole] = 0xFF;
151
152 free_flag = (1u << FlagRegister::subcount(hw)) - 1;
153 reg_count = max_regs;
154
155 if (hw < HW::XeHP)
156 setRegisterCount(128);
157}
158
159void RegisterAllocator::claim(GRF reg)
160{
161 int r = reg.getBase();
162
163 free_sub[r] = 0x00;
164 free_whole[r >> 3] &= ~(1 << (r & 7));
165}
166
167void RegisterAllocator::claim(GRFRange range)
168{
169 for (int i = 0; i < range.getLen(); i++)
170 claim(range[i]);
171}
172
173void RegisterAllocator::claim(Subregister subreg)
174{
175 int r = subreg.getBase();
176 int dw = subreg.getDwords();
177 int o = (subreg.getByteOffset()) >> 2;
178
179 claim_sub(r, o, dw);
180}
181
182void RegisterAllocator::claim_sub(int r, int o, int dw)
183{
184 free_sub[r] &= ~((1 << (o + dw)) - (1 << o));
185 free_whole[r >> 3] &= ~(1 << (r & 7));
186}
187
188void RegisterAllocator::claim(FlagRegister flag)
189{
190 free_flag &= ~(1 << flag.index());
191}
192
193void RegisterAllocator::setRegisterCount(int rcount)
194{
195 if (rcount < reg_count) {
196 for (int r = rcount; r < max_regs; r++)
197 free_sub[r] = 0x00;
198 for (int rr = (rcount + 7) >> 3; rr < (max_regs >> 3); rr++)
199 free_whole[rr] = 0x00;
200 if ((rcount & 7) && (rcount < max_regs))
201 free_whole[rcount >> 3] &= ~((1 << (rcount & 7)) - 1);
202 } else if (rcount > reg_count) {
203 for (int r = reg_count; r < std::min(rcount, max_regs); r++)
204 release(GRF(r));
205 }
206 reg_count = rcount;
207}
208
209int RegisterAllocator::countAllocedRegisters() const {
210
211 int register_count = 0;
212 int group_size = 8 * sizeof(this->free_whole[0]);
213 int register_groups = this->reg_count / group_size;
214 for (int group = 0; group < register_groups; group++) {
215 for (int subgroup = 0; subgroup < group_size; subgroup++) {
216 if ((this->free_whole[group] & (1 << subgroup)) == 0)
217 register_count++;
218 }
219 }
220 return register_count;
221}
222
223void RegisterAllocator::release(GRF reg)
224{
225 if (reg.isInvalid()) return;
226 int r = reg.getBase();
227
228 free_sub[r] = fullSubMask;
229 free_whole[r >> 3] |= (1 << (r & 7));
230}
231
232void RegisterAllocator::release(GRFRange range)
233{
234 if (range.isInvalid()) return;
235 for (int i = 0; i < range.getLen(); i++)
236 release(range[i]);
237}
238
239void RegisterAllocator::release(Subregister subreg)
240{
241 if (subreg.isInvalid()) return;
242 int r = subreg.getBase();
243 int dw = subreg.getDwords();
244 int o = (subreg.getByteOffset()) >> 2;
245
246 free_sub[r] |= (1 << (o + dw)) - (1 << o);
247 if (free_sub[r] == fullSubMask)
248 free_whole[r >> 3] |= (1 << (r & 7));
249}
250
251void RegisterAllocator::release(FlagRegister flag)
252{
253 if (flag.isInvalid()) return;
254 free_flag |= (1 << flag.index());
255}
256
257// -------------------------------------------
258// High-level register allocation functions.
259// -------------------------------------------
260
261GRFRange RegisterAllocator::alloc_range(int nregs, Bundle base_bundle, BundleGroup bundle_mask)
262{
263 auto result = try_alloc_range(nregs, base_bundle, bundle_mask);
264 if (result.isInvalid())
265 throw out_of_registers_exception();
266 return result;
267}
268
269Subregister RegisterAllocator::alloc_sub(DataType type, Bundle bundle)
270{
271 auto result = try_alloc_sub(type, bundle);
272 if (result.isInvalid())
273 throw out_of_registers_exception();
274 return result;
275}
276
277FlagRegister RegisterAllocator::alloc_flag()
278{
279 auto result = try_alloc_flag();
280 if (result.isInvalid())
281 throw out_of_registers_exception();
282 return result;
283}
284
285GRFRange RegisterAllocator::try_alloc_range(int nregs, Bundle base_bundle, BundleGroup bundle_mask)
286{
287 int64_t *free_whole64 = (int64_t *) free_whole;
288 bool ok = false;
289 int r_base = -1;
290
291 for (int rchunk = 0; rchunk < (max_regs >> 6); rchunk++) {
292 int64_t free = free_whole64[rchunk] & bundle_mask.reg_mask(rchunk);
293 int64_t free_base = free & base_bundle.reg_mask(hw, rchunk);
294
295 while (free_base) {
296 // Find the first free base register.
297 int first_bit = utils::bsf(free_base);
298 r_base = first_bit + (rchunk << 6);
299
300 // Check if required # of registers are available.
301 int last_bit = first_bit + nregs;
302 if (last_bit <= 64) {
303 // Range to check doesn't cross 64-GRF boundary. Fast check using bitmasks.
304 uint64_t mask = ((uint64_t(1) << (last_bit - 1)) << 1) - (uint64_t(1) << first_bit);
305 ok = !(mask & ~free);
306 } else {
307 // Range to check crosses 64-GRF boundary. Check first part using bitmasks,
308 // Check the rest using a loop (ho hum).
309 uint64_t mask = ~uint64_t(0) << first_bit;
310 ok = !(mask & ~free);
311 if (ok) for (int rr = 64 - first_bit; rr < nregs; rr++) {
312 if (free_sub[r_base + rr] != fullSubMask) {
313 ok = false;
314 break;
315 }
316 }
317 }
318
319 if (ok) {
320 // Create and claim GRF range.
321 GRFRange result(r_base, nregs);
322 claim(result);
323
324 return result;
325 }
326
327 // Not enough consecutive registers. Save time when looking for next base
328 // register by clearing the entire range of registers we just considered.
329 int64_t clear_mask = free + (uint64_t(1) << first_bit);
330 free &= clear_mask;
331 free_base &= clear_mask;
332 }
333 }
334
335 return GRFRange();
336}
337
338GRF RegisterAllocator::try_alloc(Bundle bundle)
339{
340 auto range = try_alloc_range(1, bundle);
341 return range.isInvalid() ? GRF() : range[0];
342}
343
344Subregister RegisterAllocator::try_alloc_sub(DataType type, Bundle bundle)
345{
346 int dwords = getDwords(type);
347 int r_alloc = 0, o_alloc = 0;
348
349 auto find_alloc_sub = [&,bundle,dwords](bool search_full_grf) -> bool {
350 static const uint16_t alloc_patterns[4] = {0b1111111111111111, 0b0101010101010101, 0, 0b0001000100010001};
351 auto alloc_pattern = alloc_patterns[(dwords - 1) & 3];
352 int64_t *free_whole64 = (int64_t *) free_whole;
353
354 for (int rchunk = 0; rchunk < (max_regs >> 6); rchunk++) {
355 int64_t free = search_full_grf ? free_whole64[rchunk] : -1;
356 free &= bundle.reg_mask(hw, rchunk);
357
358 while (free) {
359 int rr = utils::bsf(free);
360 int r = rr + (rchunk << 6);
361 free &= ~(int64_t(1) << rr);
362
363 if (search_full_grf || free_sub[r] != fullSubMask) {
364 int subfree = free_sub[r];
365 for (int dw = 1; dw < dwords; dw++)
366 subfree &= (subfree >> dw);
367 subfree &= alloc_pattern;
368
369 if (subfree) {
370 r_alloc = r;
371 o_alloc = utils::bsf(subfree);
372 return true;
373 }
374 }
375 }
376 }
377
378 return false;
379 };
380
381 // First try to find room in a partially allocated register; fall back to
382 // completely empty registers if unsuccessful.
383 bool success = find_alloc_sub(false)
384 || find_alloc_sub(true);
385
386 if (!success)
387 return Subregister();
388
389 claim_sub(r_alloc, o_alloc, dwords);
390
391 return Subregister(GRF(r_alloc), (o_alloc << 2) / getBytes(type), type);
392}
393
394FlagRegister RegisterAllocator::try_alloc_flag()
395{
396 if (!free_flag) return FlagRegister();
397
398 int idx = utils::bsf(free_flag);
399 free_flag &= (free_flag - 1); // clear lowest bit.
400
401 return FlagRegister::createFromIndex(idx);
402}
403
404void RegisterAllocator::dump(std::ostream &str)
405{
406 str << "\n// Flag registers: ";
407 for (int r = 0; r < FlagRegister::subcount(hw); r++)
408 str << char((free_flag & (1 << r)) ? '.' : 'x');
409
410 for (int r = 0; r < reg_count; r++) {
411 if (!(r & 0x1F)) {
412 str << "\n//\n// " << std::left;
413 str << 'r' << std::setw(3) << r;
414 str << " - r" << std::setw(3) << r+0x1F;
415 str << " ";
416 }
417 if (!(r & 0xF)) str << ' ';
418 if (!(r & 0x3)) str << ' ';
419
420 if (free_sub[r] == 0x00) str << 'x';
421 else if (free_sub[r] == fullSubMask) str << '.';
422 else str << '/';
423 }
424
425 str << "\n//\n";
426
427 for (int r = 0; r < max_regs; r++) {
428 int rr = r >> 3, rb = 1 << (r & 7);
429 if ((free_sub[r] == fullSubMask) != bool(free_whole[rr] & rb))
430 str << "// Inconsistent bitmaps at r" << r << std::endl;
431 if (free_sub[r] != 0x00 && free_sub[r] != fullSubMask) {
432 str << "// r" << std::setw(3) << r << " ";
433 for (int s = 0; s < (GRF::bytes(hw) >> 2); s++)
434 str << char((free_sub[r] & (1 << s)) ? '.' : 'x');
435 str << std::endl;
436 }
437 }
438
439 str << std::endl;
440}
441
442constexpr int BundleGroup::max_regs;
443constexpr int BundleGroup::nmasks;
444constexpr int RegisterAllocator::max_regs;
445
446} /* namespace ngen */
447