1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /* |
18 | * Do not #include this file directly; ngen uses it internally. |
19 | */ |
20 | |
21 | #ifndef NGEN_AUTO_SWSB_HPP |
22 | #define NGEN_AUTO_SWSB_HPP |
23 | |
24 | #if defined(NGEN_DEBUG) || defined(NGEN_DEBUG_PROPAGATE) || defined(NGEN_DEBUG_BB) |
25 | #include <iomanip> |
26 | #include <iostream> |
27 | #endif |
28 | |
29 | #include <limits> |
30 | #include <list> |
31 | #include <map> |
32 | |
33 | namespace ngen { |
34 | namespace autoswsb { |
35 | |
36 | /*******************/ |
37 | /* Data structures */ |
38 | /*******************/ |
39 | |
40 | typedef uint8_t PipeMask; |
41 | enum { |
42 | PipeMaskNone = 0, |
43 | PipeMaskA = 1, // All in-order pipes |
44 | PipeMaskF = 2, |
45 | PipeMaskI = 4, |
46 | PipeMaskL = 8, |
47 | PipeMaskM = 0x10, |
48 | PipeMaskO = 0x20, // All out-of-order pipes. Not a valid GeneralizedPipe. |
49 | PipeBitA = 0, |
50 | PipeBitF = 1, |
51 | PipeBitI = 2, |
52 | PipeBitL = 3, |
53 | PipeBitM = 4, |
54 | PipeBitO = 5, |
55 | }; |
56 | static constexpr int NPipes = 5; |
57 | |
58 | static inline PipeMask toMask(Pipe pipe) { return (1 << (static_cast<unsigned>(pipe) - 1)); } |
59 | static inline Pipe fromMask(PipeMask mask) { return mask ? static_cast<Pipe>(1 + utils::log2(mask)) : Pipe::Default; } |
60 | |
61 | typedef uint8_t DestinationMask; |
62 | enum { |
63 | DestNone = 0, |
64 | DestNextIP = 1, |
65 | DestJIP = 2, |
66 | DestUIP = 4, |
67 | DestUnknown = 8 |
68 | }; |
69 | |
70 | class GeneralizedPipe { |
71 | uint8_t v; |
72 | |
73 | static constexpr uint8_t vInOrder = 0x00; |
74 | static constexpr uint8_t vSend = 0x40; // OR'ed with SFID |
75 | static constexpr uint8_t vSystolic = 0x80; |
76 | static constexpr uint8_t vMath = 0xC0; |
77 | static constexpr uint8_t vTypeMask = 0xC0; |
78 | |
79 | GeneralizedPipe(uint8_t v_, int dummy) : v{v_} {} |
80 | |
81 | public: |
82 | GeneralizedPipe() : v{uint8_t(0)} {} |
83 | GeneralizedPipe(PipeMask pipe) : v{uint8_t(vInOrder | pipe)} {} |
84 | GeneralizedPipe(SharedFunction sfid) : v{uint8_t(vSend | static_cast<uint8_t>(sfid))} {} |
85 | |
86 | static GeneralizedPipe Systolic() { return GeneralizedPipe(vSystolic, 0); } |
87 | static GeneralizedPipe Math() { return GeneralizedPipe(vMath, 0); } |
88 | |
89 | bool operator==(GeneralizedPipe other) const { return v == other.v; } |
90 | bool operator!=(GeneralizedPipe other) const { return v != other.v; } |
91 | |
92 | bool inOrder() const { return ((v & vTypeMask) == vInOrder) && (v != PipeMaskNone); } |
93 | PipeMask inOrderPipe() const { return inOrder() ? (v & ~vTypeMask) : PipeMaskNone; } |
94 | Pipe toPipe() const { return fromMask(inOrderPipe()); } |
95 | inline PipeMask syncPipes(HW hw) const; |
96 | |
97 | #ifdef NGEN_DEBUG |
98 | inline void dump() const; |
99 | #endif |
100 | }; |
101 | |
102 | struct DependencyRegion { |
103 | uint8_t base, size; |
104 | uint8_t unspecified : 1; |
105 | uint8_t checkWAW : 1; |
106 | uint8_t arf : 1; |
107 | HW hw; |
108 | std::array<uint32_t, 32> masks; |
109 | |
110 | DependencyRegion() : DependencyRegion(HW::Unknown) {} |
111 | explicit DependencyRegion(HW hw_) : base(0), size(0), unspecified{true}, checkWAW{false}, arf{false}, hw{hw_} { |
112 | for (auto &m: masks) m = 0; |
113 | } |
114 | inline DependencyRegion(HW hw, GRFRange r); |
115 | inline DependencyRegion(HW hw, int esize, RegData rr); |
116 | |
117 | inline void intersect(const DependencyRegion &other); |
118 | inline void subtract(const DependencyRegion &other); |
119 | |
120 | bool empty() const { |
121 | if (unspecified) return false; |
122 | if (size == 0) return true; |
123 | for (auto m : masks) |
124 | if (m != 0) |
125 | return false; |
126 | return true; |
127 | } |
128 | void clear() { *this = DependencyRegion(hw); unspecified = false; checkWAW = false; arf = false; } |
129 | |
130 | #ifdef NGEN_DEBUG |
131 | inline void dump() const; |
132 | #endif |
133 | }; |
134 | |
135 | template <bool consumer> |
136 | struct Dependency { |
137 | int32_t label; // Multipurpose label for use in algorithms |
138 | |
139 | // Source instruction information. |
140 | GeneralizedPipe pipe; // Execution pipe for instruction |
141 | uint16_t tokenTime; // Estimated upper bound for token lifetime, in cycles. |
142 | std::array<int32_t, NPipes> counters; // Pipe counters, relative to start of BB. |
143 | |
144 | // (Mostly) dependency information. |
145 | uint8_t token; // Out of order token |
146 | uint8_t tokenSrc : 1; // Src dependency on token? |
147 | uint8_t tokenDst : 1; // Dst dependency on token? |
148 | uint8_t rw : 1; // Flag: read or write? |
149 | uint8_t swsb : 1; // True for SWSB dependency consumers |
150 | uint8_t active : 1; // True if dependency is still alive. |
151 | PipeMask depPipe; // (swsb consumer only) Pipe to wait on |
152 | uint8_t dist; // (swsb consumer only) Pipe distance |
153 | DependencyRegion region; // GRF region covered |
154 | |
155 | Dependency() : label{0}, pipe{}, tokenTime{0}, |
156 | token{0}, tokenSrc{false}, tokenDst{false}, |
157 | rw{false}, swsb{false}, active{true}, |
158 | depPipe{PipeMaskNone}, dist{0}, region{} { counters.fill(0); } |
159 | |
160 | bool operator==(const Dependency &other) { |
161 | return !std::memcmp(this, &other, sizeof(Dependency)); |
162 | } |
163 | bool operator!=(const Dependency *other) { return !(operator==(other)); } |
164 | |
165 | int32_t &inum() { return counters[1]; } // For OOO dependencies in phase 0 |
166 | const int32_t &inum() const { return counters[1]; } |
167 | |
168 | constexpr bool read() const { return !rw; } |
169 | constexpr bool write() const { return rw; } |
170 | constexpr bool hasToken() const { return tokenSrc || tokenDst; } |
171 | constexpr bool hasDist() const { return (dist > 0); } |
172 | |
173 | Dependency<!consumer>& cast() { return reinterpret_cast<Dependency<!consumer>&>(*this); } |
174 | |
175 | static constexpr uint8_t tokenTBD = 0xFF; |
176 | |
177 | #ifdef NGEN_DEBUG |
178 | inline void dump() const; |
179 | #endif |
180 | }; |
181 | |
182 | template <bool consumer> |
183 | class DependencyTable { |
184 | enum { |
185 | ListTypeGRF = 0, // Lists of DependencyFragments filtered by GRF base register. |
186 | ListTypeToken = 1, // Lists of DependencyFragments filtered by token. |
187 | ListTypePipe = 2, // Lists of DependencyFragments filtered by (in-order) pipe. |
188 | // fragsByToken/fragsByPipe contain only one DependencyFragment per Dependency. |
189 | NListTypes = 3 |
190 | }; |
191 | |
192 | enum : uint32_t { |
193 | none = ~uint32_t(0) // Special value indicating end of list. |
194 | }; |
195 | |
196 | enum : int { |
197 | grfListIdxUnspecified = 256 // GRF list index for all unspecified regions. |
198 | }; |
199 | |
200 | struct DependencyFragment { |
201 | uint32_t depID; // Index of main Dependency struct in array. |
202 | uint8_t before, after; // # of consecutive fragments associated with the same Dependency |
203 | // before and after this one. |
204 | uint32_t prev[NListTypes]; // Previous pointers for doubly-linked lists. |
205 | uint32_t next[NListTypes]; // Next pointers for doubly-linked lists. |
206 | }; |
207 | |
208 | std::vector<Dependency<consumer>> deps; // List of all Dependencies (active or not) |
209 | std::vector<DependencyFragment> frags; // List of all DependencyFragments (active or not) |
210 | std::array<uint32_t, 257> heads[NListTypes]; // Heads of doubly-linked lists. |
211 | |
212 | static bool isHeadLink(uint32_t id) { return ((id & 0x80000000) != 0) && (id != none); } |
213 | static uint32_t readHeadLink(uint32_t id) { return id & 0x7FFFFFFF; } |
214 | static uint32_t makeHeadLink(uint32_t idx) { return idx | 0x80000000; } |
215 | |
216 | template <bool iconsumer> inline void findAndRemoveIntersections(int listType, int listIdx, const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove = true); |
217 | inline bool insertPrepare(int listType, int listIdx, Dependency<consumer> &dep, bool checkWeaker, bool checkStronger); |
218 | inline void insertLinkedList(int listType, int listIdx, int32_t fragID); |
219 | |
220 | template <bool iconsumer> static inline int getPipeIndex(const Dependency<iconsumer> &dep); |
221 | |
222 | public: |
223 | DependencyTable() { clear(); } |
224 | |
225 | inline void clear(); |
226 | inline void reserve(int icount); |
227 | inline bool insert(Dependency<consumer> &dep, bool checkWeaker = true, bool checkStronger = true); |
228 | inline bool insertWeak(Dependency<consumer> &dep) { return insert(dep, true, false); } |
229 | inline void insertStrong(const Dependency<consumer> &dep) { (void) insert(const_cast<Dependency<consumer> &>(dep), false, true); } |
230 | inline void remove(int fragID); |
231 | template <bool iconsumer> inline void findIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> &out); |
232 | template <bool iconsumer> inline void findAndRemoveIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove = true); |
233 | template <bool iconsumer> inline void removeIntersections(const Dependency<iconsumer> &dep); |
234 | inline uint32_t removeByTokenMask(uint32_t mask, bool dst); |
235 | inline uint32_t removeOOOWritesByRegion(const DependencyRegion ®ion); |
236 | |
237 | template <typename Func> inline void forEach(Func f) { for (auto &entry : deps) if (entry.active) f(entry); } |
238 | template <typename Func> inline void forEach(Func f) const { for (auto &entry : deps) if (entry.active) f(entry); } |
239 | |
240 | #ifdef NGEN_DEBUG |
241 | inline void dump() const; |
242 | #endif |
243 | }; |
244 | |
245 | struct SyncInsertion { |
246 | uint32_t inum; |
247 | SWSBInfo swsb; |
248 | SyncFunction fc; |
249 | uint32_t mask; // (allrd/allwr) 0 indicates no mask to be applied. |
250 | }; |
251 | |
252 | struct BasicBlock; |
253 | |
254 | struct BasicBlock { |
255 | uint32_t id; // index |
256 | int32_t label; // multipurpose flag for use in algorithms |
257 | uint32_t istart, iend; // instruction range: [istart, iend) |
258 | uint32_t wrdeps; // # of wrdep pseudo-instructions in this BB |
259 | std::array<uint32_t, NPipes> lengths; // # of instructions in each pipe in this BB |
260 | std::vector<BasicBlock *> pred, succ; // list of predecessor/successor BBs |
261 | DependencyTable<false> producers; // table of dependencies produced and consumed by this BB. |
262 | DependencyTable<true> consumers; // production table re-used for live incoming dependencies. |
263 | DependencyTable<false> incoming; // table of dependencies produced by prior BBs (temporary). |
264 | std::vector<SyncInsertion> syncs; // list of sync instructions to generate. |
265 | std::vector<std::array<DependencyRegion, 4>> opRegions; // cache of instruction operand regions. |
266 | |
267 | const DependencyRegion &getOperandRegion(int inum, int opNum) const { |
268 | return opRegions[inum - istart][opNum + 1]; |
269 | } |
270 | }; |
271 | |
272 | using BasicBlockList = std::vector<BasicBlock>; |
273 | |
274 | /*****************/ |
275 | /* Pipe Handling */ |
276 | /*****************/ |
277 | |
278 | // Get all pipes to track in-order dependencies on. |
279 | inline PipeMask allPipes(HW hw) |
280 | { |
281 | PipeMask mask = PipeMaskA | PipeMaskO; |
282 | if (hw >= HW::XeHP) mask |= PipeMaskF | PipeMaskI | PipeMaskL; |
283 | if (hw >= HW::XeHPC) mask |= PipeMaskM; |
284 | |
285 | return mask; |
286 | } |
287 | |
288 | // Get the execution pipe for an instruction. |
289 | template <typename Instruction> |
290 | inline GeneralizedPipe getPipe(HW hw, const Instruction &insn, bool checkOOO = true) |
291 | { |
292 | // Check jumps and no-ops |
293 | auto op = insn.opcode(); |
294 | if (isBranch(op) || op == Opcode::nop_gen12 || op == Opcode::sync || op == Opcode::illegal) |
295 | return GeneralizedPipe(); |
296 | |
297 | // Check OOO instructions. |
298 | if (isVariableLatency(hw, op)) { |
299 | if (!checkOOO) |
300 | return GeneralizedPipe(); |
301 | switch (op) { |
302 | case Opcode::dpas: |
303 | case Opcode::dpasw: |
304 | return GeneralizedPipe::Systolic(); |
305 | case Opcode::math: |
306 | return GeneralizedPipe::Math(); |
307 | case Opcode::send: |
308 | case Opcode::sendc: |
309 | return GeneralizedPipe(insn.sfid()); |
310 | default: |
311 | break; |
312 | } |
313 | } |
314 | |
315 | if (hw >= HW::XeHPC && (op == Opcode::math)) |
316 | return PipeMaskM; |
317 | |
318 | PipeMask mask = PipeMaskNone; |
319 | |
320 | // For SWSB purposes, Gen12LP has a single in-order pipe. |
321 | // Otherwise, in-order pipe determined by destination type. |
322 | // Exception: if there are any long operands, it's a long pipe instruction. |
323 | if (hw >= HW::XeHP) { |
324 | auto dt = insn.dstTypecode(); |
325 | unsigned lmask = (hw == HW::XeHPC) ? 0b1011 : 0b0011; // Note: assumes PVC-XT |
326 | if ((dt & lmask) == lmask) |
327 | mask |= PipeMaskL; |
328 | else if (dt & 8) |
329 | mask |= PipeMaskF; |
330 | else |
331 | mask |= PipeMaskI; |
332 | |
333 | if (!(mask & PipeMaskL)) { |
334 | if ((insn.src0Typecode() & lmask) == lmask) |
335 | mask = PipeMaskL; |
336 | else if ((insn.src1Typecode() & lmask) == lmask) |
337 | mask = PipeMaskL; |
338 | } |
339 | } else |
340 | mask = PipeMaskA; |
341 | return mask; |
342 | } |
343 | |
344 | template <typename Instruction> |
345 | inline PipeMask getPipeMask(HW hw, const Instruction &insn) |
346 | { |
347 | PipeMask pipe = getPipe(hw, insn, false).inOrderPipe(); |
348 | if (pipe != PipeMaskNone) |
349 | pipe |= PipeMaskA; |
350 | return pipe; |
351 | } |
352 | |
353 | PipeMask GeneralizedPipe::syncPipes(HW hw) const |
354 | { |
355 | if ((hw >= HW::XeHP) && (v & PipeMaskA)) |
356 | return allPipes(hw) & ~PipeMaskA & ~PipeMaskO; |
357 | return (v == PipeMaskNone) ? allPipes(hw) : inOrderPipe(); |
358 | } |
359 | |
360 | /**********************/ |
361 | /* Dependency Regions */ |
362 | /**********************/ |
363 | DependencyRegion::DependencyRegion(HW hw_, GRFRange r) |
364 | { |
365 | auto nmasks = int(masks.size()); |
366 | #ifdef NGEN_SAFE |
367 | if (r.isInvalid() || (r.getLen() > nmasks)) |
368 | throw invalid_region_exception(); |
369 | #endif |
370 | |
371 | hw = hw_; |
372 | unspecified = false; |
373 | checkWAW = false; |
374 | arf = false; |
375 | base = r.getBase(); |
376 | size = r.getLen(); |
377 | auto fullMask = ~uint32_t(0); |
378 | for (int i = 0; i < nmasks; i++) |
379 | masks[i] = (i < r.getLen()) ? fullMask : 0u; |
380 | } |
381 | |
382 | DependencyRegion::DependencyRegion(HW hw_, int esize, RegData rr) |
383 | { |
384 | const auto mbits = GRF::bytes(hw_); |
385 | const auto log2MBits = GRF::log2Bytes(hw_); |
386 | |
387 | hw = hw_; |
388 | base = rr.getBase(); |
389 | unspecified = false; |
390 | checkWAW = false; |
391 | arf = rr.isARF(); |
392 | |
393 | int hs = rr.getHS(), vs = rr.getVS(); |
394 | int nh = rr.getWidth(); |
395 | #ifdef NGEN_SAFE |
396 | if (nh == 0) nh = 1; |
397 | #endif |
398 | int nv = esize / nh; |
399 | int bytes = rr.getBytes(); |
400 | int off = rr.getByteOffset(); |
401 | |
402 | auto makeMask = [](int sz) -> uint64_t { |
403 | return (uint64_t(1) << sz) - 1; |
404 | }; |
405 | |
406 | auto compress = [&](uint64_t m) -> uint32_t { |
407 | if (hw_ >= HW::XeHPC) { |
408 | // Regions tracked at word granularity. OR and pack adjacent bits. |
409 | // If any sub-word writes, need to track WAW dependencies. |
410 | if ((m ^ (m >> 1)) & 0x5555555555555555) |
411 | checkWAW = true; |
412 | m = (m | (m >> 1)) & 0x5555555555555555; |
413 | m = (m | (m >> 1)) & 0x3333333333333333; |
414 | m = (m | (m >> 2)) & 0x0F0F0F0F0F0F0F0F; |
415 | m = (m | (m >> 4)) & 0x00FF00FF00FF00FF; |
416 | m = (m | (m >> 8)) & 0x0000FFFF0000FFFF; |
417 | m |= (m >> 16); |
418 | } |
419 | return uint32_t(m); |
420 | }; |
421 | |
422 | if (hs == 0) nh = hs = 1; |
423 | if (vs == 0) nv = 1; |
424 | hs *= bytes; |
425 | vs *= bytes; |
426 | |
427 | for (auto &m : masks) |
428 | m = 0; |
429 | |
430 | uint64_t hmask = makeMask(bytes) * (makeMask(nh * hs) / makeMask(hs)); |
431 | for (int j = 0; j < nv; j++) { |
432 | masks[off >> log2MBits] |= compress(hmask << (off & (mbits - 1))); |
433 | off += vs; |
434 | } |
435 | |
436 | size = ((off - vs) >> log2MBits) + 1; |
437 | } |
438 | |
439 | void DependencyRegion::intersect(const DependencyRegion &other) |
440 | { |
441 | if (arf != other.arf) { |
442 | clear(); |
443 | return; |
444 | } |
445 | |
446 | if (unspecified || other.unspecified) |
447 | return; |
448 | |
449 | int i, iOther; |
450 | for (i = 0, iOther = base - other.base; i < size; i++, iOther++) { |
451 | if (iOther >= 0 && iOther < other.size) |
452 | masks[i] &= other.masks[iOther]; |
453 | else |
454 | masks[i] = 0; |
455 | } |
456 | } |
457 | |
458 | // Check whether two regions overlap. |
459 | inline bool intersects(const DependencyRegion &dep1, const DependencyRegion &dep2) |
460 | { |
461 | // Check register file. |
462 | if (dep1.arf != dep2.arf) |
463 | return false; |
464 | |
465 | // Unspecified regions might always overlap. |
466 | if (dep1.unspecified || dep2.unspecified) |
467 | return true; |
468 | |
469 | // Quick check based on register bounds. |
470 | int diff = dep1.base - dep2.base; |
471 | if ((diff >= dep2.size) || (diff <= -dep1.size)) |
472 | return false; |
473 | |
474 | // Precise check. |
475 | int i1, i2; |
476 | for (i1 = 0, i2 = diff; i1 < dep1.size; i1++, i2++) |
477 | if (i2 >= 0 && i2 < dep2.size) |
478 | if (dep1.masks[i1] & dep2.masks[i2]) |
479 | return true; |
480 | |
481 | return false; |
482 | } |
483 | |
484 | void DependencyRegion::subtract(const DependencyRegion &other) |
485 | { |
486 | if (other.arf != arf) |
487 | return; |
488 | if (unspecified) |
489 | return; |
490 | if (other.unspecified) |
491 | clear(); |
492 | else { |
493 | int i, iOther; |
494 | for (i = 0, iOther = base - other.base; i < size; i++, iOther++) |
495 | if (iOther >= 0 && iOther < other.size) |
496 | masks[i] &= ~other.masks[iOther]; |
497 | } |
498 | } |
499 | |
500 | inline bool contains(const DependencyRegion &dep1, const DependencyRegion &dep2) |
501 | { |
502 | using mtype = decltype(DependencyRegion::masks)::value_type; |
503 | |
504 | if (dep1.arf != dep2.arf) return false; |
505 | if (dep1.unspecified) return true; |
506 | if (dep2.unspecified) return false; |
507 | |
508 | int i1, i2; |
509 | for (i1 = dep2.base - dep1.base, i2 = 0; i2 < dep2.size; i1++, i2++) { |
510 | mtype mask = (i1 >= 0 && i1 < dep1.size) ? dep1.masks[i1] : 0; |
511 | if (~mask && dep2.masks[i2]) |
512 | return false; |
513 | } |
514 | return true; |
515 | } |
516 | |
517 | // Check if an ARF type needs SWSB tracking. |
518 | inline bool trackableARF(ARFType type) |
519 | { |
520 | return (type == ARFType::acc || type == ARFType::a); |
521 | } |
522 | |
523 | // Distance in an in-order pipe after which a dependency can be ignored. |
524 | inline int timeout(GeneralizedPipe pipe) |
525 | { |
526 | switch (pipe.inOrderPipe()) { |
527 | case PipeMaskA: return 11; // Gen12LP |
528 | case PipeMaskI: return 11; |
529 | case PipeMaskF: return 11; |
530 | case PipeMaskL: return 15; |
531 | case PipeMaskM: return 19; |
532 | default: return std::numeric_limits<int>::max(); |
533 | } |
534 | } |
535 | |
536 | // Approximate upper bound on cycle count for an OOO instruction. |
537 | template <typename Instruction> |
538 | inline int estimateLatency(HW hw, const Instruction &insn) |
539 | { |
540 | switch (insn.opcode()) { |
541 | case Opcode::math: return (hw == HW::Gen12LP) ? 20 : 17; |
542 | case Opcode::dpas: |
543 | case Opcode::dpasw: return 20; // need correct value |
544 | case Opcode::send: |
545 | case Opcode::sendc: { |
546 | switch (insn.sfid()) { |
547 | case SharedFunction::dc0: |
548 | case SharedFunction::dc1: { |
549 | MessageDescriptor desc; |
550 | if (insn.getSendDesc(desc)) |
551 | if (desc.surface.index == 0xFE) |
552 | return (hw == HW::Gen12LP) ? 33 : 25; |
553 | return (hw == HW::Gen12LP) ? 106 : 150; |
554 | } |
555 | case SharedFunction::sampler: return (hw == HW::Gen12LP) ? 175 : 210; |
556 | default: return 50; |
557 | } |
558 | } |
559 | default: return 0; |
560 | } |
561 | } |
562 | |
563 | // Measure instruction distance between two Dependencies in a given pipe. |
564 | template <bool consumer1, bool consumer2> |
565 | inline int distance(const Dependency<consumer1> &dep1, const Dependency<consumer2> &dep2, GeneralizedPipe pipe) |
566 | { |
567 | auto ioPipe = pipe.inOrderPipe(); |
568 | |
569 | if (ioPipe == PipeMaskNone) |
570 | return 0; |
571 | |
572 | auto pidx = utils::log2(ioPipe); |
573 | return dep2.counters[pidx] - dep1.counters[pidx]; |
574 | } |
575 | |
576 | // Check whether two dependencies form a producer-consumer pair. |
577 | inline bool intersects(const Dependency<false> &dep1, const Dependency<true> &dep2) |
578 | { |
579 | if (!dep2.swsb) { |
580 | // Region-based dependency. First, quick check based on dependency type: |
581 | // RAR: ignore |
582 | // WAR/WAW: ignore if both instructions in same GeneralizedPipe (same in-order pipe, or sends with same SFID, etc.) |
583 | // If not ignorable, check: |
584 | // * If consumer is in-order, is that pipe still live (unsynchronized) in the producer? |
585 | // * If producer is in-order, is it close enough to require tracking the dependency? |
586 | // * Do the producer+consumer regions overlap? |
587 | if (dep1.read() && dep2.read()) return false; |
588 | if (!(dep1.write() && dep2.write() && (dep1.region.checkWAW || dep2.region.checkWAW))) |
589 | if (dep2.write() && (dep1.pipe == dep2.pipe) && (dep1.pipe != GeneralizedPipe::Math())) return false; |
590 | if (dep1.pipe.inOrder() && (distance(dep1, dep2, dep1.pipe) >= timeout(dep1.pipe))) return false; |
591 | if (dep2.region.arf && (dep2.read() || dep2.region.hw == HW::Gen12LP)) return false; |
592 | return intersects(dep1.region, dep2.region); |
593 | } else { |
594 | // SWSB dependency. |
595 | if (dep1.hasToken() && dep2.hasToken() && (dep1.token == dep2.token) && (dep1.tokenSrc || dep2.tokenDst)) |
596 | return true; |
597 | if (dep1.pipe.inOrder()) { |
598 | auto commonPipe = (dep1.pipe.inOrderPipe() | PipeMaskA) & dep2.depPipe; |
599 | if (commonPipe) |
600 | return (distance(dep1, dep2, dep1.pipe) >= dep2.dist); |
601 | } |
602 | return false; |
603 | } |
604 | } |
605 | |
606 | // Check whether two dependencies form a producer-consumer pair. |
607 | inline bool intersects(const Dependency<true> &dep1, const Dependency<false> &dep2) |
608 | { |
609 | return intersects(dep2, dep1); |
610 | } |
611 | |
612 | // Check whether one producer dependency implies another, without checking regions. |
613 | inline bool impliesWithoutRegion(const Dependency<false> &dep1, const Dependency<false> &dep2) |
614 | { |
615 | // Reads never imply writes. |
616 | if (dep2.write() && dep1.read()) |
617 | return false; |
618 | // Check pipes. |
619 | if (dep2.pipe != dep1.pipe) |
620 | return false; |
621 | if (dep2.hasToken()) { |
622 | // Token dependency: tokens must match. If tokens not assigned, instructions must match. |
623 | if (!dep1.hasToken()) |
624 | return false; |
625 | if (!dep1.tokenDst && dep2.tokenDst) |
626 | return false; |
627 | if (dep1.token != dep2.token) |
628 | return false; |
629 | if ((dep1.token == dep1.tokenTBD) && (dep1.inum() != dep2.inum())) |
630 | return false; |
631 | } |
632 | if (dep2.pipe.inOrder()) { |
633 | // Pipeline dependency: compare counters. |
634 | if (dep1.counters[PipeBitA] < dep2.counters[PipeBitA]) |
635 | return false; |
636 | auto pidx = utils::log2(dep2.pipe.inOrderPipe()); |
637 | if (dep1.counters[pidx] < dep2.counters[pidx]) |
638 | return false; |
639 | } |
640 | return true; |
641 | } |
642 | |
643 | // Check whether one consumer dependency implies another, without checking regions. |
644 | inline bool impliesWithoutRegion(const Dependency<true> &dep1, const Dependency<true> &dep2) |
645 | { |
646 | // Writes never imply reads. |
647 | if (dep2.read() && dep1.write()) return false; |
648 | |
649 | // Check pipes. |
650 | if (dep2.pipe != dep1.pipe) |
651 | return false; |
652 | if (dep2.depPipe != dep1.depPipe) |
653 | return false; |
654 | if (dep2.hasToken()) { |
655 | // Token dependency. |
656 | if (!dep1.hasToken()) |
657 | return false; |
658 | if (!dep1.tokenDst && dep2.tokenDst) |
659 | return false; |
660 | if (dep1.token != dep2.token) |
661 | return false; |
662 | } |
663 | if (dep2.pipe.inOrder()) { |
664 | // Pipeline dependency. Consumer dependencies are only compared |
665 | // within BBs, so it's enough to check the A counter. |
666 | // Note distance check not always valid for A@ consumers >= XeHP, |
667 | // but is never used in these cases. |
668 | if (dep2.counters[PipeBitA] < dep1.counters[PipeBitA]) |
669 | return false; |
670 | if (dep2.hasDist() != dep1.hasDist()) |
671 | return false; |
672 | if (dep2.hasDist()) |
673 | if (distance(dep1, dep2, dep2.pipe) - dep2.dist + dep1.dist < 0) |
674 | return false; |
675 | if (dep1.read() && dep2.write()) |
676 | return false; |
677 | } |
678 | return true; |
679 | } |
680 | |
681 | template <bool consumer> |
682 | void DependencyTable<consumer>::clear() |
683 | { |
684 | deps.clear(); |
685 | frags.clear(); |
686 | for (int l = 0; l < NListTypes; l++) |
687 | std::fill(heads[l].begin(), heads[l].end(), none); |
688 | } |
689 | |
690 | template <bool consumer> |
691 | void DependencyTable<consumer>::reserve(int icount) |
692 | { |
693 | icount *= 4; |
694 | deps.reserve(icount); |
695 | frags.reserve(icount * 4); |
696 | } |
697 | |
698 | template <bool consumer> |
699 | bool DependencyTable<consumer>::insertPrepare(int listType, int listIdx, Dependency<consumer> &dep, bool checkWeaker, bool checkStronger) |
700 | { |
701 | for (auto fragID = heads[listType][listIdx]; fragID != none;) { |
702 | auto &frag = frags[fragID]; |
703 | auto &entry = deps[frag.depID]; |
704 | |
705 | bool noRegions = (dep.region.unspecified && entry.region.unspecified); |
706 | |
707 | if (checkWeaker && impliesWithoutRegion(entry, dep)) { |
708 | if (noRegions) |
709 | return false; |
710 | dep.region.subtract(entry.region); |
711 | if (dep.region.empty()) |
712 | return false; |
713 | } |
714 | |
715 | if (checkStronger && impliesWithoutRegion(dep, entry)) { |
716 | entry.region.subtract(dep.region); |
717 | if (entry.region.empty() || noRegions) |
718 | remove(fragID); |
719 | } |
720 | |
721 | fragID = frag.next[listType]; |
722 | } |
723 | |
724 | return true; |
725 | } |
726 | |
727 | template <bool consumer> |
728 | void DependencyTable<consumer>::insertLinkedList(int listType, int listIdx, int32_t fragID) |
729 | { |
730 | auto &head = heads[listType][listIdx]; |
731 | auto &frag = frags[fragID]; |
732 | |
733 | frag.next[listType] = head; |
734 | frag.prev[listType] = makeHeadLink(listIdx); |
735 | if (head != none) |
736 | frags[head].prev[listType] = fragID; |
737 | head = fragID; |
738 | } |
739 | |
740 | template <bool consumer> |
741 | template <bool iconsumer> |
742 | int DependencyTable<consumer>::getPipeIndex(const Dependency<iconsumer> &dep) |
743 | { |
744 | auto checkPipe = iconsumer ? dep.depPipe : dep.pipe.inOrderPipe(); |
745 | |
746 | if (!checkPipe) |
747 | return -1; |
748 | |
749 | return utils::log2(checkPipe); |
750 | } |
751 | |
752 | // Insert dependency into table. |
753 | // If checkStronger set, remove any weaker existing dependencies. |
754 | // If checkWeaker set, the input dependency's region will be adjusted to remove |
755 | // overlapping stronger dependencies. If this dependency is already implied by the |
756 | // table, it will not be added. |
757 | // Return value indicates whether dependency added. |
758 | template <bool consumer> |
759 | bool DependencyTable<consumer>::insert(Dependency<consumer> &dep, bool checkWeaker, bool checkStronger) |
760 | { |
761 | bool toAdd = true; |
762 | int pidx = getPipeIndex(dep); |
763 | |
764 | bool checkToken = dep.hasToken() |
765 | && !(!consumer && dep.token == Dependency<consumer>::tokenTBD && !dep.region.unspecified); |
766 | |
767 | if (checkToken) |
768 | toAdd = toAdd && insertPrepare(ListTypeToken, dep.token, dep, checkWeaker, checkStronger); |
769 | else if (!dep.region.unspecified) { |
770 | for (int r = dep.region.base; r < dep.region.base + dep.region.size; r++) |
771 | toAdd = toAdd && insertPrepare(ListTypeGRF, r, dep, checkWeaker, checkStronger); |
772 | } else if (pidx >= 0) |
773 | toAdd = toAdd && insertPrepare(ListTypePipe, pidx, dep, checkWeaker, checkStronger); |
774 | |
775 | if (!toAdd) |
776 | return false; |
777 | |
778 | auto depID = int(deps.size()); |
779 | deps.push_back(dep); |
780 | |
781 | // Create fragments. |
782 | bool hasRegion = !dep.region.unspecified && (dep.region.size > 0); |
783 | int ridx = hasRegion ? dep.region.base : grfListIdxUnspecified; |
784 | int nfrags = hasRegion ? dep.region.size : 1; |
785 | auto fragID = int(frags.size()); |
786 | |
787 | DependencyFragment frag; |
788 | frag.before = 0; |
789 | frag.after = nfrags - 1; |
790 | frag.depID = depID; |
791 | for (int l = 0; l < NListTypes; l++) |
792 | frag.prev[l] = frag.next[l] = none; |
793 | |
794 | for (int o = 0; o < nfrags; o++, fragID++, frag.before++, frag.after--) { |
795 | frags.push_back(frag); |
796 | if (hasRegion || dep.region.unspecified) |
797 | insertLinkedList(ListTypeGRF, ridx++, fragID); |
798 | if (o > 0) |
799 | continue; |
800 | if (dep.hasToken()) |
801 | insertLinkedList(ListTypeToken, dep.token, fragID); |
802 | if (pidx >= 0) |
803 | insertLinkedList(ListTypePipe, pidx, fragID); |
804 | } |
805 | |
806 | return true; |
807 | } |
808 | |
809 | template <bool consumer> |
810 | void DependencyTable<consumer>::remove(int fragID) |
811 | { |
812 | auto &frag0 = frags[fragID]; |
813 | deps[frag0.depID].active = false; |
814 | |
815 | fragID -= frag0.before; |
816 | int nfrag = frag0.before + frag0.after + 1; |
817 | |
818 | for (int i = 0; i < nfrag; i++, fragID++) { |
819 | auto &frag = frags[fragID]; |
820 | |
821 | for (int l = 0; l < NListTypes; l++) { |
822 | if (isHeadLink(frag.prev[l])) |
823 | heads[l][readHeadLink(frag.prev[l])] = frag.next[l]; |
824 | else if (frag.prev[l] != none) |
825 | frags[frag.prev[l]].next[l] = frag.next[l]; |
826 | if (frag.next[l] != none) |
827 | frags[frag.next[l]].prev[l] = frag.prev[l]; |
828 | if (i > 0) |
829 | break; // Only GRF linked lists contain multiple fragments per dependency. |
830 | } |
831 | } |
832 | } |
833 | |
834 | // Find dependencies in the table intersecting the given dependency, and append them to the given list. |
835 | // NB: the resulting list may contain duplicate dependencies. |
836 | template <bool consumer> |
837 | template <bool iconsumer> |
838 | void DependencyTable<consumer>::findIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> &out) |
839 | { |
840 | findAndRemoveIntersections(dep, &out, false); |
841 | } |
842 | |
843 | template <bool consumer> |
844 | template <bool iconsumer> |
845 | void DependencyTable<consumer>::findAndRemoveIntersections(int listType, int listIdx, const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove) |
846 | { |
847 | for (auto fragID = heads[listType][listIdx]; fragID != none;) { |
848 | auto &frag = frags[fragID]; |
849 | auto &entry = deps[frag.depID]; |
850 | if (doRemove && !consumer && (distance(entry, dep, entry.pipe) >= timeout(entry.pipe))) |
851 | remove(fragID); |
852 | else if (intersects(dep, entry)) { |
853 | if (out != nullptr) |
854 | out->push_back(entry); |
855 | if (doRemove) |
856 | remove(fragID); |
857 | } |
858 | fragID = frag.next[listType]; |
859 | } |
860 | } |
861 | |
862 | // Find dependencies in the table intersecting the given dependency. |
863 | // Append them to the given list, and remove from table. |
864 | // Also checks for, and removes, timed-out producer dependencies. |
865 | template <bool consumer> |
866 | template <bool iconsumer> |
867 | void DependencyTable<consumer>::findAndRemoveIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove) |
868 | { |
869 | PipeMask checkPipe = PipeMaskNone; |
870 | bool checkToken = false; |
871 | bool checkRegion = !dep.region.empty(); |
872 | |
873 | if (iconsumer) { |
874 | if (dep.swsb) { |
875 | checkToken = true; |
876 | checkPipe = dep.depPipe; |
877 | checkRegion = false; |
878 | } |
879 | } else { |
880 | checkToken = true; |
881 | checkPipe = dep.pipe.inOrderPipe(); |
882 | } |
883 | |
884 | // Handle token dependencies. |
885 | if (checkToken && dep.hasToken() && dep.token != dep.tokenTBD) |
886 | findAndRemoveIntersections(ListTypeToken, dep.token, dep, out, doRemove); |
887 | |
888 | // Handle pipeline dependencies. |
889 | if (checkPipe & PipeMaskA) { |
890 | for (int pidx = 0; pidx < NPipes; pidx++) |
891 | findAndRemoveIntersections(ListTypePipe, pidx, dep, out, doRemove); |
892 | } else if (checkPipe != PipeMaskNone) { |
893 | int pidx = utils::log2(checkPipe); |
894 | findAndRemoveIntersections(ListTypePipe, pidx, dep, out, doRemove); |
895 | findAndRemoveIntersections(ListTypePipe, PipeBitA, dep, out, doRemove); |
896 | } |
897 | |
898 | // Handle GRF dependencies. |
899 | if (checkRegion) { |
900 | int base = dep.region.unspecified ? 0 : dep.region.base; |
901 | int len = dep.region.unspecified ? 256 : dep.region.size; |
902 | for (int r = base; r < base + len; r++) |
903 | findAndRemoveIntersections(ListTypeGRF, r, dep, out, doRemove); |
904 | findAndRemoveIntersections(ListTypeGRF, grfListIdxUnspecified, dep, out, doRemove); |
905 | } |
906 | } |
907 | |
908 | // Find dependencies in the table intersecting the given dependency, and remove them. |
909 | template <bool consumer> |
910 | template <bool iconsumer> |
911 | void DependencyTable<consumer>::removeIntersections(const Dependency<iconsumer> &dep) |
912 | { |
913 | findAndRemoveIntersections(dep, nullptr); |
914 | } |
915 | |
916 | // Remove dependencies from the table matching a token mask. |
917 | // Returns mask of unmatched tokens. |
918 | template <bool consumer> |
919 | uint32_t DependencyTable<consumer>::removeByTokenMask(uint32_t mask, bool dst) |
920 | { |
921 | uint32_t unmatched = mask; |
922 | |
923 | while (mask) { |
924 | uint32_t mask1 = mask & ~(mask & (mask - 1)); |
925 | mask &= ~mask1; |
926 | int token = utils::log2(mask1); |
927 | |
928 | for (auto fragID = heads[ListTypeToken][token]; fragID != none;) { |
929 | auto &frag = frags[fragID]; |
930 | auto &entry = deps[frag.depID]; |
931 | |
932 | if (entry.tokenSrc || (entry.tokenDst && dst)) { |
933 | unmatched &= ~mask1; |
934 | remove(fragID); |
935 | } |
936 | |
937 | fragID = frag.next[ListTypeToken]; |
938 | } |
939 | } |
940 | |
941 | return unmatched; |
942 | } |
943 | |
944 | // Remove OOO writes intersecting the given region. Return mask of token IDs for these writes. |
945 | template <bool consumer> |
946 | uint32_t DependencyTable<consumer>::removeOOOWritesByRegion(const DependencyRegion ®ion) |
947 | { |
948 | uint32_t removed = 0; |
949 | |
950 | if (!region.unspecified) { |
951 | for (int r = region.base; r < region.base + region.size; r++) { |
952 | for (auto fragID = heads[ListTypeGRF][r]; fragID != none;) { |
953 | auto &frag = frags[fragID]; |
954 | auto &entry = deps[frag.depID]; |
955 | |
956 | if (!entry.pipe.inOrder() && entry.write() && intersects(entry.region, region)) { |
957 | if (entry.token != entry.tokenTBD) |
958 | removed |= (1 << entry.token); |
959 | remove(fragID); |
960 | } |
961 | |
962 | fragID = frag.next[ListTypeGRF]; |
963 | } |
964 | } |
965 | } |
966 | |
967 | return removed; |
968 | } |
969 | |
970 | #ifdef NGEN_DEBUG |
971 | inline void dumpPipeMask(PipeMask mask, bool spacers = true) |
972 | { |
973 | if (spacers) { |
974 | std::cerr << char((mask & PipeMaskA) ? 'A' : ' '); |
975 | std::cerr << char((mask & PipeMaskF) ? 'F' : ' '); |
976 | std::cerr << char((mask & PipeMaskI) ? 'I' : ' '); |
977 | std::cerr << char((mask & PipeMaskL) ? 'L' : ' '); |
978 | std::cerr << char((mask & PipeMaskM) ? 'M' : ' '); |
979 | std::cerr << char((mask & PipeMaskO) ? 'O' : ' '); |
980 | } else { |
981 | if (mask & PipeMaskA) std::cerr << 'A'; |
982 | if (mask & PipeMaskF) std::cerr << 'F'; |
983 | if (mask & PipeMaskI) std::cerr << 'I'; |
984 | if (mask & PipeMaskL) std::cerr << 'L'; |
985 | if (mask & PipeMaskM) std::cerr << 'M'; |
986 | if (mask & PipeMaskO) std::cerr << 'O'; |
987 | if (mask == PipeMaskNone) std::cerr << '-'; |
988 | } |
989 | } |
990 | |
991 | void GeneralizedPipe::dump() const |
992 | { |
993 | switch (v & vTypeMask) { |
994 | case vInOrder: dumpPipeMask(inOrderPipe(), false); break; |
995 | case vSystolic: std::cerr << 'D'; break; |
996 | case vMath: std::cerr << 'M'; break; |
997 | case vSend: std::cerr << 'S' << int(v & 0xF); break; |
998 | default: std::cerr << '?'; break; |
999 | } |
1000 | } |
1001 | |
1002 | void DependencyRegion::dump() const |
1003 | { |
1004 | if (unspecified) |
1005 | std::cerr << "[no region]" ; |
1006 | else if (size == 0) |
1007 | std::cerr << "[zero size region]" ; |
1008 | else { |
1009 | std::cerr << "r" << int(base); |
1010 | if (size > 1) |
1011 | std::cerr << "-r" << int(base + size - 1); |
1012 | |
1013 | auto fullMask = ~uint32_t(0); |
1014 | bool partial = false; |
1015 | for (int ii = 0; ii < size; ii++) |
1016 | partial |= (masks[ii] != fullMask); |
1017 | |
1018 | if (partial) { |
1019 | std::cerr << " (" << std::hex; |
1020 | for (int ii = 0; ii < size; ii++) { |
1021 | if (masks[ii] != fullMask) |
1022 | std::cerr << std::setw(sizeof(masks[ii]) * 2) << masks[ii]; |
1023 | else |
1024 | std::cerr << "all" ; |
1025 | std::cerr << char((ii == (size - 1)) ? ')' : ' '); |
1026 | } |
1027 | std::cerr << std::dec; |
1028 | } |
1029 | } |
1030 | } |
1031 | |
1032 | template <bool consumer> |
1033 | void Dependency<consumer>::dump() const |
1034 | { |
1035 | if (tokenTime > 0) { |
1036 | std::cerr << '[' << counters[PipeBitA] << " + " << tokenTime; |
1037 | std::cerr << ',' << inum(); |
1038 | } else { |
1039 | std::cerr << '['; |
1040 | for (auto &counter : counters) |
1041 | std::cerr << counter << ','; |
1042 | pipe.dump(); |
1043 | } |
1044 | std::cerr << ']'; |
1045 | if (hasToken()) { |
1046 | std::cerr << " $" ; |
1047 | if (token == tokenTBD) |
1048 | std::cerr << '?'; |
1049 | else |
1050 | std::cerr << std::hex << int(token) << std::dec; |
1051 | if (tokenSrc && !tokenDst) |
1052 | std::cerr << ".src" ; |
1053 | else if (tokenDst && !tokenSrc) |
1054 | std::cerr << ".dst" ; |
1055 | else |
1056 | std::cerr << " " ; |
1057 | } else |
1058 | std::cerr << " " ; |
1059 | if (dist > 0) { |
1060 | dumpPipeMask(depPipe, false); |
1061 | std::cerr << '@' << int(dist); |
1062 | } else |
1063 | std::cerr << " " ; |
1064 | |
1065 | std::cerr << (rw ? " write " : " read " ); |
1066 | if (!region.unspecified) |
1067 | region.dump(); |
1068 | } |
1069 | |
1070 | template <bool consumer> |
1071 | void DependencyTable<consumer>::dump() const |
1072 | { |
1073 | std::cerr << (consumer ? "Consumers:\n" : "Producers:\n" ); |
1074 | for (size_t i = 0; i < deps.size(); i++) { |
1075 | if (!deps[i].active) |
1076 | continue; |
1077 | std::cerr << i << ":\t" ; |
1078 | deps[i].dump(); |
1079 | std::cerr << std::endl; |
1080 | } |
1081 | |
1082 | for (int l = 0; l < NListTypes; l++) { |
1083 | for (size_t i = 0; i < heads[l].size(); i++) { |
1084 | auto fragID = heads[l][i], lastFragID = makeHeadLink(i); |
1085 | if (fragID != none) { |
1086 | switch (l) { |
1087 | case ListTypeGRF: |
1088 | std::cerr << 'r'; |
1089 | if (i == grfListIdxUnspecified) |
1090 | std::cerr << '?'; |
1091 | else |
1092 | std::cerr << i; |
1093 | break; |
1094 | case ListTypeToken: |
1095 | std::cerr << '$'; |
1096 | if (i == Dependency<consumer>::tokenTBD) |
1097 | std::cerr << '?'; |
1098 | else |
1099 | std::cerr << i; |
1100 | break; |
1101 | case ListTypePipe: |
1102 | if (i > NPipes) |
1103 | std::cerr << '?'; |
1104 | else |
1105 | std::cerr << "AFILMO" [i % (NPipes + 1)]; |
1106 | break; |
1107 | } |
1108 | std::cerr << ":\t" ; |
1109 | while (fragID != none) { |
1110 | if (frags[fragID].prev[l] != lastFragID) |
1111 | std::cerr << "(bad last ptr) " ; |
1112 | std::cerr << frags[fragID].depID << " -> " ; |
1113 | lastFragID = fragID; |
1114 | fragID = frags[fragID].next[l]; |
1115 | } |
1116 | std::cerr << std::endl; |
1117 | } |
1118 | } |
1119 | } |
1120 | } |
1121 | #endif |
1122 | |
1123 | /*****************/ |
1124 | /* Main Routines */ |
1125 | /*****************/ |
1126 | |
1127 | template <typename Program> |
1128 | inline bool hasAutoSWSB(HW hw, const Program &program) |
1129 | { |
1130 | if (hw < HW::Gen12LP) |
1131 | return false; |
1132 | for (uint32_t n = 0; n < program.size(); n++) |
1133 | if (program[n].autoSWSB()) |
1134 | return true; |
1135 | return false; |
1136 | } |
1137 | |
1138 | // Get a list of basic blocks for this program. |
1139 | template <typename Program> |
1140 | inline BasicBlockList getBasicBlocks(HW hw, const Program &program) |
1141 | { |
1142 | auto icount = int(program.size()); |
1143 | |
1144 | // Create map from BB head instructions to instruction #s. |
1145 | std::map<int, int> heads; |
1146 | heads.insert({0, 0}); |
1147 | |
1148 | // Scan through program and find all fixed jump targets. These will |
1149 | // be the BB heads (first instruction in block). |
1150 | // Also check for instructions which end blocks. |
1151 | for (int n = 0; n < icount; n++) { |
1152 | const auto &insn = program[n]; |
1153 | int jip = -1, uip = -1; |
1154 | auto dests = insn.destinations(jip, uip); |
1155 | |
1156 | if (dests == DestNextIP) |
1157 | continue; |
1158 | |
1159 | #ifdef NGEN_DEBUG_BB |
1160 | std::cerr << "Instruction " << n << " ->" ; |
1161 | if (dests & DestNextIP) std::cerr << " " << n + 1; |
1162 | if (dests & DestJIP) std::cerr << " " << n + jip; |
1163 | if (dests & DestUIP) std::cerr << " " << n + uip; |
1164 | std::cerr << std::endl; |
1165 | #endif |
1166 | |
1167 | heads.insert({n + 1, 0}); |
1168 | if (dests & DestJIP) heads.insert({n + jip, 0}); |
1169 | if (dests & DestUIP) heads.insert({n + uip, 0}); |
1170 | } |
1171 | |
1172 | // Create basic blocks and remember mapping from instruction #s to BBs. |
1173 | auto bbCount = uint32_t(heads.size()); |
1174 | BasicBlockList list{bbCount}; |
1175 | |
1176 | int nextBB = 0; |
1177 | for (auto &head : heads) { |
1178 | auto istart = head.first; |
1179 | if (istart >= 0 && istart < icount) { |
1180 | head.second = nextBB; |
1181 | list[nextBB].id = nextBB; |
1182 | list[nextBB++].istart = istart; |
1183 | } |
1184 | } |
1185 | |
1186 | bbCount = nextBB; |
1187 | list.resize(bbCount); |
1188 | |
1189 | for (uint32_t i = 0; i < bbCount - 1; i++) |
1190 | list[i].iend = list[i + 1].istart; |
1191 | list[bbCount - 1].iend = icount; |
1192 | |
1193 | // Scan through basic blocks again. |
1194 | for (auto &bb : list) { |
1195 | // Count in-order instructions in each pipe, and wrdep pseudo-instructions. |
1196 | for (auto &l : bb.lengths) |
1197 | l = 0; |
1198 | bb.wrdeps = 0; |
1199 | |
1200 | for (uint32_t n = bb.istart; n < bb.iend; n++) { |
1201 | const auto &insn = program[n]; |
1202 | |
1203 | if (insn.opcode() == Opcode::wrdep) |
1204 | bb.wrdeps++; |
1205 | auto pipes = getPipeMask(hw, insn); |
1206 | for (int p = 0; p < NPipes; p++) |
1207 | if (pipes & (1 << p)) bb.lengths[p]++; |
1208 | } |
1209 | |
1210 | // Identify successor BBs from final instruction. |
1211 | auto ntail = bb.iend - 1; |
1212 | const auto &insn = program[ntail]; |
1213 | int jip = 0, uip = 0; |
1214 | auto dests = insn.destinations(jip, uip); |
1215 | |
1216 | auto addSuccessor = [&](int inum) { |
1217 | if ((inum >= 0) && (inum < icount)) bb.succ.push_back(&list[heads[inum]]); |
1218 | }; |
1219 | |
1220 | if (dests & DestNextIP) addSuccessor(bb.iend); |
1221 | if (dests & DestJIP) addSuccessor(jip + ntail); |
1222 | if (dests & DestUIP) addSuccessor(uip + ntail); |
1223 | |
1224 | // Add predecessor links to every successor. |
1225 | for (auto succ : bb.succ) |
1226 | succ->pred.push_back(&bb); |
1227 | |
1228 | // Preallocate dependency memory. |
1229 | bb.producers.reserve(bb.iend - bb.istart); |
1230 | bb.consumers.reserve(bb.iend - bb.istart); |
1231 | |
1232 | // Decode and cache operand regions. |
1233 | bb.opRegions.resize(bb.iend - bb.istart); |
1234 | for (uint32_t n = bb.istart; n < bb.iend; n++) { |
1235 | auto ®ions = bb.opRegions[n - bb.istart]; |
1236 | const auto &insn = program[n]; |
1237 | |
1238 | for (int srcN = -1; srcN < 3; srcN++) { |
1239 | regions[srcN + 1].hw = hw; |
1240 | if (!insn.getOperandRegion(regions[srcN + 1], srcN)) |
1241 | regions[srcN + 1].clear(); |
1242 | } |
1243 | } |
1244 | } |
1245 | |
1246 | return list; |
1247 | } |
1248 | |
1249 | template <typename Instruction> |
1250 | inline bool canDefaultPipe(HW hw, const Instruction &insn) |
1251 | { |
1252 | if (hw >= HW::XeHP && insn.opcode() == Opcode::mov_gen12 && (insn.dstTypecode() ^ insn.src0Typecode()) & 0x8) |
1253 | return false; |
1254 | if (hw >= HW::XeHPC && insn.dstTypecode() == 0xB /* :df */) |
1255 | return false; |
1256 | return true; |
1257 | } |
1258 | |
1259 | // Read SWSB from instruction and output: |
1260 | // * token dependency it produces, if any |
1261 | // * dependencies it consumes |
1262 | // * whether auto SWSB requested (bool return value) |
1263 | // Assumes pipe information for this instruction already set up in consume dependency. |
1264 | inline bool getSWSBDependencies(HW hw, const SWSBInfo &swsb, PipeMask defaultPipe, Dependency<false> &produce, Dependency<true> &consume) |
1265 | { |
1266 | produce.token = 0; |
1267 | produce.tokenSrc = false; |
1268 | produce.tokenDst = false; |
1269 | consume.depPipe = PipeMaskNone; |
1270 | consume.dist = 0; |
1271 | consume.token = 0; |
1272 | consume.tokenSrc = false; |
1273 | consume.tokenDst = false; |
1274 | consume.swsb = true; |
1275 | bool enableAutoSWSB = true; |
1276 | |
1277 | if (swsb.hasDist()) { |
1278 | auto pipe = swsb.getPipe(); |
1279 | consume.depPipe = (hw == HW::Gen12LP) ? PipeMaskA : |
1280 | (pipe == Pipe::Default) ? defaultPipe |
1281 | : toMask(pipe); |
1282 | if (consume.depPipe) { // if is here to ignore default pipe deps for OOO instructions. |
1283 | consume.dist = swsb.parts.dist; |
1284 | enableAutoSWSB = false; |
1285 | } |
1286 | } |
1287 | if (swsb.hasToken()) { |
1288 | consume.token = swsb.parts.token; |
1289 | consume.tokenSrc = swsb.parts.src; |
1290 | consume.tokenDst = swsb.parts.dst; |
1291 | if (swsb.hasTokenSet()) { |
1292 | produce.token = consume.token; |
1293 | produce.tokenSrc = consume.tokenSrc; |
1294 | produce.tokenDst = consume.tokenDst; |
1295 | } |
1296 | } |
1297 | |
1298 | return enableAutoSWSB; |
1299 | } |
1300 | |
1301 | template <typename Instruction> |
1302 | inline bool getSWSBDependencies(HW hw, const Instruction &insn, Dependency<false> &produce, Dependency<true> &consume) |
1303 | { |
1304 | bool autoSWSB = insn.autoSWSB(); |
1305 | autoSWSB &= getSWSBDependencies(hw, insn.swsb(), getPipe(hw, insn).inOrderPipe(), produce, consume); |
1306 | return autoSWSB; |
1307 | } |
1308 | |
1309 | // Encode SWSB information. |
1310 | template <typename Instruction> |
1311 | inline SWSBInfo encodeSWSB(HW hw, const Instruction &insn, const Dependency<false> &produce, const Dependency<true> &consume) |
1312 | { |
1313 | SWSBInfo swsb{}; |
1314 | |
1315 | if (produce.hasToken()) { |
1316 | swsb.parts.token = produce.token; |
1317 | swsb.parts.src = swsb.parts.dst = true; |
1318 | } else if (consume.tokenSrc) { |
1319 | swsb.parts.token = consume.token; |
1320 | swsb.parts.src = true; |
1321 | } else if (consume.tokenDst) { |
1322 | swsb.parts.token = consume.token; |
1323 | swsb.parts.dst = true; |
1324 | } |
1325 | |
1326 | if (consume.hasDist()) { |
1327 | if (canDefaultPipe(hw, insn) && ((hw == HW::Gen12LP) || (GeneralizedPipe(consume.depPipe) == consume.pipe))) |
1328 | swsb.setPipe(Pipe::Default); |
1329 | else |
1330 | swsb.setPipe(fromMask(consume.depPipe)); |
1331 | swsb.parts.dist = std::min<int>(consume.dist, 7); |
1332 | } |
1333 | |
1334 | return swsb; |
1335 | } |
1336 | |
1337 | // Check if ARF src/dst requires special handling |
1338 | inline bool arfNeedsSync(ARFType type) |
1339 | { |
1340 | return (type == ARFType::ce || type == ARFType::cr || type == ARFType::sr); |
1341 | } |
1342 | |
1343 | // Get preferred SBID for a given GRF. |
1344 | inline uint8_t preferredSBID(HW hw, uint8_t base) |
1345 | { |
1346 | if (hw >= HW::XeHPC) |
1347 | return (base >> 2) & 0x1F; |
1348 | return (base >> 3) & 0xF; |
1349 | } |
1350 | |
1351 | // Choose SBID for an OOO instruction, based on preceding OOO instructions. |
1352 | template <typename Program> |
1353 | inline uint8_t chooseSBID(HW hw, Program &program, const BasicBlock &bb, int32_t inum, int32_t counterA, const DependencyTable<false> &incoming, const DependencyTable<false> &producers, uint32_t maskDst) |
1354 | { |
1355 | uint32_t unclaimed = (uint64_t(1) << tokenCount(hw)) - 1; |
1356 | std::array<int32_t, 32> pastExpiration; |
1357 | constexpr int32_t infinite = std::numeric_limits<int32_t>::max(); |
1358 | |
1359 | // Priority 1: choose SBID that is an explicit dst dependency for this instruction, if any. |
1360 | if (maskDst) |
1361 | return utils::bsf(maskDst); |
1362 | |
1363 | // Otherwise, look through incoming OOO producers and accumulate most recent use of each token. |
1364 | for (auto &dist : pastExpiration) dist = infinite; |
1365 | |
1366 | auto accumulateTokens = [&](const Dependency<false> &dep) { |
1367 | if (!dep.hasToken()) return; |
1368 | |
1369 | auto depSWSB = program[dep.inum()].swsb(); |
1370 | if (!depSWSB.hasTokenSet()) return; |
1371 | |
1372 | auto token = depSWSB.parts.token; |
1373 | unclaimed &= ~(1 << token); |
1374 | |
1375 | int32_t pe = counterA - (dep.counters[PipeBitA] + dep.tokenTime); |
1376 | pastExpiration[token] = std::min<int32_t>(pastExpiration[token], pe); |
1377 | }; |
1378 | |
1379 | incoming.forEach(accumulateTokens); |
1380 | producers.forEach(accumulateTokens); |
1381 | |
1382 | int32_t bestPE = std::numeric_limits<int32_t>::min(); |
1383 | uint8_t bestPESBID = 0; |
1384 | for (int token = 0; token < tokenCount(hw); token++) { |
1385 | if (pastExpiration[token] > bestPE) { |
1386 | bestPE = pastExpiration[token]; |
1387 | bestPESBID = token; |
1388 | } |
1389 | } |
1390 | |
1391 | // Priority 2: assign SBID based on base register of dst, src1, src0 (in that order), |
1392 | // if it's unclaimed or expired. |
1393 | for (int opNum : {-1, 1, 0}) { |
1394 | auto ®ion = bb.getOperandRegion(inum, opNum); |
1395 | if (region.size > 0) { |
1396 | auto sbid = preferredSBID(hw, region.base); |
1397 | if (pastExpiration[sbid] >= 0) |
1398 | return sbid; |
1399 | } |
1400 | } |
1401 | |
1402 | // Priority 3: choose highest-numbered unclaimed SBID. |
1403 | if (unclaimed) |
1404 | return utils::bsr(unclaimed); |
1405 | |
1406 | // Priority 4: choose token that's longest expired or closest to expiring. |
1407 | return bestPESBID; |
1408 | } |
1409 | |
1410 | // Main dependency analysis. |
1411 | // This is run three times on every BB. |
1412 | // Phase 0 |
1413 | // Generate dependency tables for SBID assignment: |
1414 | // - produced OOO dependencies: outgoing dependencies from this BB (w/o final SBIDs) |
1415 | // - consumed dependencies: incoming dependencies this BB must synchronize on |
1416 | // Phase 1 |
1417 | // Input: |
1418 | // - incoming OOO dependencies, with expirations. |
1419 | // Output: |
1420 | // - produced dependencies: outgoing dependencies this BB creates and does not synchronize on |
1421 | // - consumed dependencies: incoming dependencies this BB must synchronize on |
1422 | // - SBIDs assigned where needed. |
1423 | // Instructions whose dependencies are all inside this BB are scoreboarded now for efficiency. |
1424 | // Phase 2 |
1425 | // Input: complete list of live dependencies. |
1426 | // All unscoreboarded instructions are reanalyzed and scoreboarded now. |
1427 | template <typename Program> |
1428 | inline void analyze(HW hw, Program &program, BasicBlock &bb, int phase) |
1429 | { |
1430 | const bool final = (phase == 2); |
1431 | const bool computeSWSB = (phase > 0); |
1432 | bool forceA1 = false; |
1433 | int inumChain = -1; |
1434 | uint32_t chainTokenMaskSrc = 0, chainTokenMaskDst = 0, chainTokenMaskDstX = 0; |
1435 | uint32_t wrdepTokenMaskDst = 0; |
1436 | Dependency<true> chainGenerated; |
1437 | std::array<int32_t, NPipes> counters; |
1438 | std::vector<Dependency<false>> depList, depListIncoming, chainProducers; |
1439 | |
1440 | // Initialize "preconsumes." These are region-less consumes arising from SWSB. |
1441 | int noPreconsume = std::numeric_limits<int>::min(); |
1442 | std::array<std::array<int, NPipes + 1>, NPipes> preconsumeIO; |
1443 | uint32_t preconsumeTokenSrc = 0, preconsumeTokenDst = 0; |
1444 | |
1445 | auto recordIOPreconsumes = [&](Dependency<true> &generated) { |
1446 | if ((phase == 1) && generated.hasDist()) { |
1447 | auto spipes = generated.pipe.syncPipes(hw); |
1448 | auto dpipes = GeneralizedPipe(generated.depPipe).syncPipes(hw); |
1449 | for (int dpidx = 0; dpidx < NPipes; dpidx++) |
1450 | if (dpipes & (1 << dpidx)) |
1451 | for (int pidx = 0; pidx <= NPipes; pidx++) |
1452 | if (spipes & (1 << pidx)) |
1453 | preconsumeIO[dpidx][pidx] = std::max<int>(preconsumeIO[dpidx][pidx], counters[dpidx] - generated.dist); |
1454 | } |
1455 | }; |
1456 | |
1457 | if (phase == 1) |
1458 | for (auto &pcList : preconsumeIO) |
1459 | for (auto &pc : pcList) |
1460 | pc = noPreconsume; |
1461 | |
1462 | // Initialize counters. |
1463 | for (auto &counter : counters) |
1464 | counter = 0; |
1465 | |
1466 | for (uint32_t inum = bb.istart; inum < bb.iend; inum++) { |
1467 | auto &insn = program[inum]; |
1468 | bool forceA1Next = false; |
1469 | bool atChainStart = false; |
1470 | |
1471 | // Ignore illegal instructions. |
1472 | if (insn.opcode() == Opcode::illegal) |
1473 | continue; |
1474 | |
1475 | // Process wrdep pseudo-instructions, which add OOO write dependencies on the next real instruction. |
1476 | if (insn.opcode() == Opcode::wrdep) { |
1477 | auto ®ion = bb.getOperandRegion(inum, 0); |
1478 | if (!region.empty()) |
1479 | wrdepTokenMaskDst |= bb.producers.removeOOOWritesByRegion(region); |
1480 | continue; |
1481 | } |
1482 | |
1483 | // Placeholder for dependency consumers from this instruction's operands. |
1484 | Dependency<true> consumeOp; |
1485 | consumeOp.counters = counters; |
1486 | consumeOp.pipe = getPipe(hw, insn); |
1487 | |
1488 | // Read SWSB information for this instruction, if already present. |
1489 | Dependency<false> tokenInfo; |
1490 | Dependency<true> generated = consumeOp; |
1491 | bool autoSWSB = getSWSBDependencies(hw, insn, tokenInfo, generated); |
1492 | |
1493 | // Check for beginning of {Atomic} chain. |
1494 | if (insn.atomic() && inumChain < 0) { |
1495 | inumChain = inum; |
1496 | atChainStart = true; |
1497 | } |
1498 | |
1499 | // If token assigned, start by removing all live dependencies with this token. |
1500 | if (tokenInfo.hasToken()) { |
1501 | bb.producers.removeByTokenMask(1 << tokenInfo.token, true); |
1502 | preconsumeTokenSrc |= (1 << tokenInfo.token); |
1503 | preconsumeTokenDst |= (1 << tokenInfo.token); |
1504 | } else if (isVariableLatency(hw, insn.opcode())) { |
1505 | generated.token = tokenInfo.token = tokenInfo.tokenTBD; |
1506 | tokenInfo.tokenSrc = tokenInfo.tokenDst = true; |
1507 | } |
1508 | |
1509 | // For sync.allrd/sync.allwr, consume matching dependencies and add preconsumes |
1510 | // for unmatched tokens. |
1511 | if (insn.opcode() == Opcode::sync) { |
1512 | auto fc = insn.syncFC(); |
1513 | bool allrd = (fc == SyncFunction::allrd); |
1514 | bool allwr = (fc == SyncFunction::allwr); |
1515 | |
1516 | if (allrd || allwr) { |
1517 | uint32_t imm; |
1518 | if (!insn.getImm32(imm)) |
1519 | imm = ~0; |
1520 | |
1521 | auto unmatched = bb.producers.removeByTokenMask(imm, allwr); |
1522 | preconsumeTokenSrc |= unmatched; |
1523 | if (allwr) preconsumeTokenDst |= unmatched; |
1524 | } |
1525 | } |
1526 | |
1527 | // Grab pre-decoded operand regions for this instruction. |
1528 | auto ®ions = bb.opRegions[inum - bb.istart]; |
1529 | |
1530 | // Check for cr/ce/sr destination operand, and force A@1 on the next instruction. |
1531 | ARFType dstARFType; |
1532 | forceA1Next |= (insn.getARFType(dstARFType, -1) && arfNeedsSync(dstARFType)); |
1533 | |
1534 | if (autoSWSB) { |
1535 | // If auto-SWSB has been requested for this instruction, analyze its source operands. |
1536 | // Start a list of dependencies for this instruction. |
1537 | depList.clear(); |
1538 | depListIncoming.clear(); |
1539 | bool foundAllDeps = true; |
1540 | uint32_t tokenMaskSrc = 0, tokenMaskDst = 0, tokenMaskDstX = 0; |
1541 | SWSBInfo syncSWSB; |
1542 | |
1543 | if (!atChainStart && (inumChain >= 0)) { |
1544 | tokenMaskSrc = chainTokenMaskSrc; |
1545 | tokenMaskDst = chainTokenMaskDst; |
1546 | tokenMaskDstX = chainTokenMaskDstX; |
1547 | generated = chainGenerated; |
1548 | } |
1549 | |
1550 | // Add in OOO dst dependencies from previous wrdep(s). |
1551 | tokenMaskDst |= wrdepTokenMaskDst; |
1552 | wrdepTokenMaskDst = 0; |
1553 | |
1554 | // Jumps with unknown destination: preconsume all dependencies. |
1555 | if (inum == (bb.iend - 1)) { |
1556 | int jip, uip; |
1557 | if (insn.destinations(jip, uip) & DestUnknown) { |
1558 | tokenMaskDst = preconsumeTokenDst = (uint64_t(1) << tokenCount(hw)) - 1; |
1559 | for (auto &p : preconsumeIO[PipeBitA]) |
1560 | p = 0; |
1561 | bb.producers.clear(); |
1562 | bb.consumers.clear(); |
1563 | syncSWSB = (hw == HW::Gen12LP) ? SWSB(1) : SWSB<AllPipes>(1); |
1564 | } |
1565 | } |
1566 | |
1567 | // Check if we need to assign an SBID to this instruction. |
1568 | bool assignSBID = (phase == 1) && isVariableLatency(hw, insn.opcode()) && (tokenInfo.token == tokenInfo.tokenTBD) && !insn.atomic(); |
1569 | |
1570 | // Analyze operands. |
1571 | for (int srcN = 2; srcN >= -1; srcN--) { |
1572 | // Skip non-GRF operands. |
1573 | // Special case: check for cr/sr/ce source operands and force A@1 if any. |
1574 | if (regions[srcN + 1].empty()) { |
1575 | ARFType arfType; |
1576 | if ((srcN >= 0) && insn.getARFType(arfType, srcN) && arfNeedsSync(arfType)) { |
1577 | generated.depPipe = PipeMaskA; |
1578 | generated.dist = 1; |
1579 | } |
1580 | continue; |
1581 | } |
1582 | |
1583 | // Create associated dependency consumer. |
1584 | consumeOp.rw = (srcN < 0); |
1585 | consumeOp.region = regions[srcN + 1]; |
1586 | |
1587 | // Remove all intersecting live producers from the table and save them. |
1588 | auto dStart = depList.size(); |
1589 | bb.producers.findAndRemoveIntersections(consumeOp, &depList); |
1590 | auto dEnd = depList.size(); |
1591 | |
1592 | // Do the same for the incoming producers table if we need to assign an SBID. |
1593 | size_t dStartIncoming = 0, dEndIncoming = 0; |
1594 | if (assignSBID) { |
1595 | dStartIncoming = depListIncoming.size(); |
1596 | bb.incoming.findAndRemoveIntersections(consumeOp, &depListIncoming); |
1597 | dEndIncoming = depListIncoming.size(); |
1598 | } |
1599 | |
1600 | // If not final, subtract each of them from original dependency region. |
1601 | // If anything remains, add to consumer table. If it is not implied |
1602 | // by existing consumers, we didn't find all dependencies. |
1603 | if (!final) { |
1604 | for (auto d = dStart; d < dEnd; d++) |
1605 | consumeOp.region.subtract(depList[d].region); |
1606 | if (!consumeOp.region.empty()) |
1607 | foundAllDeps &= !bb.consumers.insertWeak(consumeOp); |
1608 | } |
1609 | |
1610 | // Add dependencies to SWSB. |
1611 | if (computeSWSB) for (auto d = dStart; d < dEnd; d++) { |
1612 | auto &dep = depList[d]; |
1613 | if (dep.pipe.inOrder()) { |
1614 | // Accumulate in-order dependencies. |
1615 | auto thisPipe = dep.pipe.inOrderPipe(); |
1616 | auto thisDist = distance(dep, generated, thisPipe); |
1617 | |
1618 | if (generated.depPipe == PipeMaskNone) |
1619 | generated.depPipe = thisPipe; |
1620 | else if (generated.depPipe != thisPipe) |
1621 | generated.depPipe = PipeMaskA; |
1622 | |
1623 | if (generated.hasDist()) |
1624 | generated.dist = std::min<int>(generated.dist, thisDist); |
1625 | else |
1626 | generated.dist = thisDist; |
1627 | } else if (dep.token != dep.tokenTBD) { |
1628 | // Remember out-of-order dependencies for later. |
1629 | if (dep.tokenSrc) tokenMaskSrc |= (1 << dep.token); |
1630 | if (dep.tokenDst) tokenMaskDst |= (1 << dep.token); |
1631 | } |
1632 | } |
1633 | |
1634 | // Also collect incoming SBIDs if choosing an SBID. |
1635 | tokenMaskDstX = tokenMaskDst; |
1636 | if (assignSBID) for (auto d = dStartIncoming; d < dEndIncoming; d++) { |
1637 | auto &dep = depListIncoming[d]; |
1638 | if (!dep.tokenDst) continue; |
1639 | if (dep.token != dep.tokenTBD) |
1640 | tokenMaskDstX |= (1 << dep.token); |
1641 | else { |
1642 | // Check SWSB again in case it was recently assigned. |
1643 | auto curSWSB = program[dep.inum()].swsb(); |
1644 | if (curSWSB.hasTokenSet()) |
1645 | tokenMaskDstX |= (1 << curSWSB.parts.token); |
1646 | } |
1647 | } |
1648 | } |
1649 | |
1650 | // Transfer dependencies down the {Atomic} chain (will be put later on first instruction). |
1651 | if (insn.atomic()) { |
1652 | chainTokenMaskSrc = tokenMaskSrc; |
1653 | chainTokenMaskDst = tokenMaskDst; |
1654 | chainTokenMaskDstX = tokenMaskDstX; |
1655 | chainGenerated = generated; |
1656 | tokenMaskSrc = tokenMaskDst = 0; |
1657 | generated = consumeOp; |
1658 | } |
1659 | |
1660 | // Always wait until phase 2 to assign SWSB to {Atomic} chains -- |
1661 | // it's not known if all dependencies for the chain have been found until the end. |
1662 | if (inumChain >= 0 || insn.atomic()) |
1663 | foundAllDeps = false; |
1664 | |
1665 | // If token missing on OOO instruction, assign one during phase 1. |
1666 | if (assignSBID) { |
1667 | auto newToken = chooseSBID(hw, program, bb, inum, counters[PipeBitA], bb.incoming, bb.producers, tokenMaskDstX); |
1668 | generated.token = tokenInfo.token = newToken; |
1669 | generated.tokenSrc = generated.tokenDst = true; |
1670 | insn.setSWSB(SBID(generated.token).set); |
1671 | preconsumeTokenSrc |= (1 << tokenInfo.token); |
1672 | preconsumeTokenDst |= (1 << tokenInfo.token); |
1673 | tokenMaskSrc &= ~(1 << tokenInfo.token); |
1674 | tokenMaskDst &= ~(1 << tokenInfo.token); |
1675 | } |
1676 | |
1677 | // Finalize SWSB computation. |
1678 | if (computeSWSB) { |
1679 | bool recordSWSB = (final || foundAllDeps); |
1680 | bool tokenAssigned = tokenInfo.hasToken() && (tokenInfo.token != tokenInfo.tokenTBD); |
1681 | |
1682 | // If last instruction forced A@1, enforce now. |
1683 | if (forceA1) { |
1684 | generated.depPipe = PipeMaskA; |
1685 | generated.dist = 1; |
1686 | if (tokenMaskSrc || tokenMaskDst) { |
1687 | bb.producers.removeIntersections(generated); |
1688 | generated.depPipe = PipeMaskNone; |
1689 | generated.dist = 0; |
1690 | auto swsb = (hw == HW::Gen12LP) ? SWSB(1) : SWSB<AllPipes>(1); |
1691 | if (recordSWSB) |
1692 | bb.syncs.push_back({uint32_t(inum), swsb, SyncFunction::nop, 0}); |
1693 | } |
1694 | } |
1695 | |
1696 | // If dual dependency (token + pipe) on OOO instruction, use A pipe for send, sync for others. |
1697 | if ((generated.hasToken() || tokenAssigned) && generated.hasDist()) { |
1698 | if (insn.opcode() == Opcode::send || insn.opcode() == Opcode::sendc) { |
1699 | if (!(hw >= HW::XeHPC && (generated.depPipe == PipeMaskI || generated.depPipe == PipeMaskF))) |
1700 | generated.depPipe = PipeMaskA; |
1701 | } else { |
1702 | auto distGen = generated; |
1703 | distGen.tokenSrc = distGen.tokenDst = false; |
1704 | syncSWSB = encodeSWSB(hw, insn, Dependency<false>(), distGen); |
1705 | generated.dist = 0; |
1706 | generated.depPipe = PipeMaskNone; |
1707 | } |
1708 | } |
1709 | |
1710 | // Handle OOO shootdown. Unless predicate is (W), it's possible that our token won't be claimed. |
1711 | // In this case, add sync on our token as a precaution. TODO: should check producer table. |
1712 | if (tokenAssigned && (insn.predicated() || inumChain >= 0)) |
1713 | tokenMaskDst |= (1 << tokenInfo.token); |
1714 | |
1715 | // Handle OOO dependencies. |
1716 | // - dst implies src |
1717 | // - use SWSB to mark src/dst w/o dist (in-order or no token) or dst + dist (in-order only, same pipe) |
1718 | // - add sync for any remaining dependencies. |
1719 | tokenMaskSrc &= ~tokenMaskDst; |
1720 | |
1721 | bool defaultPipe = generated.pipe.inOrder() && (generated.depPipe == generated.pipe.inOrderPipe()) |
1722 | && canDefaultPipe(hw, insn); |
1723 | |
1724 | bool acceptsSrc = false, acceptsDst = false; |
1725 | if (generated.pipe.inOrder() || !tokenAssigned) { |
1726 | if (hw >= HW::XeHPC) { |
1727 | acceptsSrc = (generated.depPipe == PipeMaskNone || defaultPipe); |
1728 | acceptsDst = acceptsSrc || (generated.depPipe == PipeMaskA); |
1729 | } else { |
1730 | acceptsSrc = (generated.depPipe == PipeMaskNone); |
1731 | acceptsDst = acceptsSrc || defaultPipe; |
1732 | } |
1733 | } |
1734 | |
1735 | if (tokenMaskDst && acceptsDst) { |
1736 | generated.token = utils::bsr(tokenMaskDst); |
1737 | generated.tokenDst = true; |
1738 | tokenMaskDst &= ~(1 << generated.token); |
1739 | } else if (tokenMaskSrc && acceptsSrc) { |
1740 | generated.token = utils::bsr(tokenMaskSrc); |
1741 | generated.tokenSrc = true; |
1742 | tokenMaskSrc &= ~(1 << generated.token); |
1743 | } |
1744 | |
1745 | bool oneSrc = tokenMaskSrc && utils::is_zero_or_pow2(tokenMaskSrc); |
1746 | bool oneDst = tokenMaskDst && utils::is_zero_or_pow2(tokenMaskDst); |
1747 | bool oneSrcSWSB = false, oneDstSWSB = false; |
1748 | auto inumSync = (inumChain >= 0) ? inumChain : inum; |
1749 | |
1750 | if (syncSWSB.empty()) { |
1751 | if (oneSrc) { |
1752 | syncSWSB = SBID(utils::bsr(tokenMaskSrc)).src; |
1753 | oneSrcSWSB = true; |
1754 | } else if (oneDst) { |
1755 | syncSWSB = SBID(utils::bsr(tokenMaskDst)).dst; |
1756 | oneDstSWSB = true; |
1757 | } |
1758 | } |
1759 | if (tokenMaskSrc && !oneSrcSWSB) { |
1760 | if (recordSWSB) |
1761 | bb.syncs.push_back({uint32_t(inumSync), syncSWSB, SyncFunction::allrd, tokenMaskSrc}); |
1762 | syncSWSB = SWSBInfo(); |
1763 | } |
1764 | if (tokenMaskDst && !oneDstSWSB) { |
1765 | if (recordSWSB) |
1766 | bb.syncs.push_back({uint32_t(inumSync), syncSWSB, SyncFunction::allwr, tokenMaskDst}); |
1767 | syncSWSB = SWSBInfo(); |
1768 | } |
1769 | if (!syncSWSB.empty() && recordSWSB) |
1770 | bb.syncs.push_back({uint32_t(inumSync), syncSWSB, SyncFunction::nop, 0}); |
1771 | |
1772 | // If final or nothing added to consumer table, assign SWSB. |
1773 | // For {Atomic} chains, put SWSB for consumed dependencies at head of chain. |
1774 | if (recordSWSB) { |
1775 | if (inumChain >= 0) { |
1776 | if (!insn.atomic()) { |
1777 | program[inumChain].setSWSB(encodeSWSB(hw, insn, Dependency<false>(), generated)); |
1778 | insn.setSWSB(encodeSWSB(hw, insn, tokenInfo, Dependency<true>())); |
1779 | } |
1780 | } else |
1781 | insn.setSWSB(encodeSWSB(hw, insn, tokenInfo, generated)); |
1782 | insn.clearAutoSWSB(); |
1783 | } |
1784 | |
1785 | // After assigning SWSB to in-order instructions, clean producer list of known SWSB and sync dependencies. |
1786 | if (tokenMaskSrc) bb.producers.removeByTokenMask(tokenMaskSrc, false); |
1787 | if (tokenMaskDst) bb.producers.removeByTokenMask(tokenMaskDst, true); |
1788 | bb.producers.removeIntersections(generated); |
1789 | } |
1790 | } else { |
1791 | // SWSB specified. Consume any dependencies associated with this SWSB. |
1792 | bb.producers.removeIntersections(generated); |
1793 | |
1794 | // Record token dependencies for populating the consumer table. |
1795 | if (!final) { |
1796 | if (generated.tokenSrc) preconsumeTokenSrc |= (1 << tokenInfo.token); |
1797 | if (generated.tokenDst) preconsumeTokenDst |= (1 << tokenInfo.token); |
1798 | } |
1799 | |
1800 | // Consume destination dependencies too. |
1801 | if (!regions[0].empty()) { |
1802 | consumeOp.region = regions[0]; |
1803 | consumeOp.rw = true; |
1804 | bb.producers.removeIntersections(consumeOp); |
1805 | } |
1806 | |
1807 | // Absorb wrdeps. |
1808 | wrdepTokenMaskDst = 0; |
1809 | |
1810 | // Clear auto-SWSB bit if it was set. |
1811 | if (phase == 2) |
1812 | insn.clearAutoSWSB(); |
1813 | |
1814 | // Check for prior sync insertions and update tables appropriately. |
1815 | if (phase == 2) { |
1816 | for (const auto &sync: bb.syncs) { |
1817 | if (sync.inum != inum) |
1818 | continue; |
1819 | |
1820 | bool allrd = (sync.fc == SyncFunction::allrd); |
1821 | bool allwr = (sync.fc == SyncFunction::allwr); |
1822 | |
1823 | if (allrd || allwr) { |
1824 | auto unmatched = bb.producers.removeByTokenMask(sync.mask, allwr); |
1825 | preconsumeTokenSrc |= unmatched; |
1826 | if (allwr) preconsumeTokenDst |= unmatched; |
1827 | } |
1828 | |
1829 | if (!sync.swsb.empty()) { |
1830 | Dependency<false> produce; |
1831 | Dependency<true> consume; |
1832 | (void) getSWSBDependencies(hw, sync.swsb, PipeMaskNone, produce, consume); |
1833 | bb.producers.removeIntersections(consume); |
1834 | if (consume.tokenSrc) preconsumeTokenSrc |= (1 << consume.token); |
1835 | if (consume.tokenDst) preconsumeTokenDst |= (1 << consume.token); |
1836 | } |
1837 | } |
1838 | } |
1839 | } |
1840 | |
1841 | // First pass: record pipeline SWSB dependencies for later entry into consumer table. |
1842 | recordIOPreconsumes(generated); |
1843 | |
1844 | // Add producer dependencies for all operands. |
1845 | // Also record instruction number and token timeout. |
1846 | // During phase 0, only do this for OOO instructions, and if dst not null, only dst. |
1847 | if ((phase > 0) || tokenInfo.hasToken()) { |
1848 | auto produceOp = consumeOp.cast(); |
1849 | if (tokenInfo.hasToken()) { |
1850 | produceOp.token = tokenInfo.token; |
1851 | produceOp.tokenTime = estimateLatency(hw, insn); |
1852 | produceOp.inum() = inum; |
1853 | } |
1854 | |
1855 | for (int srcN = -1; srcN < 3; srcN++) { |
1856 | if (!regions[srcN + 1].empty()) { |
1857 | produceOp.rw = (srcN < 0); |
1858 | if (tokenInfo.hasToken()) { |
1859 | produceOp.tokenSrc = (srcN >= 0); |
1860 | produceOp.tokenDst = (srcN < 0); |
1861 | } |
1862 | produceOp.region = regions[srcN + 1]; |
1863 | if (insn.atomic()) |
1864 | chainProducers.push_back(produceOp); |
1865 | else |
1866 | bb.producers.insertStrong(produceOp); |
1867 | if (phase == 0 && srcN == -1) break; |
1868 | } |
1869 | } |
1870 | |
1871 | // Add producers for previous instructions in {Atomic} chain. |
1872 | if (!insn.atomic()) { |
1873 | for (auto &op : chainProducers) { |
1874 | if (!op.pipe.inOrder() || op.hasToken()) |
1875 | op.token = tokenInfo.token; |
1876 | bb.producers.insertStrong(op); |
1877 | } |
1878 | chainProducers.clear(); |
1879 | } |
1880 | } |
1881 | |
1882 | // Check for end of {Atomic} chain. |
1883 | if (!insn.atomic()) |
1884 | inumChain = -1; |
1885 | |
1886 | // Increment counters. |
1887 | auto pipeMask = getPipeMask(hw, insn); |
1888 | for (int pidx = 0; pidx < NPipes; pidx++) |
1889 | if (pipeMask & (1 << pidx)) |
1890 | counters[pidx]++; |
1891 | |
1892 | forceA1 = forceA1Next; |
1893 | } |
1894 | |
1895 | // Create sync insertion for any outstanding wrdep pseudo-instructions. |
1896 | if (wrdepTokenMaskDst && phase == 2) |
1897 | bb.syncs.push_back({uint32_t(bb.iend), SWSBInfo(), SyncFunction::allwr, wrdepTokenMaskDst}); |
1898 | |
1899 | // Add preconsume dependencies to consume list. |
1900 | if (!final) { |
1901 | // In-order preconsumes. |
1902 | if (phase == 1) for (int pOld = 0; pOld < NPipes; pOld++) { |
1903 | for (int pNew = 0; pNew <= NPipes; pNew++) { |
1904 | auto pc = preconsumeIO[pOld][pNew]; |
1905 | if (pc != noPreconsume) { |
1906 | Dependency<true> preconsume; |
1907 | preconsume.swsb = true; |
1908 | preconsume.counters[pOld] = pc + 1; |
1909 | preconsume.dist = 1; |
1910 | preconsume.pipe = (1 << pNew); |
1911 | preconsume.depPipe = (1 << pOld); |
1912 | bb.consumers.insertStrong(preconsume); |
1913 | } |
1914 | } |
1915 | } |
1916 | // Out of order preconsumes. |
1917 | auto preconsumeToken = preconsumeTokenSrc | preconsumeTokenDst; |
1918 | for (int token = 0; token < tokenCount(hw); token++) { |
1919 | if (preconsumeToken & (1 << token)) { |
1920 | Dependency<true> preconsume; |
1921 | preconsume.swsb = true; |
1922 | preconsume.token = token; |
1923 | preconsume.tokenSrc = (preconsumeTokenSrc & (1 << token)) != 0; |
1924 | preconsume.tokenDst = (preconsumeTokenDst & (1 << token)) != 0; |
1925 | bb.consumers.insertStrong(preconsume); |
1926 | } |
1927 | } |
1928 | if (~preconsumeTokenSrc == 0 || ~preconsumeTokenDst == 0) { |
1929 | Dependency<true> preconsume; |
1930 | preconsume.swsb = true; |
1931 | preconsume.token = preconsume.tokenTBD; |
1932 | preconsume.tokenSrc = (~preconsumeTokenSrc == 0); |
1933 | preconsume.tokenDst = (~preconsumeTokenDst == 0); |
1934 | bb.consumers.insertStrong(preconsume); |
1935 | } |
1936 | } |
1937 | } |
1938 | |
1939 | // Loop optimization. Add synchronizations before entering suspected loops to allow |
1940 | // weaker SWSB inside the loop. |
1941 | inline void loopOptimize(BasicBlock &bb) |
1942 | { |
1943 | // Loop through successors to this BB, looking for ones with |
1944 | // exactly one incoming backedge, not from this BB. |
1945 | // If any found, for every dep in produce table: |
1946 | // For each selector successor: |
1947 | // If backedge pred's produce table doesn't imply this dep, |
1948 | // add syncs to consume it. |
1949 | } |
1950 | |
1951 | // Propagate live dependencies forward through BB flow graph. |
1952 | inline void propagate(std::vector<BasicBlock> &BBs) |
1953 | { |
1954 | auto bbCount = int(BBs.size()); |
1955 | bool done = false; |
1956 | std::vector<Dependency<true>> consumerList; |
1957 | |
1958 | // Mark all incoming dependencies as new. |
1959 | for (auto &bb : BBs) { |
1960 | bb.label = 0; |
1961 | bb.producers.forEach([](Dependency<false> &dep) { |
1962 | dep.label = 0; |
1963 | }); |
1964 | } |
1965 | |
1966 | // Main loop: propagate live dependencies until all live tables stabilize. |
1967 | // This should require no more than bbCount loops. |
1968 | for (int age = 0; (age < bbCount) && !done; age++) { |
1969 | done = true; |
1970 | for (auto &bb : BBs) { |
1971 | // Examine each predecessor of this BB. |
1972 | for (auto pred : bb.pred) { |
1973 | if (pred->label < age) continue; |
1974 | |
1975 | pred->producers.forEach([&](const Dependency<false> &dep) { |
1976 | // New incoming dependency? If not, skip it. |
1977 | if (dep.label != age) return; |
1978 | |
1979 | #ifdef NGEN_DEBUG_PROPAGATE |
1980 | std::cerr << "Prop BB " << pred->id << " -> " << bb.id << ": " ; |
1981 | dep.dump(); |
1982 | #endif |
1983 | |
1984 | // Adjust counters. Exception: OOO tokenless dependencies: counter[0] stores instruction #. |
1985 | auto newDep = dep; |
1986 | if (newDep.tokenTime == 0) |
1987 | for (int p = 0; p < NPipes; p++) |
1988 | newDep.counters[p] -= pred->lengths[p]; |
1989 | |
1990 | // If an in-order dependency, check for timeout, and skip it if so. |
1991 | if (newDep.pipe.inOrder()) { |
1992 | auto pidx = utils::log2(newDep.pipe.inOrderPipe()); |
1993 | if (newDep.counters[pidx] <= -timeout(dep.pipe)) { |
1994 | #ifdef NGEN_DEBUG_PROPAGATE |
1995 | std::cerr << " timeout\n" ; |
1996 | #endif |
1997 | return; |
1998 | } |
1999 | } |
2000 | |
2001 | // Intersect new dependency (producer) with killed (consumer) table. |
2002 | // Subtract all intersections from dependency. |
2003 | consumerList.clear(); |
2004 | bb.consumers.findIntersections(newDep, consumerList); |
2005 | |
2006 | for (auto &consumer : consumerList) { |
2007 | newDep.region.subtract(consumer.region); |
2008 | if (newDep.region.empty()) { |
2009 | #ifdef NGEN_DEBUG_PROPAGATE |
2010 | std::cerr << " killed\n" ; |
2011 | #endif |
2012 | return; |
2013 | } |
2014 | } |
2015 | |
2016 | #ifdef NGEN_DEBUG_PROPAGATE |
2017 | std::cerr << " propagated\n" ; |
2018 | #endif |
2019 | |
2020 | // Dependency is new and was not consumed. |
2021 | // Add to produce table unless it's already implied by existing producers. |
2022 | newDep.label = age + 1; |
2023 | if (bb.producers.insert(newDep)) { |
2024 | done = false; |
2025 | bb.label = age + 1; |
2026 | } |
2027 | }); |
2028 | } |
2029 | } |
2030 | } |
2031 | |
2032 | #ifdef NGEN_SAFE |
2033 | if (!done) throw std::runtime_error("nGEN internal error: propagation failed." ); |
2034 | #endif |
2035 | |
2036 | // Perform final half-propagation step (tail-to-head) to accumulate incoming producers |
2037 | // for each BB. |
2038 | for (auto &bb : BBs) { |
2039 | for (auto pred : bb.pred) { |
2040 | pred->producers.forEach([&](const Dependency<false> &dep) { |
2041 | // Adjust counters, except for OOO tokenless dependencies. |
2042 | auto newDep = dep; |
2043 | if (newDep.tokenTime == 0) |
2044 | for (int p = 0; p < NPipes; p++) |
2045 | newDep.counters[p] -= pred->lengths[p]; |
2046 | |
2047 | // If an in-order dependency, check for timeout, and skip it if so. |
2048 | if (newDep.pipe.inOrder()) { |
2049 | auto pidx = utils::log2(newDep.pipe.inOrderPipe()); |
2050 | if (newDep.counters[pidx] <= -timeout(dep.pipe)) |
2051 | return; |
2052 | } |
2053 | |
2054 | bb.incoming.insert(newDep); |
2055 | }); |
2056 | } |
2057 | } |
2058 | } |
2059 | |
2060 | // Adjust jump targets for sync instruction insertions. |
2061 | template <typename Program> |
2062 | inline void adjustTargets(Program &program, BasicBlockList &list) |
2063 | { |
2064 | std::map<int32_t, int32_t> shifts; |
2065 | |
2066 | int32_t shift = 0; |
2067 | for (auto &bb : list) { |
2068 | shifts.insert({bb.istart, shift}); |
2069 | shift += int32_t(bb.syncs.size()) - bb.wrdeps; |
2070 | } |
2071 | |
2072 | shift = 0; |
2073 | for (auto &bb : list) { |
2074 | shift += int32_t(bb.syncs.size()) - bb.wrdeps; |
2075 | auto ntail = bb.iend - 1; |
2076 | auto &insn = program[ntail]; |
2077 | int jip = -1, uip = -1; |
2078 | auto dests = insn.destinations(jip, uip); |
2079 | if (dests & DestJIP) insn.shiftJIP(shifts[ntail + jip] - shift); |
2080 | if (dests & DestUIP) insn.shiftUIP(shifts[ntail + uip] - shift); |
2081 | } |
2082 | } |
2083 | |
2084 | // Entrypoint for automatic software scoreboarding. |
2085 | // Returns the list of basic blocks, containing information on sync instructions to insert. |
2086 | template <typename Program> |
2087 | inline BasicBlockList autoSWSB(HW hw, Program &program) |
2088 | { |
2089 | if (!hasAutoSWSB(hw, program)) |
2090 | return BasicBlockList(); |
2091 | |
2092 | // Find basic blocks. |
2093 | BasicBlockList bbList = getBasicBlocks(hw, program); |
2094 | |
2095 | #ifdef NGEN_DEBUG |
2096 | std::cerr << "BASIC BLOCKS\n" ; |
2097 | std::cerr << "------------\n" ; |
2098 | for (auto &bb : bbList) { |
2099 | std::cerr << "Basic Block " << bb.id; |
2100 | if (!bb.pred.empty()) { |
2101 | std::cerr << " <-" ; |
2102 | for (auto &pred : bb.pred) |
2103 | std::cerr << ' ' << pred->id; |
2104 | } |
2105 | if (!bb.succ.empty()) { |
2106 | std::cerr << " ->" ; |
2107 | for (auto &succ : bb.succ) |
2108 | std::cerr << ' ' << succ->id; |
2109 | } |
2110 | std::cerr << std::endl; |
2111 | } |
2112 | std::cerr << std::endl; |
2113 | #endif |
2114 | |
2115 | // Analysis round 0: gather OOO instruction usage. |
2116 | for (auto &bb : bbList) |
2117 | analyze(hw, program, bb, 0); |
2118 | |
2119 | #ifdef NGEN_DEBUG |
2120 | std::cerr << "ANALYZE PHASE 0\n" ; |
2121 | std::cerr << "---------------\n" ; |
2122 | for (auto &bb : bbList) { |
2123 | std::cerr << "Basic Block " << bb.id << std::endl; |
2124 | bb.consumers.dump(); |
2125 | bb.producers.dump(); |
2126 | std::cerr << std::endl; |
2127 | } |
2128 | #endif |
2129 | |
2130 | // Propagate OOO dependency producers through BB graph. |
2131 | propagate(bbList); |
2132 | for (auto &bb : bbList) { |
2133 | bb.producers.clear(); |
2134 | bb.consumers.clear(); |
2135 | } |
2136 | |
2137 | #ifdef NGEN_DEBUG |
2138 | std::cerr << "PROPAGATE\n" ; |
2139 | std::cerr << "---------\n" ; |
2140 | for (auto &bb : bbList) { |
2141 | std::cerr << "Basic Block " << bb.id << std::endl; |
2142 | bb.incoming.dump(); |
2143 | std::cerr << std::endl; |
2144 | } |
2145 | #endif |
2146 | |
2147 | // Analysis round 1: assign SBIDs and perform intra-BB analysis. |
2148 | for (auto &bb : bbList) { |
2149 | analyze(hw, program, bb, 1); |
2150 | bb.incoming.clear(); |
2151 | } |
2152 | |
2153 | #ifdef NGEN_DEBUG |
2154 | std::cerr << "ANALYZE PHASE 1\n" ; |
2155 | std::cerr << "---------------\n" ; |
2156 | for (auto &bb : bbList) { |
2157 | std::cerr << "Basic Block " << bb.id << std::endl; |
2158 | bb.consumers.dump(); |
2159 | bb.producers.dump(); |
2160 | std::cerr << std::endl; |
2161 | } |
2162 | #endif |
2163 | |
2164 | // Loop optimization. |
2165 | for (auto &bb : bbList) |
2166 | loopOptimize(bb); |
2167 | |
2168 | // Propagate live dependency producers through BB graph. |
2169 | propagate(bbList); |
2170 | |
2171 | for (auto &bb : bbList) { |
2172 | std::swap(bb.incoming, bb.producers); |
2173 | bb.incoming.clear(); |
2174 | } |
2175 | |
2176 | #ifdef NGEN_DEBUG |
2177 | std::cerr << "PROPAGATE\n" ; |
2178 | std::cerr << "---------\n" ; |
2179 | for (auto &bb : bbList) { |
2180 | std::cerr << "Basic Block " << bb.id << std::endl; |
2181 | bb.producers.dump(); |
2182 | std::cerr << std::endl; |
2183 | } |
2184 | #endif |
2185 | |
2186 | // Analysis round 2: final SWSB assignment. |
2187 | for (auto &bb : bbList) |
2188 | analyze(hw, program, bb, 2); |
2189 | |
2190 | #ifdef NGEN_DEBUG |
2191 | std::cerr << "ANALYZE PHASE 2\n" ; |
2192 | std::cerr << "---------------\n" ; |
2193 | for (auto &bb : bbList) { |
2194 | std::cerr << "Basic Block " << bb.id << std::endl; |
2195 | bb.consumers.dump(); |
2196 | bb.producers.dump(); |
2197 | std::cerr << std::endl; |
2198 | } |
2199 | #endif |
2200 | |
2201 | // Adjust jump targets after sync insertions. |
2202 | adjustTargets(program, bbList); |
2203 | |
2204 | return bbList; |
2205 | } |
2206 | |
2207 | } /* namespace autoswsb */ |
2208 | } /* namespace ngen */ |
2209 | |
2210 | // Instruction interface: |
2211 | // SWSBInfo swsb() const; |
2212 | // void setSWSB(SWSBInfo swsb) const; |
2213 | // Opcode opcode() const; |
2214 | // SyncFunction syncFC() const; |
2215 | // SharedFunction sfid() const; |
2216 | // DestinationMask destinations(int &jip, int &uip) const; |
2217 | // bool getOperandRegion(DependencyRegion ®ion, int opNum) const; // returns false if no such operand. |
2218 | // bool getImm32(uint32_t &imm) const; |
2219 | // |
2220 | // Program interface: |
2221 | // Instruction operator[](int inum); |
2222 | // size_t size() const; |
2223 | |
2224 | #endif /* NGEN_AUTOSWSB_HPP */ |
2225 | |