1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/*
18 * Do not #include this file directly; ngen uses it internally.
19 */
20
21#ifndef NGEN_AUTO_SWSB_HPP
22#define NGEN_AUTO_SWSB_HPP
23
24#if defined(NGEN_DEBUG) || defined(NGEN_DEBUG_PROPAGATE) || defined(NGEN_DEBUG_BB)
25#include <iomanip>
26#include <iostream>
27#endif
28
29#include <limits>
30#include <list>
31#include <map>
32
33namespace ngen {
34namespace autoswsb {
35
36/*******************/
37/* Data structures */
38/*******************/
39
40typedef uint8_t PipeMask;
41enum {
42 PipeMaskNone = 0,
43 PipeMaskA = 1, // All in-order pipes
44 PipeMaskF = 2,
45 PipeMaskI = 4,
46 PipeMaskL = 8,
47 PipeMaskM = 0x10,
48 PipeMaskO = 0x20, // All out-of-order pipes. Not a valid GeneralizedPipe.
49 PipeBitA = 0,
50 PipeBitF = 1,
51 PipeBitI = 2,
52 PipeBitL = 3,
53 PipeBitM = 4,
54 PipeBitO = 5,
55};
56static constexpr int NPipes = 5;
57
58static inline PipeMask toMask(Pipe pipe) { return (1 << (static_cast<unsigned>(pipe) - 1)); }
59static inline Pipe fromMask(PipeMask mask) { return mask ? static_cast<Pipe>(1 + utils::log2(mask)) : Pipe::Default; }
60
61typedef uint8_t DestinationMask;
62enum {
63 DestNone = 0,
64 DestNextIP = 1,
65 DestJIP = 2,
66 DestUIP = 4,
67 DestUnknown = 8
68};
69
70class GeneralizedPipe {
71 uint8_t v;
72
73 static constexpr uint8_t vInOrder = 0x00;
74 static constexpr uint8_t vSend = 0x40; // OR'ed with SFID
75 static constexpr uint8_t vSystolic = 0x80;
76 static constexpr uint8_t vMath = 0xC0;
77 static constexpr uint8_t vTypeMask = 0xC0;
78
79 GeneralizedPipe(uint8_t v_, int dummy) : v{v_} {}
80
81public:
82 GeneralizedPipe() : v{uint8_t(0)} {}
83 GeneralizedPipe(PipeMask pipe) : v{uint8_t(vInOrder | pipe)} {}
84 GeneralizedPipe(SharedFunction sfid) : v{uint8_t(vSend | static_cast<uint8_t>(sfid))} {}
85
86 static GeneralizedPipe Systolic() { return GeneralizedPipe(vSystolic, 0); }
87 static GeneralizedPipe Math() { return GeneralizedPipe(vMath, 0); }
88
89 bool operator==(GeneralizedPipe other) const { return v == other.v; }
90 bool operator!=(GeneralizedPipe other) const { return v != other.v; }
91
92 bool inOrder() const { return ((v & vTypeMask) == vInOrder) && (v != PipeMaskNone); }
93 PipeMask inOrderPipe() const { return inOrder() ? (v & ~vTypeMask) : PipeMaskNone; }
94 Pipe toPipe() const { return fromMask(inOrderPipe()); }
95 inline PipeMask syncPipes(HW hw) const;
96
97#ifdef NGEN_DEBUG
98 inline void dump() const;
99#endif
100};
101
102struct DependencyRegion {
103 uint8_t base, size;
104 uint8_t unspecified : 1;
105 uint8_t checkWAW : 1;
106 uint8_t arf : 1;
107 HW hw;
108 std::array<uint32_t, 32> masks;
109
110 DependencyRegion() : DependencyRegion(HW::Unknown) {}
111 explicit DependencyRegion(HW hw_) : base(0), size(0), unspecified{true}, checkWAW{false}, arf{false}, hw{hw_} {
112 for (auto &m: masks) m = 0;
113 }
114 inline DependencyRegion(HW hw, GRFRange r);
115 inline DependencyRegion(HW hw, int esize, RegData rr);
116
117 inline void intersect(const DependencyRegion &other);
118 inline void subtract(const DependencyRegion &other);
119
120 bool empty() const {
121 if (unspecified) return false;
122 if (size == 0) return true;
123 for (auto m : masks)
124 if (m != 0)
125 return false;
126 return true;
127 }
128 void clear() { *this = DependencyRegion(hw); unspecified = false; checkWAW = false; arf = false; }
129
130#ifdef NGEN_DEBUG
131 inline void dump() const;
132#endif
133};
134
135template <bool consumer>
136struct Dependency {
137 int32_t label; // Multipurpose label for use in algorithms
138
139 // Source instruction information.
140 GeneralizedPipe pipe; // Execution pipe for instruction
141 uint16_t tokenTime; // Estimated upper bound for token lifetime, in cycles.
142 std::array<int32_t, NPipes> counters; // Pipe counters, relative to start of BB.
143
144 // (Mostly) dependency information.
145 uint8_t token; // Out of order token
146 uint8_t tokenSrc : 1; // Src dependency on token?
147 uint8_t tokenDst : 1; // Dst dependency on token?
148 uint8_t rw : 1; // Flag: read or write?
149 uint8_t swsb : 1; // True for SWSB dependency consumers
150 uint8_t active : 1; // True if dependency is still alive.
151 PipeMask depPipe; // (swsb consumer only) Pipe to wait on
152 uint8_t dist; // (swsb consumer only) Pipe distance
153 DependencyRegion region; // GRF region covered
154
155 Dependency() : label{0}, pipe{}, tokenTime{0},
156 token{0}, tokenSrc{false}, tokenDst{false},
157 rw{false}, swsb{false}, active{true},
158 depPipe{PipeMaskNone}, dist{0}, region{} { counters.fill(0); }
159
160 bool operator==(const Dependency &other) {
161 return !std::memcmp(this, &other, sizeof(Dependency));
162 }
163 bool operator!=(const Dependency *other) { return !(operator==(other)); }
164
165 int32_t &inum() { return counters[1]; } // For OOO dependencies in phase 0
166 const int32_t &inum() const { return counters[1]; }
167
168 constexpr bool read() const { return !rw; }
169 constexpr bool write() const { return rw; }
170 constexpr bool hasToken() const { return tokenSrc || tokenDst; }
171 constexpr bool hasDist() const { return (dist > 0); }
172
173 Dependency<!consumer>& cast() { return reinterpret_cast<Dependency<!consumer>&>(*this); }
174
175 static constexpr uint8_t tokenTBD = 0xFF;
176
177#ifdef NGEN_DEBUG
178 inline void dump() const;
179#endif
180};
181
182template <bool consumer>
183class DependencyTable {
184 enum {
185 ListTypeGRF = 0, // Lists of DependencyFragments filtered by GRF base register.
186 ListTypeToken = 1, // Lists of DependencyFragments filtered by token.
187 ListTypePipe = 2, // Lists of DependencyFragments filtered by (in-order) pipe.
188 // fragsByToken/fragsByPipe contain only one DependencyFragment per Dependency.
189 NListTypes = 3
190 };
191
192 enum : uint32_t {
193 none = ~uint32_t(0) // Special value indicating end of list.
194 };
195
196 enum : int {
197 grfListIdxUnspecified = 256 // GRF list index for all unspecified regions.
198 };
199
200 struct DependencyFragment {
201 uint32_t depID; // Index of main Dependency struct in array.
202 uint8_t before, after; // # of consecutive fragments associated with the same Dependency
203 // before and after this one.
204 uint32_t prev[NListTypes]; // Previous pointers for doubly-linked lists.
205 uint32_t next[NListTypes]; // Next pointers for doubly-linked lists.
206 };
207
208 std::vector<Dependency<consumer>> deps; // List of all Dependencies (active or not)
209 std::vector<DependencyFragment> frags; // List of all DependencyFragments (active or not)
210 std::array<uint32_t, 257> heads[NListTypes]; // Heads of doubly-linked lists.
211
212 static bool isHeadLink(uint32_t id) { return ((id & 0x80000000) != 0) && (id != none); }
213 static uint32_t readHeadLink(uint32_t id) { return id & 0x7FFFFFFF; }
214 static uint32_t makeHeadLink(uint32_t idx) { return idx | 0x80000000; }
215
216 template <bool iconsumer> inline void findAndRemoveIntersections(int listType, int listIdx, const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove = true);
217 inline bool insertPrepare(int listType, int listIdx, Dependency<consumer> &dep, bool checkWeaker, bool checkStronger);
218 inline void insertLinkedList(int listType, int listIdx, int32_t fragID);
219
220 template <bool iconsumer> static inline int getPipeIndex(const Dependency<iconsumer> &dep);
221
222public:
223 DependencyTable() { clear(); }
224
225 inline void clear();
226 inline void reserve(int icount);
227 inline bool insert(Dependency<consumer> &dep, bool checkWeaker = true, bool checkStronger = true);
228 inline bool insertWeak(Dependency<consumer> &dep) { return insert(dep, true, false); }
229 inline void insertStrong(const Dependency<consumer> &dep) { (void) insert(const_cast<Dependency<consumer> &>(dep), false, true); }
230 inline void remove(int fragID);
231 template <bool iconsumer> inline void findIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> &out);
232 template <bool iconsumer> inline void findAndRemoveIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove = true);
233 template <bool iconsumer> inline void removeIntersections(const Dependency<iconsumer> &dep);
234 inline uint32_t removeByTokenMask(uint32_t mask, bool dst);
235 inline uint32_t removeOOOWritesByRegion(const DependencyRegion &region);
236
237 template <typename Func> inline void forEach(Func f) { for (auto &entry : deps) if (entry.active) f(entry); }
238 template <typename Func> inline void forEach(Func f) const { for (auto &entry : deps) if (entry.active) f(entry); }
239
240#ifdef NGEN_DEBUG
241 inline void dump() const;
242#endif
243};
244
245struct SyncInsertion {
246 uint32_t inum;
247 SWSBInfo swsb;
248 SyncFunction fc;
249 uint32_t mask; // (allrd/allwr) 0 indicates no mask to be applied.
250};
251
252struct BasicBlock;
253
254struct BasicBlock {
255 uint32_t id; // index
256 int32_t label; // multipurpose flag for use in algorithms
257 uint32_t istart, iend; // instruction range: [istart, iend)
258 uint32_t wrdeps; // # of wrdep pseudo-instructions in this BB
259 std::array<uint32_t, NPipes> lengths; // # of instructions in each pipe in this BB
260 std::vector<BasicBlock *> pred, succ; // list of predecessor/successor BBs
261 DependencyTable<false> producers; // table of dependencies produced and consumed by this BB.
262 DependencyTable<true> consumers; // production table re-used for live incoming dependencies.
263 DependencyTable<false> incoming; // table of dependencies produced by prior BBs (temporary).
264 std::vector<SyncInsertion> syncs; // list of sync instructions to generate.
265 std::vector<std::array<DependencyRegion, 4>> opRegions; // cache of instruction operand regions.
266
267 const DependencyRegion &getOperandRegion(int inum, int opNum) const {
268 return opRegions[inum - istart][opNum + 1];
269 }
270};
271
272using BasicBlockList = std::vector<BasicBlock>;
273
274/*****************/
275/* Pipe Handling */
276/*****************/
277
278// Get all pipes to track in-order dependencies on.
279inline PipeMask allPipes(HW hw)
280{
281 PipeMask mask = PipeMaskA | PipeMaskO;
282 if (hw >= HW::XeHP) mask |= PipeMaskF | PipeMaskI | PipeMaskL;
283 if (hw >= HW::XeHPC) mask |= PipeMaskM;
284
285 return mask;
286}
287
288// Get the execution pipe for an instruction.
289template <typename Instruction>
290inline GeneralizedPipe getPipe(HW hw, const Instruction &insn, bool checkOOO = true)
291{
292 // Check jumps and no-ops
293 auto op = insn.opcode();
294 if (isBranch(op) || op == Opcode::nop_gen12 || op == Opcode::sync || op == Opcode::illegal)
295 return GeneralizedPipe();
296
297 // Check OOO instructions.
298 if (isVariableLatency(hw, op)) {
299 if (!checkOOO)
300 return GeneralizedPipe();
301 switch (op) {
302 case Opcode::dpas:
303 case Opcode::dpasw:
304 return GeneralizedPipe::Systolic();
305 case Opcode::math:
306 return GeneralizedPipe::Math();
307 case Opcode::send:
308 case Opcode::sendc:
309 return GeneralizedPipe(insn.sfid());
310 default:
311 break;
312 }
313 }
314
315 if (hw >= HW::XeHPC && (op == Opcode::math))
316 return PipeMaskM;
317
318 PipeMask mask = PipeMaskNone;
319
320 // For SWSB purposes, Gen12LP has a single in-order pipe.
321 // Otherwise, in-order pipe determined by destination type.
322 // Exception: if there are any long operands, it's a long pipe instruction.
323 if (hw >= HW::XeHP) {
324 auto dt = insn.dstTypecode();
325 unsigned lmask = (hw == HW::XeHPC) ? 0b1011 : 0b0011; // Note: assumes PVC-XT
326 if ((dt & lmask) == lmask)
327 mask |= PipeMaskL;
328 else if (dt & 8)
329 mask |= PipeMaskF;
330 else
331 mask |= PipeMaskI;
332
333 if (!(mask & PipeMaskL)) {
334 if ((insn.src0Typecode() & lmask) == lmask)
335 mask = PipeMaskL;
336 else if ((insn.src1Typecode() & lmask) == lmask)
337 mask = PipeMaskL;
338 }
339 } else
340 mask = PipeMaskA;
341 return mask;
342}
343
344template <typename Instruction>
345inline PipeMask getPipeMask(HW hw, const Instruction &insn)
346{
347 PipeMask pipe = getPipe(hw, insn, false).inOrderPipe();
348 if (pipe != PipeMaskNone)
349 pipe |= PipeMaskA;
350 return pipe;
351}
352
353PipeMask GeneralizedPipe::syncPipes(HW hw) const
354{
355 if ((hw >= HW::XeHP) && (v & PipeMaskA))
356 return allPipes(hw) & ~PipeMaskA & ~PipeMaskO;
357 return (v == PipeMaskNone) ? allPipes(hw) : inOrderPipe();
358}
359
360/**********************/
361/* Dependency Regions */
362/**********************/
363DependencyRegion::DependencyRegion(HW hw_, GRFRange r)
364{
365 auto nmasks = int(masks.size());
366#ifdef NGEN_SAFE
367 if (r.isInvalid() || (r.getLen() > nmasks))
368 throw invalid_region_exception();
369#endif
370
371 hw = hw_;
372 unspecified = false;
373 checkWAW = false;
374 arf = false;
375 base = r.getBase();
376 size = r.getLen();
377 auto fullMask = ~uint32_t(0);
378 for (int i = 0; i < nmasks; i++)
379 masks[i] = (i < r.getLen()) ? fullMask : 0u;
380}
381
382DependencyRegion::DependencyRegion(HW hw_, int esize, RegData rr)
383{
384 const auto mbits = GRF::bytes(hw_);
385 const auto log2MBits = GRF::log2Bytes(hw_);
386
387 hw = hw_;
388 base = rr.getBase();
389 unspecified = false;
390 checkWAW = false;
391 arf = rr.isARF();
392
393 int hs = rr.getHS(), vs = rr.getVS();
394 int nh = rr.getWidth();
395#ifdef NGEN_SAFE
396 if (nh == 0) nh = 1;
397#endif
398 int nv = esize / nh;
399 int bytes = rr.getBytes();
400 int off = rr.getByteOffset();
401
402 auto makeMask = [](int sz) -> uint64_t {
403 return (uint64_t(1) << sz) - 1;
404 };
405
406 auto compress = [&](uint64_t m) -> uint32_t {
407 if (hw_ >= HW::XeHPC) {
408 // Regions tracked at word granularity. OR and pack adjacent bits.
409 // If any sub-word writes, need to track WAW dependencies.
410 if ((m ^ (m >> 1)) & 0x5555555555555555)
411 checkWAW = true;
412 m = (m | (m >> 1)) & 0x5555555555555555;
413 m = (m | (m >> 1)) & 0x3333333333333333;
414 m = (m | (m >> 2)) & 0x0F0F0F0F0F0F0F0F;
415 m = (m | (m >> 4)) & 0x00FF00FF00FF00FF;
416 m = (m | (m >> 8)) & 0x0000FFFF0000FFFF;
417 m |= (m >> 16);
418 }
419 return uint32_t(m);
420 };
421
422 if (hs == 0) nh = hs = 1;
423 if (vs == 0) nv = 1;
424 hs *= bytes;
425 vs *= bytes;
426
427 for (auto &m : masks)
428 m = 0;
429
430 uint64_t hmask = makeMask(bytes) * (makeMask(nh * hs) / makeMask(hs));
431 for (int j = 0; j < nv; j++) {
432 masks[off >> log2MBits] |= compress(hmask << (off & (mbits - 1)));
433 off += vs;
434 }
435
436 size = ((off - vs) >> log2MBits) + 1;
437}
438
439void DependencyRegion::intersect(const DependencyRegion &other)
440{
441 if (arf != other.arf) {
442 clear();
443 return;
444 }
445
446 if (unspecified || other.unspecified)
447 return;
448
449 int i, iOther;
450 for (i = 0, iOther = base - other.base; i < size; i++, iOther++) {
451 if (iOther >= 0 && iOther < other.size)
452 masks[i] &= other.masks[iOther];
453 else
454 masks[i] = 0;
455 }
456}
457
458// Check whether two regions overlap.
459inline bool intersects(const DependencyRegion &dep1, const DependencyRegion &dep2)
460{
461 // Check register file.
462 if (dep1.arf != dep2.arf)
463 return false;
464
465 // Unspecified regions might always overlap.
466 if (dep1.unspecified || dep2.unspecified)
467 return true;
468
469 // Quick check based on register bounds.
470 int diff = dep1.base - dep2.base;
471 if ((diff >= dep2.size) || (diff <= -dep1.size))
472 return false;
473
474 // Precise check.
475 int i1, i2;
476 for (i1 = 0, i2 = diff; i1 < dep1.size; i1++, i2++)
477 if (i2 >= 0 && i2 < dep2.size)
478 if (dep1.masks[i1] & dep2.masks[i2])
479 return true;
480
481 return false;
482}
483
484void DependencyRegion::subtract(const DependencyRegion &other)
485{
486 if (other.arf != arf)
487 return;
488 if (unspecified)
489 return;
490 if (other.unspecified)
491 clear();
492 else {
493 int i, iOther;
494 for (i = 0, iOther = base - other.base; i < size; i++, iOther++)
495 if (iOther >= 0 && iOther < other.size)
496 masks[i] &= ~other.masks[iOther];
497 }
498}
499
500inline bool contains(const DependencyRegion &dep1, const DependencyRegion &dep2)
501{
502 using mtype = decltype(DependencyRegion::masks)::value_type;
503
504 if (dep1.arf != dep2.arf) return false;
505 if (dep1.unspecified) return true;
506 if (dep2.unspecified) return false;
507
508 int i1, i2;
509 for (i1 = dep2.base - dep1.base, i2 = 0; i2 < dep2.size; i1++, i2++) {
510 mtype mask = (i1 >= 0 && i1 < dep1.size) ? dep1.masks[i1] : 0;
511 if (~mask && dep2.masks[i2])
512 return false;
513 }
514 return true;
515}
516
517// Check if an ARF type needs SWSB tracking.
518inline bool trackableARF(ARFType type)
519{
520 return (type == ARFType::acc || type == ARFType::a);
521}
522
523// Distance in an in-order pipe after which a dependency can be ignored.
524inline int timeout(GeneralizedPipe pipe)
525{
526 switch (pipe.inOrderPipe()) {
527 case PipeMaskA: return 11; // Gen12LP
528 case PipeMaskI: return 11;
529 case PipeMaskF: return 11;
530 case PipeMaskL: return 15;
531 case PipeMaskM: return 19;
532 default: return std::numeric_limits<int>::max();
533 }
534}
535
536// Approximate upper bound on cycle count for an OOO instruction.
537template <typename Instruction>
538inline int estimateLatency(HW hw, const Instruction &insn)
539{
540 switch (insn.opcode()) {
541 case Opcode::math: return (hw == HW::Gen12LP) ? 20 : 17;
542 case Opcode::dpas:
543 case Opcode::dpasw: return 20; // need correct value
544 case Opcode::send:
545 case Opcode::sendc: {
546 switch (insn.sfid()) {
547 case SharedFunction::dc0:
548 case SharedFunction::dc1: {
549 MessageDescriptor desc;
550 if (insn.getSendDesc(desc))
551 if (desc.surface.index == 0xFE)
552 return (hw == HW::Gen12LP) ? 33 : 25;
553 return (hw == HW::Gen12LP) ? 106 : 150;
554 }
555 case SharedFunction::sampler: return (hw == HW::Gen12LP) ? 175 : 210;
556 default: return 50;
557 }
558 }
559 default: return 0;
560 }
561}
562
563// Measure instruction distance between two Dependencies in a given pipe.
564template <bool consumer1, bool consumer2>
565inline int distance(const Dependency<consumer1> &dep1, const Dependency<consumer2> &dep2, GeneralizedPipe pipe)
566{
567 auto ioPipe = pipe.inOrderPipe();
568
569 if (ioPipe == PipeMaskNone)
570 return 0;
571
572 auto pidx = utils::log2(ioPipe);
573 return dep2.counters[pidx] - dep1.counters[pidx];
574}
575
576// Check whether two dependencies form a producer-consumer pair.
577inline bool intersects(const Dependency<false> &dep1, const Dependency<true> &dep2)
578{
579 if (!dep2.swsb) {
580 // Region-based dependency. First, quick check based on dependency type:
581 // RAR: ignore
582 // WAR/WAW: ignore if both instructions in same GeneralizedPipe (same in-order pipe, or sends with same SFID, etc.)
583 // If not ignorable, check:
584 // * If consumer is in-order, is that pipe still live (unsynchronized) in the producer?
585 // * If producer is in-order, is it close enough to require tracking the dependency?
586 // * Do the producer+consumer regions overlap?
587 if (dep1.read() && dep2.read()) return false;
588 if (!(dep1.write() && dep2.write() && (dep1.region.checkWAW || dep2.region.checkWAW)))
589 if (dep2.write() && (dep1.pipe == dep2.pipe) && (dep1.pipe != GeneralizedPipe::Math())) return false;
590 if (dep1.pipe.inOrder() && (distance(dep1, dep2, dep1.pipe) >= timeout(dep1.pipe))) return false;
591 if (dep2.region.arf && (dep2.read() || dep2.region.hw == HW::Gen12LP)) return false;
592 return intersects(dep1.region, dep2.region);
593 } else {
594 // SWSB dependency.
595 if (dep1.hasToken() && dep2.hasToken() && (dep1.token == dep2.token) && (dep1.tokenSrc || dep2.tokenDst))
596 return true;
597 if (dep1.pipe.inOrder()) {
598 auto commonPipe = (dep1.pipe.inOrderPipe() | PipeMaskA) & dep2.depPipe;
599 if (commonPipe)
600 return (distance(dep1, dep2, dep1.pipe) >= dep2.dist);
601 }
602 return false;
603 }
604}
605
606// Check whether two dependencies form a producer-consumer pair.
607inline bool intersects(const Dependency<true> &dep1, const Dependency<false> &dep2)
608{
609 return intersects(dep2, dep1);
610}
611
612// Check whether one producer dependency implies another, without checking regions.
613inline bool impliesWithoutRegion(const Dependency<false> &dep1, const Dependency<false> &dep2)
614{
615 // Reads never imply writes.
616 if (dep2.write() && dep1.read())
617 return false;
618 // Check pipes.
619 if (dep2.pipe != dep1.pipe)
620 return false;
621 if (dep2.hasToken()) {
622 // Token dependency: tokens must match. If tokens not assigned, instructions must match.
623 if (!dep1.hasToken())
624 return false;
625 if (!dep1.tokenDst && dep2.tokenDst)
626 return false;
627 if (dep1.token != dep2.token)
628 return false;
629 if ((dep1.token == dep1.tokenTBD) && (dep1.inum() != dep2.inum()))
630 return false;
631 }
632 if (dep2.pipe.inOrder()) {
633 // Pipeline dependency: compare counters.
634 if (dep1.counters[PipeBitA] < dep2.counters[PipeBitA])
635 return false;
636 auto pidx = utils::log2(dep2.pipe.inOrderPipe());
637 if (dep1.counters[pidx] < dep2.counters[pidx])
638 return false;
639 }
640 return true;
641}
642
643// Check whether one consumer dependency implies another, without checking regions.
644inline bool impliesWithoutRegion(const Dependency<true> &dep1, const Dependency<true> &dep2)
645{
646 // Writes never imply reads.
647 if (dep2.read() && dep1.write()) return false;
648
649 // Check pipes.
650 if (dep2.pipe != dep1.pipe)
651 return false;
652 if (dep2.depPipe != dep1.depPipe)
653 return false;
654 if (dep2.hasToken()) {
655 // Token dependency.
656 if (!dep1.hasToken())
657 return false;
658 if (!dep1.tokenDst && dep2.tokenDst)
659 return false;
660 if (dep1.token != dep2.token)
661 return false;
662 }
663 if (dep2.pipe.inOrder()) {
664 // Pipeline dependency. Consumer dependencies are only compared
665 // within BBs, so it's enough to check the A counter.
666 // Note distance check not always valid for A@ consumers >= XeHP,
667 // but is never used in these cases.
668 if (dep2.counters[PipeBitA] < dep1.counters[PipeBitA])
669 return false;
670 if (dep2.hasDist() != dep1.hasDist())
671 return false;
672 if (dep2.hasDist())
673 if (distance(dep1, dep2, dep2.pipe) - dep2.dist + dep1.dist < 0)
674 return false;
675 if (dep1.read() && dep2.write())
676 return false;
677 }
678 return true;
679}
680
681template <bool consumer>
682void DependencyTable<consumer>::clear()
683{
684 deps.clear();
685 frags.clear();
686 for (int l = 0; l < NListTypes; l++)
687 std::fill(heads[l].begin(), heads[l].end(), none);
688}
689
690template <bool consumer>
691void DependencyTable<consumer>::reserve(int icount)
692{
693 icount *= 4;
694 deps.reserve(icount);
695 frags.reserve(icount * 4);
696}
697
698template <bool consumer>
699bool DependencyTable<consumer>::insertPrepare(int listType, int listIdx, Dependency<consumer> &dep, bool checkWeaker, bool checkStronger)
700{
701 for (auto fragID = heads[listType][listIdx]; fragID != none;) {
702 auto &frag = frags[fragID];
703 auto &entry = deps[frag.depID];
704
705 bool noRegions = (dep.region.unspecified && entry.region.unspecified);
706
707 if (checkWeaker && impliesWithoutRegion(entry, dep)) {
708 if (noRegions)
709 return false;
710 dep.region.subtract(entry.region);
711 if (dep.region.empty())
712 return false;
713 }
714
715 if (checkStronger && impliesWithoutRegion(dep, entry)) {
716 entry.region.subtract(dep.region);
717 if (entry.region.empty() || noRegions)
718 remove(fragID);
719 }
720
721 fragID = frag.next[listType];
722 }
723
724 return true;
725}
726
727template <bool consumer>
728void DependencyTable<consumer>::insertLinkedList(int listType, int listIdx, int32_t fragID)
729{
730 auto &head = heads[listType][listIdx];
731 auto &frag = frags[fragID];
732
733 frag.next[listType] = head;
734 frag.prev[listType] = makeHeadLink(listIdx);
735 if (head != none)
736 frags[head].prev[listType] = fragID;
737 head = fragID;
738}
739
740template <bool consumer>
741template <bool iconsumer>
742int DependencyTable<consumer>::getPipeIndex(const Dependency<iconsumer> &dep)
743{
744 auto checkPipe = iconsumer ? dep.depPipe : dep.pipe.inOrderPipe();
745
746 if (!checkPipe)
747 return -1;
748
749 return utils::log2(checkPipe);
750}
751
752// Insert dependency into table.
753// If checkStronger set, remove any weaker existing dependencies.
754// If checkWeaker set, the input dependency's region will be adjusted to remove
755// overlapping stronger dependencies. If this dependency is already implied by the
756// table, it will not be added.
757// Return value indicates whether dependency added.
758template <bool consumer>
759bool DependencyTable<consumer>::insert(Dependency<consumer> &dep, bool checkWeaker, bool checkStronger)
760{
761 bool toAdd = true;
762 int pidx = getPipeIndex(dep);
763
764 bool checkToken = dep.hasToken()
765 && !(!consumer && dep.token == Dependency<consumer>::tokenTBD && !dep.region.unspecified);
766
767 if (checkToken)
768 toAdd = toAdd && insertPrepare(ListTypeToken, dep.token, dep, checkWeaker, checkStronger);
769 else if (!dep.region.unspecified) {
770 for (int r = dep.region.base; r < dep.region.base + dep.region.size; r++)
771 toAdd = toAdd && insertPrepare(ListTypeGRF, r, dep, checkWeaker, checkStronger);
772 } else if (pidx >= 0)
773 toAdd = toAdd && insertPrepare(ListTypePipe, pidx, dep, checkWeaker, checkStronger);
774
775 if (!toAdd)
776 return false;
777
778 auto depID = int(deps.size());
779 deps.push_back(dep);
780
781 // Create fragments.
782 bool hasRegion = !dep.region.unspecified && (dep.region.size > 0);
783 int ridx = hasRegion ? dep.region.base : grfListIdxUnspecified;
784 int nfrags = hasRegion ? dep.region.size : 1;
785 auto fragID = int(frags.size());
786
787 DependencyFragment frag;
788 frag.before = 0;
789 frag.after = nfrags - 1;
790 frag.depID = depID;
791 for (int l = 0; l < NListTypes; l++)
792 frag.prev[l] = frag.next[l] = none;
793
794 for (int o = 0; o < nfrags; o++, fragID++, frag.before++, frag.after--) {
795 frags.push_back(frag);
796 if (hasRegion || dep.region.unspecified)
797 insertLinkedList(ListTypeGRF, ridx++, fragID);
798 if (o > 0)
799 continue;
800 if (dep.hasToken())
801 insertLinkedList(ListTypeToken, dep.token, fragID);
802 if (pidx >= 0)
803 insertLinkedList(ListTypePipe, pidx, fragID);
804 }
805
806 return true;
807}
808
809template <bool consumer>
810void DependencyTable<consumer>::remove(int fragID)
811{
812 auto &frag0 = frags[fragID];
813 deps[frag0.depID].active = false;
814
815 fragID -= frag0.before;
816 int nfrag = frag0.before + frag0.after + 1;
817
818 for (int i = 0; i < nfrag; i++, fragID++) {
819 auto &frag = frags[fragID];
820
821 for (int l = 0; l < NListTypes; l++) {
822 if (isHeadLink(frag.prev[l]))
823 heads[l][readHeadLink(frag.prev[l])] = frag.next[l];
824 else if (frag.prev[l] != none)
825 frags[frag.prev[l]].next[l] = frag.next[l];
826 if (frag.next[l] != none)
827 frags[frag.next[l]].prev[l] = frag.prev[l];
828 if (i > 0)
829 break; // Only GRF linked lists contain multiple fragments per dependency.
830 }
831 }
832}
833
834// Find dependencies in the table intersecting the given dependency, and append them to the given list.
835// NB: the resulting list may contain duplicate dependencies.
836template <bool consumer>
837template <bool iconsumer>
838void DependencyTable<consumer>::findIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> &out)
839{
840 findAndRemoveIntersections(dep, &out, false);
841}
842
843template <bool consumer>
844template <bool iconsumer>
845void DependencyTable<consumer>::findAndRemoveIntersections(int listType, int listIdx, const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove)
846{
847 for (auto fragID = heads[listType][listIdx]; fragID != none;) {
848 auto &frag = frags[fragID];
849 auto &entry = deps[frag.depID];
850 if (doRemove && !consumer && (distance(entry, dep, entry.pipe) >= timeout(entry.pipe)))
851 remove(fragID);
852 else if (intersects(dep, entry)) {
853 if (out != nullptr)
854 out->push_back(entry);
855 if (doRemove)
856 remove(fragID);
857 }
858 fragID = frag.next[listType];
859 }
860}
861
862// Find dependencies in the table intersecting the given dependency.
863// Append them to the given list, and remove from table.
864// Also checks for, and removes, timed-out producer dependencies.
865template <bool consumer>
866template <bool iconsumer>
867void DependencyTable<consumer>::findAndRemoveIntersections(const Dependency<iconsumer> &dep, std::vector<Dependency<consumer>> *out, bool doRemove)
868{
869 PipeMask checkPipe = PipeMaskNone;
870 bool checkToken = false;
871 bool checkRegion = !dep.region.empty();
872
873 if (iconsumer) {
874 if (dep.swsb) {
875 checkToken = true;
876 checkPipe = dep.depPipe;
877 checkRegion = false;
878 }
879 } else {
880 checkToken = true;
881 checkPipe = dep.pipe.inOrderPipe();
882 }
883
884 // Handle token dependencies.
885 if (checkToken && dep.hasToken() && dep.token != dep.tokenTBD)
886 findAndRemoveIntersections(ListTypeToken, dep.token, dep, out, doRemove);
887
888 // Handle pipeline dependencies.
889 if (checkPipe & PipeMaskA) {
890 for (int pidx = 0; pidx < NPipes; pidx++)
891 findAndRemoveIntersections(ListTypePipe, pidx, dep, out, doRemove);
892 } else if (checkPipe != PipeMaskNone) {
893 int pidx = utils::log2(checkPipe);
894 findAndRemoveIntersections(ListTypePipe, pidx, dep, out, doRemove);
895 findAndRemoveIntersections(ListTypePipe, PipeBitA, dep, out, doRemove);
896 }
897
898 // Handle GRF dependencies.
899 if (checkRegion) {
900 int base = dep.region.unspecified ? 0 : dep.region.base;
901 int len = dep.region.unspecified ? 256 : dep.region.size;
902 for (int r = base; r < base + len; r++)
903 findAndRemoveIntersections(ListTypeGRF, r, dep, out, doRemove);
904 findAndRemoveIntersections(ListTypeGRF, grfListIdxUnspecified, dep, out, doRemove);
905 }
906}
907
908// Find dependencies in the table intersecting the given dependency, and remove them.
909template <bool consumer>
910template <bool iconsumer>
911void DependencyTable<consumer>::removeIntersections(const Dependency<iconsumer> &dep)
912{
913 findAndRemoveIntersections(dep, nullptr);
914}
915
916// Remove dependencies from the table matching a token mask.
917// Returns mask of unmatched tokens.
918template <bool consumer>
919uint32_t DependencyTable<consumer>::removeByTokenMask(uint32_t mask, bool dst)
920{
921 uint32_t unmatched = mask;
922
923 while (mask) {
924 uint32_t mask1 = mask & ~(mask & (mask - 1));
925 mask &= ~mask1;
926 int token = utils::log2(mask1);
927
928 for (auto fragID = heads[ListTypeToken][token]; fragID != none;) {
929 auto &frag = frags[fragID];
930 auto &entry = deps[frag.depID];
931
932 if (entry.tokenSrc || (entry.tokenDst && dst)) {
933 unmatched &= ~mask1;
934 remove(fragID);
935 }
936
937 fragID = frag.next[ListTypeToken];
938 }
939 }
940
941 return unmatched;
942}
943
944// Remove OOO writes intersecting the given region. Return mask of token IDs for these writes.
945template <bool consumer>
946uint32_t DependencyTable<consumer>::removeOOOWritesByRegion(const DependencyRegion &region)
947{
948 uint32_t removed = 0;
949
950 if (!region.unspecified) {
951 for (int r = region.base; r < region.base + region.size; r++) {
952 for (auto fragID = heads[ListTypeGRF][r]; fragID != none;) {
953 auto &frag = frags[fragID];
954 auto &entry = deps[frag.depID];
955
956 if (!entry.pipe.inOrder() && entry.write() && intersects(entry.region, region)) {
957 if (entry.token != entry.tokenTBD)
958 removed |= (1 << entry.token);
959 remove(fragID);
960 }
961
962 fragID = frag.next[ListTypeGRF];
963 }
964 }
965 }
966
967 return removed;
968}
969
970#ifdef NGEN_DEBUG
971inline void dumpPipeMask(PipeMask mask, bool spacers = true)
972{
973 if (spacers) {
974 std::cerr << char((mask & PipeMaskA) ? 'A' : ' ');
975 std::cerr << char((mask & PipeMaskF) ? 'F' : ' ');
976 std::cerr << char((mask & PipeMaskI) ? 'I' : ' ');
977 std::cerr << char((mask & PipeMaskL) ? 'L' : ' ');
978 std::cerr << char((mask & PipeMaskM) ? 'M' : ' ');
979 std::cerr << char((mask & PipeMaskO) ? 'O' : ' ');
980 } else {
981 if (mask & PipeMaskA) std::cerr << 'A';
982 if (mask & PipeMaskF) std::cerr << 'F';
983 if (mask & PipeMaskI) std::cerr << 'I';
984 if (mask & PipeMaskL) std::cerr << 'L';
985 if (mask & PipeMaskM) std::cerr << 'M';
986 if (mask & PipeMaskO) std::cerr << 'O';
987 if (mask == PipeMaskNone) std::cerr << '-';
988 }
989}
990
991void GeneralizedPipe::dump() const
992{
993 switch (v & vTypeMask) {
994 case vInOrder: dumpPipeMask(inOrderPipe(), false); break;
995 case vSystolic: std::cerr << 'D'; break;
996 case vMath: std::cerr << 'M'; break;
997 case vSend: std::cerr << 'S' << int(v & 0xF); break;
998 default: std::cerr << '?'; break;
999 }
1000}
1001
1002void DependencyRegion::dump() const
1003{
1004 if (unspecified)
1005 std::cerr << "[no region]";
1006 else if (size == 0)
1007 std::cerr << "[zero size region]";
1008 else {
1009 std::cerr << "r" << int(base);
1010 if (size > 1)
1011 std::cerr << "-r" << int(base + size - 1);
1012
1013 auto fullMask = ~uint32_t(0);
1014 bool partial = false;
1015 for (int ii = 0; ii < size; ii++)
1016 partial |= (masks[ii] != fullMask);
1017
1018 if (partial) {
1019 std::cerr << " (" << std::hex;
1020 for (int ii = 0; ii < size; ii++) {
1021 if (masks[ii] != fullMask)
1022 std::cerr << std::setw(sizeof(masks[ii]) * 2) << masks[ii];
1023 else
1024 std::cerr << "all";
1025 std::cerr << char((ii == (size - 1)) ? ')' : ' ');
1026 }
1027 std::cerr << std::dec;
1028 }
1029 }
1030}
1031
1032template <bool consumer>
1033void Dependency<consumer>::dump() const
1034{
1035 if (tokenTime > 0) {
1036 std::cerr << '[' << counters[PipeBitA] << " + " << tokenTime;
1037 std::cerr << ',' << inum();
1038 } else {
1039 std::cerr << '[';
1040 for (auto &counter : counters)
1041 std::cerr << counter << ',';
1042 pipe.dump();
1043 }
1044 std::cerr << ']';
1045 if (hasToken()) {
1046 std::cerr << " $";
1047 if (token == tokenTBD)
1048 std::cerr << '?';
1049 else
1050 std::cerr << std::hex << int(token) << std::dec;
1051 if (tokenSrc && !tokenDst)
1052 std::cerr << ".src";
1053 else if (tokenDst && !tokenSrc)
1054 std::cerr << ".dst";
1055 else
1056 std::cerr << " ";
1057 } else
1058 std::cerr << " ";
1059 if (dist > 0) {
1060 dumpPipeMask(depPipe, false);
1061 std::cerr << '@' << int(dist);
1062 } else
1063 std::cerr << " ";
1064
1065 std::cerr << (rw ? " write " : " read ");
1066 if (!region.unspecified)
1067 region.dump();
1068}
1069
1070template <bool consumer>
1071void DependencyTable<consumer>::dump() const
1072{
1073 std::cerr << (consumer ? "Consumers:\n" : "Producers:\n");
1074 for (size_t i = 0; i < deps.size(); i++) {
1075 if (!deps[i].active)
1076 continue;
1077 std::cerr << i << ":\t";
1078 deps[i].dump();
1079 std::cerr << std::endl;
1080 }
1081
1082 for (int l = 0; l < NListTypes; l++) {
1083 for (size_t i = 0; i < heads[l].size(); i++) {
1084 auto fragID = heads[l][i], lastFragID = makeHeadLink(i);
1085 if (fragID != none) {
1086 switch (l) {
1087 case ListTypeGRF:
1088 std::cerr << 'r';
1089 if (i == grfListIdxUnspecified)
1090 std::cerr << '?';
1091 else
1092 std::cerr << i;
1093 break;
1094 case ListTypeToken:
1095 std::cerr << '$';
1096 if (i == Dependency<consumer>::tokenTBD)
1097 std::cerr << '?';
1098 else
1099 std::cerr << i;
1100 break;
1101 case ListTypePipe:
1102 if (i > NPipes)
1103 std::cerr << '?';
1104 else
1105 std::cerr << "AFILMO"[i % (NPipes + 1)];
1106 break;
1107 }
1108 std::cerr << ":\t";
1109 while (fragID != none) {
1110 if (frags[fragID].prev[l] != lastFragID)
1111 std::cerr << "(bad last ptr) ";
1112 std::cerr << frags[fragID].depID << " -> ";
1113 lastFragID = fragID;
1114 fragID = frags[fragID].next[l];
1115 }
1116 std::cerr << std::endl;
1117 }
1118 }
1119 }
1120}
1121#endif
1122
1123/*****************/
1124/* Main Routines */
1125/*****************/
1126
1127template <typename Program>
1128inline bool hasAutoSWSB(HW hw, const Program &program)
1129{
1130 if (hw < HW::Gen12LP)
1131 return false;
1132 for (uint32_t n = 0; n < program.size(); n++)
1133 if (program[n].autoSWSB())
1134 return true;
1135 return false;
1136}
1137
1138// Get a list of basic blocks for this program.
1139template <typename Program>
1140inline BasicBlockList getBasicBlocks(HW hw, const Program &program)
1141{
1142 auto icount = int(program.size());
1143
1144 // Create map from BB head instructions to instruction #s.
1145 std::map<int, int> heads;
1146 heads.insert({0, 0});
1147
1148 // Scan through program and find all fixed jump targets. These will
1149 // be the BB heads (first instruction in block).
1150 // Also check for instructions which end blocks.
1151 for (int n = 0; n < icount; n++) {
1152 const auto &insn = program[n];
1153 int jip = -1, uip = -1;
1154 auto dests = insn.destinations(jip, uip);
1155
1156 if (dests == DestNextIP)
1157 continue;
1158
1159#ifdef NGEN_DEBUG_BB
1160 std::cerr << "Instruction " << n << " ->";
1161 if (dests & DestNextIP) std::cerr << " " << n + 1;
1162 if (dests & DestJIP) std::cerr << " " << n + jip;
1163 if (dests & DestUIP) std::cerr << " " << n + uip;
1164 std::cerr << std::endl;
1165#endif
1166
1167 heads.insert({n + 1, 0});
1168 if (dests & DestJIP) heads.insert({n + jip, 0});
1169 if (dests & DestUIP) heads.insert({n + uip, 0});
1170 }
1171
1172 // Create basic blocks and remember mapping from instruction #s to BBs.
1173 auto bbCount = uint32_t(heads.size());
1174 BasicBlockList list{bbCount};
1175
1176 int nextBB = 0;
1177 for (auto &head : heads) {
1178 auto istart = head.first;
1179 if (istart >= 0 && istart < icount) {
1180 head.second = nextBB;
1181 list[nextBB].id = nextBB;
1182 list[nextBB++].istart = istart;
1183 }
1184 }
1185
1186 bbCount = nextBB;
1187 list.resize(bbCount);
1188
1189 for (uint32_t i = 0; i < bbCount - 1; i++)
1190 list[i].iend = list[i + 1].istart;
1191 list[bbCount - 1].iend = icount;
1192
1193 // Scan through basic blocks again.
1194 for (auto &bb : list) {
1195 // Count in-order instructions in each pipe, and wrdep pseudo-instructions.
1196 for (auto &l : bb.lengths)
1197 l = 0;
1198 bb.wrdeps = 0;
1199
1200 for (uint32_t n = bb.istart; n < bb.iend; n++) {
1201 const auto &insn = program[n];
1202
1203 if (insn.opcode() == Opcode::wrdep)
1204 bb.wrdeps++;
1205 auto pipes = getPipeMask(hw, insn);
1206 for (int p = 0; p < NPipes; p++)
1207 if (pipes & (1 << p)) bb.lengths[p]++;
1208 }
1209
1210 // Identify successor BBs from final instruction.
1211 auto ntail = bb.iend - 1;
1212 const auto &insn = program[ntail];
1213 int jip = 0, uip = 0;
1214 auto dests = insn.destinations(jip, uip);
1215
1216 auto addSuccessor = [&](int inum) {
1217 if ((inum >= 0) && (inum < icount)) bb.succ.push_back(&list[heads[inum]]);
1218 };
1219
1220 if (dests & DestNextIP) addSuccessor(bb.iend);
1221 if (dests & DestJIP) addSuccessor(jip + ntail);
1222 if (dests & DestUIP) addSuccessor(uip + ntail);
1223
1224 // Add predecessor links to every successor.
1225 for (auto succ : bb.succ)
1226 succ->pred.push_back(&bb);
1227
1228 // Preallocate dependency memory.
1229 bb.producers.reserve(bb.iend - bb.istart);
1230 bb.consumers.reserve(bb.iend - bb.istart);
1231
1232 // Decode and cache operand regions.
1233 bb.opRegions.resize(bb.iend - bb.istart);
1234 for (uint32_t n = bb.istart; n < bb.iend; n++) {
1235 auto &regions = bb.opRegions[n - bb.istart];
1236 const auto &insn = program[n];
1237
1238 for (int srcN = -1; srcN < 3; srcN++) {
1239 regions[srcN + 1].hw = hw;
1240 if (!insn.getOperandRegion(regions[srcN + 1], srcN))
1241 regions[srcN + 1].clear();
1242 }
1243 }
1244 }
1245
1246 return list;
1247}
1248
1249template <typename Instruction>
1250inline bool canDefaultPipe(HW hw, const Instruction &insn)
1251{
1252 if (hw >= HW::XeHP && insn.opcode() == Opcode::mov_gen12 && (insn.dstTypecode() ^ insn.src0Typecode()) & 0x8)
1253 return false;
1254 if (hw >= HW::XeHPC && insn.dstTypecode() == 0xB /* :df */)
1255 return false;
1256 return true;
1257}
1258
1259// Read SWSB from instruction and output:
1260// * token dependency it produces, if any
1261// * dependencies it consumes
1262// * whether auto SWSB requested (bool return value)
1263// Assumes pipe information for this instruction already set up in consume dependency.
1264inline bool getSWSBDependencies(HW hw, const SWSBInfo &swsb, PipeMask defaultPipe, Dependency<false> &produce, Dependency<true> &consume)
1265{
1266 produce.token = 0;
1267 produce.tokenSrc = false;
1268 produce.tokenDst = false;
1269 consume.depPipe = PipeMaskNone;
1270 consume.dist = 0;
1271 consume.token = 0;
1272 consume.tokenSrc = false;
1273 consume.tokenDst = false;
1274 consume.swsb = true;
1275 bool enableAutoSWSB = true;
1276
1277 if (swsb.hasDist()) {
1278 auto pipe = swsb.getPipe();
1279 consume.depPipe = (hw == HW::Gen12LP) ? PipeMaskA :
1280 (pipe == Pipe::Default) ? defaultPipe
1281 : toMask(pipe);
1282 if (consume.depPipe) { // if is here to ignore default pipe deps for OOO instructions.
1283 consume.dist = swsb.parts.dist;
1284 enableAutoSWSB = false;
1285 }
1286 }
1287 if (swsb.hasToken()) {
1288 consume.token = swsb.parts.token;
1289 consume.tokenSrc = swsb.parts.src;
1290 consume.tokenDst = swsb.parts.dst;
1291 if (swsb.hasTokenSet()) {
1292 produce.token = consume.token;
1293 produce.tokenSrc = consume.tokenSrc;
1294 produce.tokenDst = consume.tokenDst;
1295 }
1296 }
1297
1298 return enableAutoSWSB;
1299}
1300
1301template <typename Instruction>
1302inline bool getSWSBDependencies(HW hw, const Instruction &insn, Dependency<false> &produce, Dependency<true> &consume)
1303{
1304 bool autoSWSB = insn.autoSWSB();
1305 autoSWSB &= getSWSBDependencies(hw, insn.swsb(), getPipe(hw, insn).inOrderPipe(), produce, consume);
1306 return autoSWSB;
1307}
1308
1309// Encode SWSB information.
1310template <typename Instruction>
1311inline SWSBInfo encodeSWSB(HW hw, const Instruction &insn, const Dependency<false> &produce, const Dependency<true> &consume)
1312{
1313 SWSBInfo swsb{};
1314
1315 if (produce.hasToken()) {
1316 swsb.parts.token = produce.token;
1317 swsb.parts.src = swsb.parts.dst = true;
1318 } else if (consume.tokenSrc) {
1319 swsb.parts.token = consume.token;
1320 swsb.parts.src = true;
1321 } else if (consume.tokenDst) {
1322 swsb.parts.token = consume.token;
1323 swsb.parts.dst = true;
1324 }
1325
1326 if (consume.hasDist()) {
1327 if (canDefaultPipe(hw, insn) && ((hw == HW::Gen12LP) || (GeneralizedPipe(consume.depPipe) == consume.pipe)))
1328 swsb.setPipe(Pipe::Default);
1329 else
1330 swsb.setPipe(fromMask(consume.depPipe));
1331 swsb.parts.dist = std::min<int>(consume.dist, 7);
1332 }
1333
1334 return swsb;
1335}
1336
1337// Check if ARF src/dst requires special handling
1338inline bool arfNeedsSync(ARFType type)
1339{
1340 return (type == ARFType::ce || type == ARFType::cr || type == ARFType::sr);
1341}
1342
1343// Get preferred SBID for a given GRF.
1344inline uint8_t preferredSBID(HW hw, uint8_t base)
1345{
1346 if (hw >= HW::XeHPC)
1347 return (base >> 2) & 0x1F;
1348 return (base >> 3) & 0xF;
1349}
1350
1351// Choose SBID for an OOO instruction, based on preceding OOO instructions.
1352template <typename Program>
1353inline uint8_t chooseSBID(HW hw, Program &program, const BasicBlock &bb, int32_t inum, int32_t counterA, const DependencyTable<false> &incoming, const DependencyTable<false> &producers, uint32_t maskDst)
1354{
1355 uint32_t unclaimed = (uint64_t(1) << tokenCount(hw)) - 1;
1356 std::array<int32_t, 32> pastExpiration;
1357 constexpr int32_t infinite = std::numeric_limits<int32_t>::max();
1358
1359 // Priority 1: choose SBID that is an explicit dst dependency for this instruction, if any.
1360 if (maskDst)
1361 return utils::bsf(maskDst);
1362
1363 // Otherwise, look through incoming OOO producers and accumulate most recent use of each token.
1364 for (auto &dist : pastExpiration) dist = infinite;
1365
1366 auto accumulateTokens = [&](const Dependency<false> &dep) {
1367 if (!dep.hasToken()) return;
1368
1369 auto depSWSB = program[dep.inum()].swsb();
1370 if (!depSWSB.hasTokenSet()) return;
1371
1372 auto token = depSWSB.parts.token;
1373 unclaimed &= ~(1 << token);
1374
1375 int32_t pe = counterA - (dep.counters[PipeBitA] + dep.tokenTime);
1376 pastExpiration[token] = std::min<int32_t>(pastExpiration[token], pe);
1377 };
1378
1379 incoming.forEach(accumulateTokens);
1380 producers.forEach(accumulateTokens);
1381
1382 int32_t bestPE = std::numeric_limits<int32_t>::min();
1383 uint8_t bestPESBID = 0;
1384 for (int token = 0; token < tokenCount(hw); token++) {
1385 if (pastExpiration[token] > bestPE) {
1386 bestPE = pastExpiration[token];
1387 bestPESBID = token;
1388 }
1389 }
1390
1391 // Priority 2: assign SBID based on base register of dst, src1, src0 (in that order),
1392 // if it's unclaimed or expired.
1393 for (int opNum : {-1, 1, 0}) {
1394 auto &region = bb.getOperandRegion(inum, opNum);
1395 if (region.size > 0) {
1396 auto sbid = preferredSBID(hw, region.base);
1397 if (pastExpiration[sbid] >= 0)
1398 return sbid;
1399 }
1400 }
1401
1402 // Priority 3: choose highest-numbered unclaimed SBID.
1403 if (unclaimed)
1404 return utils::bsr(unclaimed);
1405
1406 // Priority 4: choose token that's longest expired or closest to expiring.
1407 return bestPESBID;
1408}
1409
1410// Main dependency analysis.
1411// This is run three times on every BB.
1412// Phase 0
1413// Generate dependency tables for SBID assignment:
1414// - produced OOO dependencies: outgoing dependencies from this BB (w/o final SBIDs)
1415// - consumed dependencies: incoming dependencies this BB must synchronize on
1416// Phase 1
1417// Input:
1418// - incoming OOO dependencies, with expirations.
1419// Output:
1420// - produced dependencies: outgoing dependencies this BB creates and does not synchronize on
1421// - consumed dependencies: incoming dependencies this BB must synchronize on
1422// - SBIDs assigned where needed.
1423// Instructions whose dependencies are all inside this BB are scoreboarded now for efficiency.
1424// Phase 2
1425// Input: complete list of live dependencies.
1426// All unscoreboarded instructions are reanalyzed and scoreboarded now.
1427template <typename Program>
1428inline void analyze(HW hw, Program &program, BasicBlock &bb, int phase)
1429{
1430 const bool final = (phase == 2);
1431 const bool computeSWSB = (phase > 0);
1432 bool forceA1 = false;
1433 int inumChain = -1;
1434 uint32_t chainTokenMaskSrc = 0, chainTokenMaskDst = 0, chainTokenMaskDstX = 0;
1435 uint32_t wrdepTokenMaskDst = 0;
1436 Dependency<true> chainGenerated;
1437 std::array<int32_t, NPipes> counters;
1438 std::vector<Dependency<false>> depList, depListIncoming, chainProducers;
1439
1440 // Initialize "preconsumes." These are region-less consumes arising from SWSB.
1441 int noPreconsume = std::numeric_limits<int>::min();
1442 std::array<std::array<int, NPipes + 1>, NPipes> preconsumeIO;
1443 uint32_t preconsumeTokenSrc = 0, preconsumeTokenDst = 0;
1444
1445 auto recordIOPreconsumes = [&](Dependency<true> &generated) {
1446 if ((phase == 1) && generated.hasDist()) {
1447 auto spipes = generated.pipe.syncPipes(hw);
1448 auto dpipes = GeneralizedPipe(generated.depPipe).syncPipes(hw);
1449 for (int dpidx = 0; dpidx < NPipes; dpidx++)
1450 if (dpipes & (1 << dpidx))
1451 for (int pidx = 0; pidx <= NPipes; pidx++)
1452 if (spipes & (1 << pidx))
1453 preconsumeIO[dpidx][pidx] = std::max<int>(preconsumeIO[dpidx][pidx], counters[dpidx] - generated.dist);
1454 }
1455 };
1456
1457 if (phase == 1)
1458 for (auto &pcList : preconsumeIO)
1459 for (auto &pc : pcList)
1460 pc = noPreconsume;
1461
1462 // Initialize counters.
1463 for (auto &counter : counters)
1464 counter = 0;
1465
1466 for (uint32_t inum = bb.istart; inum < bb.iend; inum++) {
1467 auto &insn = program[inum];
1468 bool forceA1Next = false;
1469 bool atChainStart = false;
1470
1471 // Ignore illegal instructions.
1472 if (insn.opcode() == Opcode::illegal)
1473 continue;
1474
1475 // Process wrdep pseudo-instructions, which add OOO write dependencies on the next real instruction.
1476 if (insn.opcode() == Opcode::wrdep) {
1477 auto &region = bb.getOperandRegion(inum, 0);
1478 if (!region.empty())
1479 wrdepTokenMaskDst |= bb.producers.removeOOOWritesByRegion(region);
1480 continue;
1481 }
1482
1483 // Placeholder for dependency consumers from this instruction's operands.
1484 Dependency<true> consumeOp;
1485 consumeOp.counters = counters;
1486 consumeOp.pipe = getPipe(hw, insn);
1487
1488 // Read SWSB information for this instruction, if already present.
1489 Dependency<false> tokenInfo;
1490 Dependency<true> generated = consumeOp;
1491 bool autoSWSB = getSWSBDependencies(hw, insn, tokenInfo, generated);
1492
1493 // Check for beginning of {Atomic} chain.
1494 if (insn.atomic() && inumChain < 0) {
1495 inumChain = inum;
1496 atChainStart = true;
1497 }
1498
1499 // If token assigned, start by removing all live dependencies with this token.
1500 if (tokenInfo.hasToken()) {
1501 bb.producers.removeByTokenMask(1 << tokenInfo.token, true);
1502 preconsumeTokenSrc |= (1 << tokenInfo.token);
1503 preconsumeTokenDst |= (1 << tokenInfo.token);
1504 } else if (isVariableLatency(hw, insn.opcode())) {
1505 generated.token = tokenInfo.token = tokenInfo.tokenTBD;
1506 tokenInfo.tokenSrc = tokenInfo.tokenDst = true;
1507 }
1508
1509 // For sync.allrd/sync.allwr, consume matching dependencies and add preconsumes
1510 // for unmatched tokens.
1511 if (insn.opcode() == Opcode::sync) {
1512 auto fc = insn.syncFC();
1513 bool allrd = (fc == SyncFunction::allrd);
1514 bool allwr = (fc == SyncFunction::allwr);
1515
1516 if (allrd || allwr) {
1517 uint32_t imm;
1518 if (!insn.getImm32(imm))
1519 imm = ~0;
1520
1521 auto unmatched = bb.producers.removeByTokenMask(imm, allwr);
1522 preconsumeTokenSrc |= unmatched;
1523 if (allwr) preconsumeTokenDst |= unmatched;
1524 }
1525 }
1526
1527 // Grab pre-decoded operand regions for this instruction.
1528 auto &regions = bb.opRegions[inum - bb.istart];
1529
1530 // Check for cr/ce/sr destination operand, and force A@1 on the next instruction.
1531 ARFType dstARFType;
1532 forceA1Next |= (insn.getARFType(dstARFType, -1) && arfNeedsSync(dstARFType));
1533
1534 if (autoSWSB) {
1535 // If auto-SWSB has been requested for this instruction, analyze its source operands.
1536 // Start a list of dependencies for this instruction.
1537 depList.clear();
1538 depListIncoming.clear();
1539 bool foundAllDeps = true;
1540 uint32_t tokenMaskSrc = 0, tokenMaskDst = 0, tokenMaskDstX = 0;
1541 SWSBInfo syncSWSB;
1542
1543 if (!atChainStart && (inumChain >= 0)) {
1544 tokenMaskSrc = chainTokenMaskSrc;
1545 tokenMaskDst = chainTokenMaskDst;
1546 tokenMaskDstX = chainTokenMaskDstX;
1547 generated = chainGenerated;
1548 }
1549
1550 // Add in OOO dst dependencies from previous wrdep(s).
1551 tokenMaskDst |= wrdepTokenMaskDst;
1552 wrdepTokenMaskDst = 0;
1553
1554 // Jumps with unknown destination: preconsume all dependencies.
1555 if (inum == (bb.iend - 1)) {
1556 int jip, uip;
1557 if (insn.destinations(jip, uip) & DestUnknown) {
1558 tokenMaskDst = preconsumeTokenDst = (uint64_t(1) << tokenCount(hw)) - 1;
1559 for (auto &p : preconsumeIO[PipeBitA])
1560 p = 0;
1561 bb.producers.clear();
1562 bb.consumers.clear();
1563 syncSWSB = (hw == HW::Gen12LP) ? SWSB(1) : SWSB<AllPipes>(1);
1564 }
1565 }
1566
1567 // Check if we need to assign an SBID to this instruction.
1568 bool assignSBID = (phase == 1) && isVariableLatency(hw, insn.opcode()) && (tokenInfo.token == tokenInfo.tokenTBD) && !insn.atomic();
1569
1570 // Analyze operands.
1571 for (int srcN = 2; srcN >= -1; srcN--) {
1572 // Skip non-GRF operands.
1573 // Special case: check for cr/sr/ce source operands and force A@1 if any.
1574 if (regions[srcN + 1].empty()) {
1575 ARFType arfType;
1576 if ((srcN >= 0) && insn.getARFType(arfType, srcN) && arfNeedsSync(arfType)) {
1577 generated.depPipe = PipeMaskA;
1578 generated.dist = 1;
1579 }
1580 continue;
1581 }
1582
1583 // Create associated dependency consumer.
1584 consumeOp.rw = (srcN < 0);
1585 consumeOp.region = regions[srcN + 1];
1586
1587 // Remove all intersecting live producers from the table and save them.
1588 auto dStart = depList.size();
1589 bb.producers.findAndRemoveIntersections(consumeOp, &depList);
1590 auto dEnd = depList.size();
1591
1592 // Do the same for the incoming producers table if we need to assign an SBID.
1593 size_t dStartIncoming = 0, dEndIncoming = 0;
1594 if (assignSBID) {
1595 dStartIncoming = depListIncoming.size();
1596 bb.incoming.findAndRemoveIntersections(consumeOp, &depListIncoming);
1597 dEndIncoming = depListIncoming.size();
1598 }
1599
1600 // If not final, subtract each of them from original dependency region.
1601 // If anything remains, add to consumer table. If it is not implied
1602 // by existing consumers, we didn't find all dependencies.
1603 if (!final) {
1604 for (auto d = dStart; d < dEnd; d++)
1605 consumeOp.region.subtract(depList[d].region);
1606 if (!consumeOp.region.empty())
1607 foundAllDeps &= !bb.consumers.insertWeak(consumeOp);
1608 }
1609
1610 // Add dependencies to SWSB.
1611 if (computeSWSB) for (auto d = dStart; d < dEnd; d++) {
1612 auto &dep = depList[d];
1613 if (dep.pipe.inOrder()) {
1614 // Accumulate in-order dependencies.
1615 auto thisPipe = dep.pipe.inOrderPipe();
1616 auto thisDist = distance(dep, generated, thisPipe);
1617
1618 if (generated.depPipe == PipeMaskNone)
1619 generated.depPipe = thisPipe;
1620 else if (generated.depPipe != thisPipe)
1621 generated.depPipe = PipeMaskA;
1622
1623 if (generated.hasDist())
1624 generated.dist = std::min<int>(generated.dist, thisDist);
1625 else
1626 generated.dist = thisDist;
1627 } else if (dep.token != dep.tokenTBD) {
1628 // Remember out-of-order dependencies for later.
1629 if (dep.tokenSrc) tokenMaskSrc |= (1 << dep.token);
1630 if (dep.tokenDst) tokenMaskDst |= (1 << dep.token);
1631 }
1632 }
1633
1634 // Also collect incoming SBIDs if choosing an SBID.
1635 tokenMaskDstX = tokenMaskDst;
1636 if (assignSBID) for (auto d = dStartIncoming; d < dEndIncoming; d++) {
1637 auto &dep = depListIncoming[d];
1638 if (!dep.tokenDst) continue;
1639 if (dep.token != dep.tokenTBD)
1640 tokenMaskDstX |= (1 << dep.token);
1641 else {
1642 // Check SWSB again in case it was recently assigned.
1643 auto curSWSB = program[dep.inum()].swsb();
1644 if (curSWSB.hasTokenSet())
1645 tokenMaskDstX |= (1 << curSWSB.parts.token);
1646 }
1647 }
1648 }
1649
1650 // Transfer dependencies down the {Atomic} chain (will be put later on first instruction).
1651 if (insn.atomic()) {
1652 chainTokenMaskSrc = tokenMaskSrc;
1653 chainTokenMaskDst = tokenMaskDst;
1654 chainTokenMaskDstX = tokenMaskDstX;
1655 chainGenerated = generated;
1656 tokenMaskSrc = tokenMaskDst = 0;
1657 generated = consumeOp;
1658 }
1659
1660 // Always wait until phase 2 to assign SWSB to {Atomic} chains --
1661 // it's not known if all dependencies for the chain have been found until the end.
1662 if (inumChain >= 0 || insn.atomic())
1663 foundAllDeps = false;
1664
1665 // If token missing on OOO instruction, assign one during phase 1.
1666 if (assignSBID) {
1667 auto newToken = chooseSBID(hw, program, bb, inum, counters[PipeBitA], bb.incoming, bb.producers, tokenMaskDstX);
1668 generated.token = tokenInfo.token = newToken;
1669 generated.tokenSrc = generated.tokenDst = true;
1670 insn.setSWSB(SBID(generated.token).set);
1671 preconsumeTokenSrc |= (1 << tokenInfo.token);
1672 preconsumeTokenDst |= (1 << tokenInfo.token);
1673 tokenMaskSrc &= ~(1 << tokenInfo.token);
1674 tokenMaskDst &= ~(1 << tokenInfo.token);
1675 }
1676
1677 // Finalize SWSB computation.
1678 if (computeSWSB) {
1679 bool recordSWSB = (final || foundAllDeps);
1680 bool tokenAssigned = tokenInfo.hasToken() && (tokenInfo.token != tokenInfo.tokenTBD);
1681
1682 // If last instruction forced A@1, enforce now.
1683 if (forceA1) {
1684 generated.depPipe = PipeMaskA;
1685 generated.dist = 1;
1686 if (tokenMaskSrc || tokenMaskDst) {
1687 bb.producers.removeIntersections(generated);
1688 generated.depPipe = PipeMaskNone;
1689 generated.dist = 0;
1690 auto swsb = (hw == HW::Gen12LP) ? SWSB(1) : SWSB<AllPipes>(1);
1691 if (recordSWSB)
1692 bb.syncs.push_back({uint32_t(inum), swsb, SyncFunction::nop, 0});
1693 }
1694 }
1695
1696 // If dual dependency (token + pipe) on OOO instruction, use A pipe for send, sync for others.
1697 if ((generated.hasToken() || tokenAssigned) && generated.hasDist()) {
1698 if (insn.opcode() == Opcode::send || insn.opcode() == Opcode::sendc) {
1699 if (!(hw >= HW::XeHPC && (generated.depPipe == PipeMaskI || generated.depPipe == PipeMaskF)))
1700 generated.depPipe = PipeMaskA;
1701 } else {
1702 auto distGen = generated;
1703 distGen.tokenSrc = distGen.tokenDst = false;
1704 syncSWSB = encodeSWSB(hw, insn, Dependency<false>(), distGen);
1705 generated.dist = 0;
1706 generated.depPipe = PipeMaskNone;
1707 }
1708 }
1709
1710 // Handle OOO shootdown. Unless predicate is (W), it's possible that our token won't be claimed.
1711 // In this case, add sync on our token as a precaution. TODO: should check producer table.
1712 if (tokenAssigned && (insn.predicated() || inumChain >= 0))
1713 tokenMaskDst |= (1 << tokenInfo.token);
1714
1715 // Handle OOO dependencies.
1716 // - dst implies src
1717 // - use SWSB to mark src/dst w/o dist (in-order or no token) or dst + dist (in-order only, same pipe)
1718 // - add sync for any remaining dependencies.
1719 tokenMaskSrc &= ~tokenMaskDst;
1720
1721 bool defaultPipe = generated.pipe.inOrder() && (generated.depPipe == generated.pipe.inOrderPipe())
1722 && canDefaultPipe(hw, insn);
1723
1724 bool acceptsSrc = false, acceptsDst = false;
1725 if (generated.pipe.inOrder() || !tokenAssigned) {
1726 if (hw >= HW::XeHPC) {
1727 acceptsSrc = (generated.depPipe == PipeMaskNone || defaultPipe);
1728 acceptsDst = acceptsSrc || (generated.depPipe == PipeMaskA);
1729 } else {
1730 acceptsSrc = (generated.depPipe == PipeMaskNone);
1731 acceptsDst = acceptsSrc || defaultPipe;
1732 }
1733 }
1734
1735 if (tokenMaskDst && acceptsDst) {
1736 generated.token = utils::bsr(tokenMaskDst);
1737 generated.tokenDst = true;
1738 tokenMaskDst &= ~(1 << generated.token);
1739 } else if (tokenMaskSrc && acceptsSrc) {
1740 generated.token = utils::bsr(tokenMaskSrc);
1741 generated.tokenSrc = true;
1742 tokenMaskSrc &= ~(1 << generated.token);
1743 }
1744
1745 bool oneSrc = tokenMaskSrc && utils::is_zero_or_pow2(tokenMaskSrc);
1746 bool oneDst = tokenMaskDst && utils::is_zero_or_pow2(tokenMaskDst);
1747 bool oneSrcSWSB = false, oneDstSWSB = false;
1748 auto inumSync = (inumChain >= 0) ? inumChain : inum;
1749
1750 if (syncSWSB.empty()) {
1751 if (oneSrc) {
1752 syncSWSB = SBID(utils::bsr(tokenMaskSrc)).src;
1753 oneSrcSWSB = true;
1754 } else if (oneDst) {
1755 syncSWSB = SBID(utils::bsr(tokenMaskDst)).dst;
1756 oneDstSWSB = true;
1757 }
1758 }
1759 if (tokenMaskSrc && !oneSrcSWSB) {
1760 if (recordSWSB)
1761 bb.syncs.push_back({uint32_t(inumSync), syncSWSB, SyncFunction::allrd, tokenMaskSrc});
1762 syncSWSB = SWSBInfo();
1763 }
1764 if (tokenMaskDst && !oneDstSWSB) {
1765 if (recordSWSB)
1766 bb.syncs.push_back({uint32_t(inumSync), syncSWSB, SyncFunction::allwr, tokenMaskDst});
1767 syncSWSB = SWSBInfo();
1768 }
1769 if (!syncSWSB.empty() && recordSWSB)
1770 bb.syncs.push_back({uint32_t(inumSync), syncSWSB, SyncFunction::nop, 0});
1771
1772 // If final or nothing added to consumer table, assign SWSB.
1773 // For {Atomic} chains, put SWSB for consumed dependencies at head of chain.
1774 if (recordSWSB) {
1775 if (inumChain >= 0) {
1776 if (!insn.atomic()) {
1777 program[inumChain].setSWSB(encodeSWSB(hw, insn, Dependency<false>(), generated));
1778 insn.setSWSB(encodeSWSB(hw, insn, tokenInfo, Dependency<true>()));
1779 }
1780 } else
1781 insn.setSWSB(encodeSWSB(hw, insn, tokenInfo, generated));
1782 insn.clearAutoSWSB();
1783 }
1784
1785 // After assigning SWSB to in-order instructions, clean producer list of known SWSB and sync dependencies.
1786 if (tokenMaskSrc) bb.producers.removeByTokenMask(tokenMaskSrc, false);
1787 if (tokenMaskDst) bb.producers.removeByTokenMask(tokenMaskDst, true);
1788 bb.producers.removeIntersections(generated);
1789 }
1790 } else {
1791 // SWSB specified. Consume any dependencies associated with this SWSB.
1792 bb.producers.removeIntersections(generated);
1793
1794 // Record token dependencies for populating the consumer table.
1795 if (!final) {
1796 if (generated.tokenSrc) preconsumeTokenSrc |= (1 << tokenInfo.token);
1797 if (generated.tokenDst) preconsumeTokenDst |= (1 << tokenInfo.token);
1798 }
1799
1800 // Consume destination dependencies too.
1801 if (!regions[0].empty()) {
1802 consumeOp.region = regions[0];
1803 consumeOp.rw = true;
1804 bb.producers.removeIntersections(consumeOp);
1805 }
1806
1807 // Absorb wrdeps.
1808 wrdepTokenMaskDst = 0;
1809
1810 // Clear auto-SWSB bit if it was set.
1811 if (phase == 2)
1812 insn.clearAutoSWSB();
1813
1814 // Check for prior sync insertions and update tables appropriately.
1815 if (phase == 2) {
1816 for (const auto &sync: bb.syncs) {
1817 if (sync.inum != inum)
1818 continue;
1819
1820 bool allrd = (sync.fc == SyncFunction::allrd);
1821 bool allwr = (sync.fc == SyncFunction::allwr);
1822
1823 if (allrd || allwr) {
1824 auto unmatched = bb.producers.removeByTokenMask(sync.mask, allwr);
1825 preconsumeTokenSrc |= unmatched;
1826 if (allwr) preconsumeTokenDst |= unmatched;
1827 }
1828
1829 if (!sync.swsb.empty()) {
1830 Dependency<false> produce;
1831 Dependency<true> consume;
1832 (void) getSWSBDependencies(hw, sync.swsb, PipeMaskNone, produce, consume);
1833 bb.producers.removeIntersections(consume);
1834 if (consume.tokenSrc) preconsumeTokenSrc |= (1 << consume.token);
1835 if (consume.tokenDst) preconsumeTokenDst |= (1 << consume.token);
1836 }
1837 }
1838 }
1839 }
1840
1841 // First pass: record pipeline SWSB dependencies for later entry into consumer table.
1842 recordIOPreconsumes(generated);
1843
1844 // Add producer dependencies for all operands.
1845 // Also record instruction number and token timeout.
1846 // During phase 0, only do this for OOO instructions, and if dst not null, only dst.
1847 if ((phase > 0) || tokenInfo.hasToken()) {
1848 auto produceOp = consumeOp.cast();
1849 if (tokenInfo.hasToken()) {
1850 produceOp.token = tokenInfo.token;
1851 produceOp.tokenTime = estimateLatency(hw, insn);
1852 produceOp.inum() = inum;
1853 }
1854
1855 for (int srcN = -1; srcN < 3; srcN++) {
1856 if (!regions[srcN + 1].empty()) {
1857 produceOp.rw = (srcN < 0);
1858 if (tokenInfo.hasToken()) {
1859 produceOp.tokenSrc = (srcN >= 0);
1860 produceOp.tokenDst = (srcN < 0);
1861 }
1862 produceOp.region = regions[srcN + 1];
1863 if (insn.atomic())
1864 chainProducers.push_back(produceOp);
1865 else
1866 bb.producers.insertStrong(produceOp);
1867 if (phase == 0 && srcN == -1) break;
1868 }
1869 }
1870
1871 // Add producers for previous instructions in {Atomic} chain.
1872 if (!insn.atomic()) {
1873 for (auto &op : chainProducers) {
1874 if (!op.pipe.inOrder() || op.hasToken())
1875 op.token = tokenInfo.token;
1876 bb.producers.insertStrong(op);
1877 }
1878 chainProducers.clear();
1879 }
1880 }
1881
1882 // Check for end of {Atomic} chain.
1883 if (!insn.atomic())
1884 inumChain = -1;
1885
1886 // Increment counters.
1887 auto pipeMask = getPipeMask(hw, insn);
1888 for (int pidx = 0; pidx < NPipes; pidx++)
1889 if (pipeMask & (1 << pidx))
1890 counters[pidx]++;
1891
1892 forceA1 = forceA1Next;
1893 }
1894
1895 // Create sync insertion for any outstanding wrdep pseudo-instructions.
1896 if (wrdepTokenMaskDst && phase == 2)
1897 bb.syncs.push_back({uint32_t(bb.iend), SWSBInfo(), SyncFunction::allwr, wrdepTokenMaskDst});
1898
1899 // Add preconsume dependencies to consume list.
1900 if (!final) {
1901 // In-order preconsumes.
1902 if (phase == 1) for (int pOld = 0; pOld < NPipes; pOld++) {
1903 for (int pNew = 0; pNew <= NPipes; pNew++) {
1904 auto pc = preconsumeIO[pOld][pNew];
1905 if (pc != noPreconsume) {
1906 Dependency<true> preconsume;
1907 preconsume.swsb = true;
1908 preconsume.counters[pOld] = pc + 1;
1909 preconsume.dist = 1;
1910 preconsume.pipe = (1 << pNew);
1911 preconsume.depPipe = (1 << pOld);
1912 bb.consumers.insertStrong(preconsume);
1913 }
1914 }
1915 }
1916 // Out of order preconsumes.
1917 auto preconsumeToken = preconsumeTokenSrc | preconsumeTokenDst;
1918 for (int token = 0; token < tokenCount(hw); token++) {
1919 if (preconsumeToken & (1 << token)) {
1920 Dependency<true> preconsume;
1921 preconsume.swsb = true;
1922 preconsume.token = token;
1923 preconsume.tokenSrc = (preconsumeTokenSrc & (1 << token)) != 0;
1924 preconsume.tokenDst = (preconsumeTokenDst & (1 << token)) != 0;
1925 bb.consumers.insertStrong(preconsume);
1926 }
1927 }
1928 if (~preconsumeTokenSrc == 0 || ~preconsumeTokenDst == 0) {
1929 Dependency<true> preconsume;
1930 preconsume.swsb = true;
1931 preconsume.token = preconsume.tokenTBD;
1932 preconsume.tokenSrc = (~preconsumeTokenSrc == 0);
1933 preconsume.tokenDst = (~preconsumeTokenDst == 0);
1934 bb.consumers.insertStrong(preconsume);
1935 }
1936 }
1937}
1938
1939// Loop optimization. Add synchronizations before entering suspected loops to allow
1940// weaker SWSB inside the loop.
1941inline void loopOptimize(BasicBlock &bb)
1942{
1943 // Loop through successors to this BB, looking for ones with
1944 // exactly one incoming backedge, not from this BB.
1945 // If any found, for every dep in produce table:
1946 // For each selector successor:
1947 // If backedge pred's produce table doesn't imply this dep,
1948 // add syncs to consume it.
1949}
1950
1951// Propagate live dependencies forward through BB flow graph.
1952inline void propagate(std::vector<BasicBlock> &BBs)
1953{
1954 auto bbCount = int(BBs.size());
1955 bool done = false;
1956 std::vector<Dependency<true>> consumerList;
1957
1958 // Mark all incoming dependencies as new.
1959 for (auto &bb : BBs) {
1960 bb.label = 0;
1961 bb.producers.forEach([](Dependency<false> &dep) {
1962 dep.label = 0;
1963 });
1964 }
1965
1966 // Main loop: propagate live dependencies until all live tables stabilize.
1967 // This should require no more than bbCount loops.
1968 for (int age = 0; (age < bbCount) && !done; age++) {
1969 done = true;
1970 for (auto &bb : BBs) {
1971 // Examine each predecessor of this BB.
1972 for (auto pred : bb.pred) {
1973 if (pred->label < age) continue;
1974
1975 pred->producers.forEach([&](const Dependency<false> &dep) {
1976 // New incoming dependency? If not, skip it.
1977 if (dep.label != age) return;
1978
1979#ifdef NGEN_DEBUG_PROPAGATE
1980 std::cerr << "Prop BB " << pred->id << " -> " << bb.id << ": ";
1981 dep.dump();
1982#endif
1983
1984 // Adjust counters. Exception: OOO tokenless dependencies: counter[0] stores instruction #.
1985 auto newDep = dep;
1986 if (newDep.tokenTime == 0)
1987 for (int p = 0; p < NPipes; p++)
1988 newDep.counters[p] -= pred->lengths[p];
1989
1990 // If an in-order dependency, check for timeout, and skip it if so.
1991 if (newDep.pipe.inOrder()) {
1992 auto pidx = utils::log2(newDep.pipe.inOrderPipe());
1993 if (newDep.counters[pidx] <= -timeout(dep.pipe)) {
1994#ifdef NGEN_DEBUG_PROPAGATE
1995 std::cerr << " timeout\n";
1996#endif
1997 return;
1998 }
1999 }
2000
2001 // Intersect new dependency (producer) with killed (consumer) table.
2002 // Subtract all intersections from dependency.
2003 consumerList.clear();
2004 bb.consumers.findIntersections(newDep, consumerList);
2005
2006 for (auto &consumer : consumerList) {
2007 newDep.region.subtract(consumer.region);
2008 if (newDep.region.empty()) {
2009#ifdef NGEN_DEBUG_PROPAGATE
2010 std::cerr << " killed\n";
2011#endif
2012 return;
2013 }
2014 }
2015
2016#ifdef NGEN_DEBUG_PROPAGATE
2017 std::cerr << " propagated\n";
2018#endif
2019
2020 // Dependency is new and was not consumed.
2021 // Add to produce table unless it's already implied by existing producers.
2022 newDep.label = age + 1;
2023 if (bb.producers.insert(newDep)) {
2024 done = false;
2025 bb.label = age + 1;
2026 }
2027 });
2028 }
2029 }
2030 }
2031
2032#ifdef NGEN_SAFE
2033 if (!done) throw std::runtime_error("nGEN internal error: propagation failed.");
2034#endif
2035
2036 // Perform final half-propagation step (tail-to-head) to accumulate incoming producers
2037 // for each BB.
2038 for (auto &bb : BBs) {
2039 for (auto pred : bb.pred) {
2040 pred->producers.forEach([&](const Dependency<false> &dep) {
2041 // Adjust counters, except for OOO tokenless dependencies.
2042 auto newDep = dep;
2043 if (newDep.tokenTime == 0)
2044 for (int p = 0; p < NPipes; p++)
2045 newDep.counters[p] -= pred->lengths[p];
2046
2047 // If an in-order dependency, check for timeout, and skip it if so.
2048 if (newDep.pipe.inOrder()) {
2049 auto pidx = utils::log2(newDep.pipe.inOrderPipe());
2050 if (newDep.counters[pidx] <= -timeout(dep.pipe))
2051 return;
2052 }
2053
2054 bb.incoming.insert(newDep);
2055 });
2056 }
2057 }
2058}
2059
2060// Adjust jump targets for sync instruction insertions.
2061template <typename Program>
2062inline void adjustTargets(Program &program, BasicBlockList &list)
2063{
2064 std::map<int32_t, int32_t> shifts;
2065
2066 int32_t shift = 0;
2067 for (auto &bb : list) {
2068 shifts.insert({bb.istart, shift});
2069 shift += int32_t(bb.syncs.size()) - bb.wrdeps;
2070 }
2071
2072 shift = 0;
2073 for (auto &bb : list) {
2074 shift += int32_t(bb.syncs.size()) - bb.wrdeps;
2075 auto ntail = bb.iend - 1;
2076 auto &insn = program[ntail];
2077 int jip = -1, uip = -1;
2078 auto dests = insn.destinations(jip, uip);
2079 if (dests & DestJIP) insn.shiftJIP(shifts[ntail + jip] - shift);
2080 if (dests & DestUIP) insn.shiftUIP(shifts[ntail + uip] - shift);
2081 }
2082}
2083
2084// Entrypoint for automatic software scoreboarding.
2085// Returns the list of basic blocks, containing information on sync instructions to insert.
2086template <typename Program>
2087inline BasicBlockList autoSWSB(HW hw, Program &program)
2088{
2089 if (!hasAutoSWSB(hw, program))
2090 return BasicBlockList();
2091
2092 // Find basic blocks.
2093 BasicBlockList bbList = getBasicBlocks(hw, program);
2094
2095#ifdef NGEN_DEBUG
2096 std::cerr << "BASIC BLOCKS\n";
2097 std::cerr << "------------\n";
2098 for (auto &bb : bbList) {
2099 std::cerr << "Basic Block " << bb.id;
2100 if (!bb.pred.empty()) {
2101 std::cerr << " <-";
2102 for (auto &pred : bb.pred)
2103 std::cerr << ' ' << pred->id;
2104 }
2105 if (!bb.succ.empty()) {
2106 std::cerr << " ->";
2107 for (auto &succ : bb.succ)
2108 std::cerr << ' ' << succ->id;
2109 }
2110 std::cerr << std::endl;
2111 }
2112 std::cerr << std::endl;
2113#endif
2114
2115 // Analysis round 0: gather OOO instruction usage.
2116 for (auto &bb : bbList)
2117 analyze(hw, program, bb, 0);
2118
2119#ifdef NGEN_DEBUG
2120 std::cerr << "ANALYZE PHASE 0\n";
2121 std::cerr << "---------------\n";
2122 for (auto &bb : bbList) {
2123 std::cerr << "Basic Block " << bb.id << std::endl;
2124 bb.consumers.dump();
2125 bb.producers.dump();
2126 std::cerr << std::endl;
2127 }
2128#endif
2129
2130 // Propagate OOO dependency producers through BB graph.
2131 propagate(bbList);
2132 for (auto &bb : bbList) {
2133 bb.producers.clear();
2134 bb.consumers.clear();
2135 }
2136
2137#ifdef NGEN_DEBUG
2138 std::cerr << "PROPAGATE\n";
2139 std::cerr << "---------\n";
2140 for (auto &bb : bbList) {
2141 std::cerr << "Basic Block " << bb.id << std::endl;
2142 bb.incoming.dump();
2143 std::cerr << std::endl;
2144 }
2145#endif
2146
2147 // Analysis round 1: assign SBIDs and perform intra-BB analysis.
2148 for (auto &bb : bbList) {
2149 analyze(hw, program, bb, 1);
2150 bb.incoming.clear();
2151 }
2152
2153#ifdef NGEN_DEBUG
2154 std::cerr << "ANALYZE PHASE 1\n";
2155 std::cerr << "---------------\n";
2156 for (auto &bb : bbList) {
2157 std::cerr << "Basic Block " << bb.id << std::endl;
2158 bb.consumers.dump();
2159 bb.producers.dump();
2160 std::cerr << std::endl;
2161 }
2162#endif
2163
2164 // Loop optimization.
2165 for (auto &bb : bbList)
2166 loopOptimize(bb);
2167
2168 // Propagate live dependency producers through BB graph.
2169 propagate(bbList);
2170
2171 for (auto &bb : bbList) {
2172 std::swap(bb.incoming, bb.producers);
2173 bb.incoming.clear();
2174 }
2175
2176#ifdef NGEN_DEBUG
2177 std::cerr << "PROPAGATE\n";
2178 std::cerr << "---------\n";
2179 for (auto &bb : bbList) {
2180 std::cerr << "Basic Block " << bb.id << std::endl;
2181 bb.producers.dump();
2182 std::cerr << std::endl;
2183 }
2184#endif
2185
2186 // Analysis round 2: final SWSB assignment.
2187 for (auto &bb : bbList)
2188 analyze(hw, program, bb, 2);
2189
2190#ifdef NGEN_DEBUG
2191 std::cerr << "ANALYZE PHASE 2\n";
2192 std::cerr << "---------------\n";
2193 for (auto &bb : bbList) {
2194 std::cerr << "Basic Block " << bb.id << std::endl;
2195 bb.consumers.dump();
2196 bb.producers.dump();
2197 std::cerr << std::endl;
2198 }
2199#endif
2200
2201 // Adjust jump targets after sync insertions.
2202 adjustTargets(program, bbList);
2203
2204 return bbList;
2205}
2206
2207} /* namespace autoswsb */
2208} /* namespace ngen */
2209
2210// Instruction interface:
2211// SWSBInfo swsb() const;
2212// void setSWSB(SWSBInfo swsb) const;
2213// Opcode opcode() const;
2214// SyncFunction syncFC() const;
2215// SharedFunction sfid() const;
2216// DestinationMask destinations(int &jip, int &uip) const;
2217// bool getOperandRegion(DependencyRegion &region, int opNum) const; // returns false if no such operand.
2218// bool getImm32(uint32_t &imm) const;
2219//
2220// Program interface:
2221// Instruction operator[](int inum);
2222// size_t size() const;
2223
2224#endif /* NGEN_AUTOSWSB_HPP */
2225