1/*******************************************************************************
2* Copyright 2019-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17/*
18 * Do not #include this file directly; ngen uses it internally.
19 */
20
21
22// Pseudo-instructions and macros.
23template <typename DT = void>
24void min_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
25 sel(mod | lt | f0[0], dst, src0, src1);
26}
27template <typename DT = void>
28void min_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
29 sel(mod | lt | f0[0], dst, src0, src1);
30}
31#ifndef NGEN_WINDOWS_COMPAT
32template <typename DT = void>
33void min(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
34 sel(mod | lt | f0[0], dst, src0, src1);
35}
36template <typename DT = void>
37void min(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
38 sel(mod | lt | f0[0], dst, src0, src1);
39}
40#endif
41template <typename DT = void>
42void max_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
43 sel(mod | ge | f0[0], dst, src0, src1);
44}
45template <typename DT = void>
46void max_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
47 sel(mod | ge | f0[0], dst, src0, src1);
48}
49#ifndef NGEN_WINDOWS_COMPAT
50template <typename DT = void>
51void max(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
52 sel(mod | ge | f0[0], dst, src0, src1);
53}
54template <typename DT = void>
55void max(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
56 sel(mod | ge | f0[0], dst, src0, src1);
57}
58#endif
59
60template <typename DT = void>
61void bfi(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1, const RegData &src2, const RegData &src3) {
62 bfi1(mod, dst, src0, src1);
63 bfi2(mod, dst, dst, src2, src3);
64}
65
66// Brief compare instructions.
67template <typename DT = void>
68void cmp(const InstructionModifier &mod, const RegData &src0, const RegData &src1) {
69 auto dt = getDataType<DT>();
70 if (dt == DataType::invalid)
71 dt = src0.getType();
72 cmp<DT>(mod, null.retype(dt), src0, src1);
73}
74template <typename DT = void>
75void cmp(const InstructionModifier &mod, const RegData &src0, const Immediate &src1) {
76 auto dt = getDataType<DT>();
77 if (dt == DataType::invalid)
78 dt = src0.getType();
79 cmp<DT>(mod, null.retype(dt), src0, src1);
80}
81
82// Brief math instructions.
83template <typename DT = void>
84void cos(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
85 math<DT>(mod, MathFunction::cos, dst, src0);
86}
87template <typename DT = void>
88void exp(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
89 math<DT>(mod, MathFunction::exp, dst, src0);
90}
91template <typename DT = void>
92void fdiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
93 math<DT>(mod, MathFunction::fdiv, dst, src0, src1);
94}
95template <typename DT = void>
96void fdiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
97 math<DT>(mod, MathFunction::fdiv, dst, src0, src1);
98}
99template <typename DT = void>
100void idiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
101 math<DT>(mod, MathFunction::idiv, dst, src0, src1);
102}
103template <typename DT = void>
104void idiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
105 math<DT>(mod, MathFunction::idiv, dst, src0, src1);
106}
107template <typename DT = void>
108void inv(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
109 math<DT>(mod, MathFunction::inv, dst, src0);
110}
111template <typename DT = void>
112void invm(const InstructionModifier &mod, const ExtendedReg &dst, const ExtendedReg &src0, const ExtendedReg &src1) {
113 math<DT>(mod, MathFunction::invm, dst, src0, src1);
114}
115template <typename DT = void>
116void iqot(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
117 math<DT>(mod, MathFunction::iqot, dst, src0, src1);
118}
119template <typename DT = void>
120void iqot(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
121 math<DT>(mod, MathFunction::iqot, dst, src0, src1);
122}
123template <typename DT = void>
124void irem(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
125 math<DT>(mod, MathFunction::irem, dst, src0, src1);
126}
127template <typename DT = void>
128void irem(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
129 math<DT>(mod, MathFunction::irem, dst, src0, src1);
130}
131template <typename DT = void>
132void log(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
133 math<DT>(mod, MathFunction::log, dst, src0);
134}
135template <typename DT = void>
136void pow(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) {
137 math<DT>(mod, MathFunction::pow, dst, src0, src1);
138}
139template <typename DT = void>
140void pow(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) {
141 math<DT>(mod, MathFunction::pow, dst, src0, src1);
142}
143template <typename DT = void>
144void rsqt(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
145 math<DT>(mod, MathFunction::rsqt, dst, src0);
146}
147template <typename DT = void>
148void rsqtm(const InstructionModifier &mod, const ExtendedReg &dst, const ExtendedReg &src0) {
149 math<DT>(mod, MathFunction::rsqtm, dst, src0);
150}
151template <typename DT = void>
152void sin(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
153 math<DT>(mod, MathFunction::sin, dst, src0);
154}
155template <typename DT = void>
156void sqt(const InstructionModifier &mod, const RegData &dst, const RegData &src0) {
157 math<DT>(mod, MathFunction::sqt, dst, src0);
158}
159
// Helper for the *_ieee macro sequences below: the n-th temporary register,
// retyped to dst's data type. Expands in terms of the names `tmp` and `dst`,
// so both must be in scope wherever it is used.
#define TMP(n) tmp[n].retype(dst.getType())
161
// IEEE 754-compliant divide math macro sequence.
// Requires GRFs initialized with 0.0 and 1.0, as well as temporary GRFs (4 for single precision, 5 for double precision).
// dst, num, denom must be distinct GRFs.
//
//   DT    - optional data type; when it does not name a valid type, dst's type selects the sequence.
//   mod   - modifier applied to every math/madm instruction in the sequence.
//   flag  - flag register written by the initial invm and tested by the following if_.
//   dst   - destination (also used as scratch during the sequence).
//   num   - numerator.
//   denom - denominator.
//   zero  - register expected to hold 0.0.
//   one   - register expected to hold 1.0.
//   tmp   - indexable set of temporaries, accessed as tmp[n] through the TMP macro.
//   cfmod - modifier for the if_/endif control-flow instructions; when its
//           execution size is 0 (the default-constructed value), mod is used instead.
//
// Half precision emits a single fdiv. Any type other than hf/f/df throws
// invalid_type_exception when NGEN_SAFE is defined and otherwise emits nothing.
template <typename DT = void, typename A>
void fdiv_ieee(const InstructionModifier &mod, FlagRegister flag, RegData dst, RegData num, RegData denom,
               RegData zero, RegData one, const A &tmp, InstructionModifier cfmod = InstructionModifier())
{
    // Resolve the working type: explicit DT wins, otherwise take it from dst.
    DataType dt = getDataType<DT>();
    if (dt == DataType::invalid)
        dt = dst.getType();
    // No explicit control-flow modifier supplied: reuse the arithmetic one.
    if (cfmod.getExecSize() == 0)
        cfmod = mod;

    Label labelSkip;

    switch (dt) {
        case DataType::hf:
            // Half precision: no macro sequence, just the plain math divide.
            fdiv<DT>(mod, dst, num, denom);
            break;
        case DataType::f:
            // invm writes `flag`; the madm refinement below is wrapped in an
            // if_/endif predicated on the inverted flag.
            invm<DT>(mod | eo | flag, dst | mme0, num | nomme, denom | nomme);
            if_(cfmod | ~flag, labelSkip);

            // Single-precision madm sequence (uses TMP(0)..TMP(3)); the final
            // madm writes dst with nomme.
            madm<DT>(mod, TMP(0) | mme1, zero | nomme, num | nomme, dst | mme0);
            madm<DT>(mod, TMP(1) | mme2, one | nomme, -denom | nomme, dst | mme0);
            madm<DT>(mod, TMP(2) | mme3, dst | mme0, TMP(1) | mme2, dst | mme0);
            madm<DT>(mod, TMP(3) | mme4, num | nomme, -denom | nomme, TMP(0) | mme1);
            madm<DT>(mod, TMP(0) | mme5, TMP(0) | mme1, TMP(3) | mme4, TMP(2) | mme3);
            madm<DT>(mod, TMP(1) | mme6, num | nomme, -denom | nomme, TMP(0) | mme5);
            madm<DT>(mod, dst | nomme, TMP(0) | mme5, TMP(1) | mme6, TMP(2) | mme3);

            mark(labelSkip);
            endif(cfmod);
            break;
        case DataType::df:
            // Same structure as the single-precision case, with a longer
            // refinement sequence.
            invm<DT>(mod | eo | flag, dst | mme0, num | nomme, denom | nomme);
            if_(cfmod | ~flag, labelSkip);

            // Double-precision madm sequence (uses TMP(0)..TMP(4)); the final
            // madm writes dst with nomme.
            madm<DT>(mod, TMP(0) | mme1, zero | nomme, num | nomme, dst | mme0);
            madm<DT>(mod, TMP(1) | mme2, one | nomme, -denom | nomme, dst | mme0);
            madm<DT>(mod, TMP(2) | mme3, num | nomme, -denom | nomme, TMP(0) | mme1);
            madm<DT>(mod, TMP(3) | mme4, dst | mme0, TMP(1) | mme2, dst | mme0);
            madm<DT>(mod, TMP(4) | mme5, one | nomme, -denom | nomme, TMP(3) | mme4);
            madm<DT>(mod, dst | mme6, dst | mme0, TMP(1) | mme2, TMP(3) | mme4);
            madm<DT>(mod, TMP(0) | mme7, TMP(0) | mme1, TMP(2) | mme3, TMP(3) | mme4);
            madm<DT>(mod, TMP(3) | mme0, TMP(3) | mme4, dst | mme6, TMP(4) | mme5);
            madm<DT>(mod, TMP(2) | mme1, num | nomme, -denom | nomme, TMP(0) | mme7);
            madm<DT>(mod, dst | nomme, TMP(0) | mme7, TMP(2) | mme1, TMP(3) | mme0);

            mark(labelSkip);
            endif(cfmod);
            break;
        default:
            // Unsupported type: reported only in NGEN_SAFE builds.
#ifdef NGEN_SAFE
            throw invalid_type_exception();
#endif
            break;
    }
}
221
// IEEE 754-compliant reciprocal math macro sequence.
// Requires GRF initialized with 1.0, as well as 3 temporary GRFs.
// dst and src must be distinct GRFs.
//
//   DT    - optional data type; when it does not name a valid type, dst's type selects the sequence.
//   mod   - modifier applied to every math/madm instruction in the sequence.
//   flag  - flag register written by the initial invm and tested by the following if_.
//   dst   - destination (also used as scratch during the sequence).
//   src   - value whose reciprocal is computed.
//   one   - register expected to hold 1.0; also serves as the numerator passed to invm.
//   tmp   - indexable set of temporaries, accessed as tmp[n] through the TMP macro.
//   cfmod - modifier for the if_/endif control-flow instructions; when its
//           execution size is 0 (the default-constructed value), mod is used instead.
//
// Half precision emits a single inv. Any type other than hf/f/df throws
// invalid_type_exception when NGEN_SAFE is defined and otherwise emits nothing.
template <typename DT = void, typename A>
void inv_ieee(const InstructionModifier &mod, FlagRegister flag, RegData dst, RegData src, RegData one,
              const A &tmp, InstructionModifier cfmod = InstructionModifier())
{
    // Resolve the working type: explicit DT wins, otherwise take it from dst.
    DataType dt = getDataType<DT>();
    if (dt == DataType::invalid)
        dt = dst.getType();
    // No explicit control-flow modifier supplied: reuse the arithmetic one.
    if (cfmod.getExecSize() == 0)
        cfmod = mod;

    Label labelSkip;

    switch (dt) {
        case DataType::hf:
            // Half precision: no macro sequence, just the plain math reciprocal.
            inv<DT>(mod, dst, src);
            break;
        case DataType::f:
            // invm writes `flag`; the madm refinement below is wrapped in an
            // if_/endif predicated on the inverted flag.
            invm<DT>(mod | eo | flag, dst | mme0, one | nomme, src | nomme);
            if_(cfmod | ~flag, labelSkip);

            // Single-precision madm sequence (uses TMP(0)..TMP(2)).
            madm<DT>(mod, TMP(1) | mme2, one | nomme, -src | nomme, dst | mme0);
            madm<DT>(mod, TMP(2) | mme3, dst | mme0, TMP(1) | mme2, dst | mme0);
            madm<DT>(mod, TMP(0) | mme5, dst | mme0, TMP(1) | mme2, TMP(2) | mme3);
            madm<DT>(mod, TMP(1) | mme6, one | nomme, -src | nomme, TMP(0) | mme5);
            madm<DT>(mod, dst | nomme, TMP(0) | mme5, TMP(1) | mme6, TMP(2) | mme3);

            mark(labelSkip);
            endif(cfmod);
            break;
        case DataType::df:
            invm<DT>(mod | eo | flag, dst | mme0, one | nomme, src | nomme);
            if_(cfmod | ~flag, labelSkip);

            // Double-precision madm sequence (uses TMP(0)..TMP(2)).
            madm<DT>(mod, TMP(0) | mme2, one | nomme, -src | nomme, dst | mme0);
            madm<DT>(mod, TMP(1) | mme4, dst | mme0, TMP(0) | mme2, dst | mme0);
            madm<DT>(mod, TMP(2) | mme5, one | nomme, -src | nomme, TMP(1) | mme4);
            madm<DT>(mod, dst | mme6, dst | mme0, TMP(0) | mme2, TMP(1) | mme4);
            madm<DT>(mod, TMP(1) | mme0, TMP(1) | mme4, dst | mme6, TMP(2) | mme5);
            madm<DT>(mod, TMP(0) | mme1, one | nomme, -src | nomme, dst | mme6);
            madm<DT>(mod, dst | nomme, dst | mme6, TMP(0) | mme1, TMP(1) | mme0);

            mark(labelSkip);
            endif(cfmod);
            break;
        default:
            // Unsupported type: reported only in NGEN_SAFE builds.
#ifdef NGEN_SAFE
            throw invalid_type_exception();
#endif
            break;
    }
}
276
// IEEE 754-compliant square root macro sequence.
// Requires GRFs initialized with 0.0 and 0.5 (also 1.0 for double precision),
// and temporary GRFs (3 for single precision, 4 for double precision).
// dst and src must be distinct GRFs.
//
//   DT      - optional data type; when it does not name a valid type, dst's type selects the sequence.
//   mod     - modifier applied to every math/madm instruction in the sequence.
//   flag    - flag register written by the initial rsqtm and tested by the following if_.
//   dst     - destination (also used as scratch during the sequence).
//   src     - value whose square root is computed.
//   zero    - register expected to hold 0.0.
//   oneHalf - register expected to hold 0.5.
//   one     - register expected to hold 1.0; only read by the double-precision sequence.
//   tmp     - indexable set of temporaries, accessed as tmp[n] through the TMP macro.
//   cfmod   - modifier for the if_/endif control-flow instructions; when its
//             execution size is 0 (the default-constructed value), mod is used instead.
//
// Half precision emits a single sqt. Any type other than hf/f/df throws
// invalid_type_exception when NGEN_SAFE is defined and otherwise emits nothing.
template <typename DT = void, typename A>
void sqt_ieee(const InstructionModifier &mod, FlagRegister flag, RegData dst, RegData src,
              RegData zero, RegData oneHalf, RegData one, const A &tmp, InstructionModifier cfmod = InstructionModifier())
{
    // Resolve the working type: explicit DT wins, otherwise take it from dst.
    DataType dt = getDataType<DT>();
    if (dt == DataType::invalid)
        dt = dst.getType();
    // No explicit control-flow modifier supplied: reuse the arithmetic one.
    if (cfmod.getExecSize() == 0)
        cfmod = mod;

    Label labelSkip;

    switch (dt) {
        case DataType::hf:
            // Half precision: no macro sequence, just the plain math square root.
            sqt<DT>(mod, dst, src);
            break;
        case DataType::f:
            // rsqtm writes `flag`; the madm refinement below is wrapped in an
            // if_/endif predicated on the inverted flag.
            rsqtm<DT>(mod | eo | flag, dst | mme0, src | nomme);
            if_(cfmod | ~flag, labelSkip);

            // Single-precision madm sequence (uses TMP(0)..TMP(2)).
            madm<DT>(mod, TMP(0) | mme1, zero | nomme, oneHalf | nomme, dst | mme0);
            madm<DT>(mod, TMP(1) | mme2, zero | nomme, src | nomme, dst | mme0);
            madm<DT>(mod, TMP(2) | mme3, oneHalf | nomme, -TMP(1) | mme2, TMP(0) | mme1);
            madm<DT>(mod, TMP(0) | mme4, TMP(0) | mme1, TMP(2) | mme3, TMP(0) | mme1);
            madm<DT>(mod, dst | mme5, TMP(1) | mme2, TMP(2) | mme3, TMP(1) | mme2);
            madm<DT>(mod, TMP(2) | mme6, src | nomme, -dst | mme5, dst | mme5);
            madm<DT>(mod, dst | nomme, dst | mme5, TMP(0) | mme4, TMP(2) | mme6);

            mark(labelSkip);
            endif(cfmod);
            break;
        case DataType::df:
            rsqtm<DT>(mod | eo | flag, dst | mme0, src | nomme);
            if_(cfmod | ~flag, labelSkip);

            // Double-precision madm sequence (uses TMP(0)..TMP(3)).
            // Note that here `zero` is tagged mme0 rather than nomme.
            madm<DT>(mod, TMP(0) | mme1, zero | mme0, oneHalf | nomme, dst | mme0);
            madm<DT>(mod, TMP(1) | mme2, zero | mme0, src | nomme, dst | mme0);
            madm<DT>(mod, TMP(2) | mme3, oneHalf | nomme, -TMP(1) | mme2, TMP(0) | mme1);
            madm<DT>(mod, TMP(3) | mme4, one | nomme, oneHalf | nomme, dst | nomme);
            madm<DT>(mod, TMP(3) | mme5, one | nomme, TMP(3) | mme4, TMP(2) | mme3);
            madm<DT>(mod, dst | mme6, zero | mme0, TMP(2) | mme3, TMP(1) | mme2);
            madm<DT>(mod, TMP(2) | mme7, zero | mme0, TMP(2) | mme3, TMP(0) | mme1);
            madm<DT>(mod, dst | mme6, TMP(1) | mme2, TMP(3) | mme5, dst | mme6);
            madm<DT>(mod, TMP(3) | mme5, TMP(0) | mme1, TMP(3) | mme5, TMP(2) | mme7);
            madm<DT>(mod, TMP(0) | mme1, src | nomme, -dst | mme6, dst | mme6);
            madm<DT>(mod, dst | nomme, dst | mme6, TMP(0) | mme1, TMP(3) | mme5);

            mark(labelSkip);
            endif(cfmod);
            break;
        default:
            // Unsupported type: reported only in NGEN_SAFE builds.
#ifdef NGEN_SAFE
            throw invalid_type_exception();
#endif
            break;
    }
}
338
339#undef TMP
340
341// Thread spawner messages.
342void threadend(const InstructionModifier &mod, const RegData &r0_info) {
343 auto sf = (hardware <= HW::XeHP) ? SharedFunction::ts
344 : SharedFunction::gtwy;
345 uint32_t exdesc = 0x20 | (static_cast<int>(sf) & 0xF);
346 send(8 | EOT | mod | NoMask, null, r0_info, exdesc, 0x2000010);
347}
348
349void threadend(const RegData &r0_info) { threadend(InstructionModifier(), r0_info); }
350
351// Gateway messages.
352void barriermsg(const InstructionModifier &mod, const GRF &header)
353{
354 uint32_t exdesc = static_cast<int>(SharedFunction::gtwy) & 0xF;
355 send(1 | mod | NoMask, null, header, exdesc, 0x2000004);
356}
357
358void barriermsg(const GRF &header) { barriermsg(InstructionModifier(), header); }
359
360// Prepare barrier header.
361void barrierheader(const GRF &header, const GRF &r0_info = r0) {
362 if (hardware >= HW::XeHPG) {
363 mov(1 | NoMask, header.hf(4), Immediate::hf(0));
364 mov(2 | NoMask, header.ub(10)(1), r0_info.ub(11)(0));
365 } else
366 and_(8 | NoMask, header.ud(), r0_info.ud(2), uint32_t((hardware >= HW::Gen11) ? 0x7F000000 : 0x8F000000));
367}
368
369void barriersignal(const InstructionModifier &mod, const GRF &temp, const GRF &r0_info = r0)
370{
371 barrierheader(temp, r0_info);
372 barriermsg(mod, temp);
373}
374
375void barriersignal(const InstructionModifier &mod, const GRF &temp, uint32_t threadCount, const GRF &r0_info = r0)
376{
377 if (hardware >= HW::XeHPG)
378 mov(1 | NoMask, temp.ud(2), (threadCount << 24) | (threadCount << 16));
379 else
380 {
381 and_(8 | NoMask, temp.ud(), r0_info.ud(2), uint32_t((hardware >= HW::Gen11) ? 0x7F000000 : 0x8F000000));
382 mov(1 | NoMask, temp.ub(9), 0x80 | (threadCount & 0x7F));
383 }
384 barriermsg(mod, temp);
385}
386
387void barriersignal(const GRF &temp, const GRF &r0_info = r0) { barriersignal(InstructionModifier(), temp, r0_info); }
388void barriersignal(const GRF &temp, uint32_t threadCount, const GRF &r0_info = r0) { barriersignal(InstructionModifier(), temp, threadCount, r0_info); }
389
390// Named barriers.
391void barriersignal(const InstructionModifier &mod, uint32_t barrierID, const GRF &temp, const GRF &r0_info = r0)
392{
393#ifdef NGEN_SAFE
394 if (hardware != HW::XeHPC)
395 throw unsupported_message();
396#endif
397 mov(1 | NoMask, temp.uw(4), uint8_t(barrierID));
398 mov(2 | NoMask, temp.ub(10)(1), r0_info.ub(11)(0));
399 barriermsg(mod, temp);
400}
401
402void barriersignal(const InstructionModifier &mod, uint32_t barrierID, const GRF &temp, BarrierType barrierType, uint32_t producers, uint32_t consumers)
403{
404#ifdef NGEN_SAFE
405 if (hardware != HW::XeHPC)
406 throw unsupported_message();
407#endif
408 mov(1 | NoMask, temp.ud(2), (barrierID & 0xFF) | (static_cast<uint32_t>(barrierType) << 14) | ((producers & 0xFF) << 16) | ((consumers & 0xFF) << 24));
409 barriermsg(mod, temp);
410}
411
412void barriersignal(uint32_t barrierID, const GRF &temp, const GRF &r0_info = r0) { barriersignal(InstructionModifier(), barrierID, temp, r0_info); }
413void barriersignal(uint32_t barrierID, const GRF &temp, BarrierType barrierType, uint32_t producers, uint32_t consumers) { barriersignal(InstructionModifier(), barrierID, temp, barrierType, producers, consumers); }
414
415void barrierwait()
416{
417 if (isGen12)
418 sync.bar(NoMask);
419 else
420 wait(NoMask, n0[0]);
421}
422
423template <typename... Targs>
424void barrier(const Targs &...barrierArgs)
425{
426 barriersignal(barrierArgs...);
427 barrierwait();
428}
429
430// Global memory fence.
431void memfence(const InstructionModifier &mod, const RegData &dst, const RegData &header = GRF(0))
432{
433 if (hardware <= HW::XeHP) {
434 const uint32_t exdesc = static_cast<int>(SharedFunction::dc0) & 0xF;
435 send(8 | mod | NoMask, dst, header, exdesc, 0x219E000);
436 } else
437 send(1 | mod | NoMask, SharedFunction::ugm, dst, header, null, 0, 0x214031F);
438}
439
440void memfence(const RegData &dst, const RegData &header = GRF(0)) { memfence(InstructionModifier(), dst, header); }
441
442// SLM-only memory fence.
443void slmfence(const InstructionModifier &mod, const RegData &dst, const RegData &header = GRF(0))
444{
445 if (hardware <= HW::XeHP) {
446 const uint32_t exdesc = static_cast<int>(SharedFunction::dc0) & 0xF;
447 send(8 | mod | NoMask, dst, header, exdesc, 0x219E0FE);
448 } else
449 send(1 | mod | NoMask, SharedFunction::slm, dst, header, null, 0, 0x210011F);
450}
451
452void slmfence(const RegData &dst, const RegData &header = GRF(0)) { slmfence(InstructionModifier(), dst, header); }
453
454// XeHP+ prologues.
455void loadlid(int argBytes, int dims = 3, int simd = 8, const GRF &temp = GRF(127), int paddedSize = 0)
456{
457 if (hardware >= HW::XeHP) {
458 const int grfSize = GRF::bytes(hardware);
459 const int grfOW = grfSize / 16;
460 int simdGRFs = (simd > 16 && grfSize < 64) ? 2 : 1;
461 int insns = 0;
462
463 if (dims > 0) {
464 auto dmSave = defaultModifier;
465 defaultModifier |= NoMask | AutoSWSB;
466
467 mov<uint32_t>(8, temp, uint16_t(0));
468 and_<uint32_t>(1, temp[2], r0[0], uint32_t(~0x1F));
469 and_<uint16_t>(1, temp[0], r0[4], uint16_t(0xFF));
470 add<uint32_t>(1, temp[2], temp[2], uint16_t(argBytes));
471 if (simd == 1) {
472 mad<uint32_t>(1, temp[2], temp[2], temp.uw(0), uint16_t(grfSize));
473 load(8, r1, aligned_block_oword(1), A32NC, temp);
474 } else {
475 mad<uint32_t>(1, temp[2], temp[2], temp.uw(0), uint16_t(3 * simdGRFs * grfSize));
476 load(8, r1, aligned_block_oword(simdGRFs * ((dims == 1) ? 1 : 2) * grfOW), A32NC, temp);
477 insns += 6;
478 if (dims == 3) {
479 add<uint32_t>(1, temp[2], temp[2], uint16_t(2 * simdGRFs * grfSize));
480 load(8, GRF(1 + 2 * simdGRFs), aligned_block_oword(grfOW * simdGRFs), A32NC, temp);
481 insns += 2;
482 }
483 }
484
485 defaultModifier = dmSave;
486 }
487
488 if (paddedSize > 0) {
489 int nops = (paddedSize >> 4) - insns;
490#ifdef NGEN_SAFE
491 if (paddedSize & 0xF) throw invalid_operand_exception();
492 if (nops < 0) throw invalid_operand_exception();
493#endif
494 for (int i = 0; i < nops; i++)
495 nop();
496 }
497
498 if (!_labelLocalIDsLoaded.defined(labelManager))
499 mark(_labelLocalIDsLoaded);
500 }
501}
502
503void loadargs(const GRF &base, int argGRFs, const GRF &temp = GRF(127))
504{
505 if (hardware >= HW::XeHP) {
506 if (argGRFs > 0) {
507 auto dst = base;
508 auto dmSave = defaultModifier;
509 defaultModifier |= NoMask | AutoSWSB;
510
511 mov<uint32_t>(8, temp, uint16_t(0));
512 and_<uint32_t>(1, temp[2], r0[0], uint32_t(~0x1F));
513 while (argGRFs > 0) {
514 int nload = std::min(utils::rounddown_pow2(argGRFs), 4);
515 load(8, dst, aligned_block_oword(GRF::bytes(hardware) * nload / 16), A32NC, temp);
516 argGRFs -= nload;
517 dst += nload;
518 if (argGRFs > 0)
519 add<uint32_t>(1, temp[2], temp[2], uint32_t(GRF::bytes(hardware) * nload));
520 }
521
522 defaultModifier = dmSave;
523 }
524
525 if (!_labelArgsLoaded.defined(labelManager))
526 mark(_labelArgsLoaded);
527 }
528}
529
530void epilogue(int GRFCount, bool hasSLM, const RegData &r0_info)
531{
532 GRF tmp0(GRFCount - 3);
533 GRF tmp1(GRFCount - 2);
534 GRF lastReg(GRFCount - 1);
535
536 bool doMemFence = false;
537 bool doSLMFence = false;
538 bool setAccToZero = false;
539
540 switch (hardware) {
541 case HW::XeLP:
542 case HW::XeHP:
543 case HW::XeHPG:
544 doMemFence = true;
545 doSLMFence = true;
546 setAccToZero = true;
547 break;
548 default: break;
549 }
550
551 if (!hasSLM) doSLMFence = false;
552
553 int dwordsPerReg = GRF::bytes(hardware) / sizeof(uint32_t);
554 mov<uint32_t>(dwordsPerReg, lastReg, r0_info);
555
556 if (doMemFence) memfence(tmp0, r0_info);
557 if (doSLMFence) slmfence(tmp1, r0_info);
558
559 if (setAccToZero) {
560 mov(16, acc0.f(), 0.f);
561 if (hardware == HW::XeHP) mov(16, acc2.f(), 0.f);
562 }
563
564 if (doMemFence) wrdep(tmp0);
565 if (doSLMFence) wrdep(tmp1);
566
567 threadend(lastReg);
568}
569