1 | /******************************************************************************* |
2 | * Copyright 2019-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | /* |
18 | * Do not #include this file directly; ngen uses it internally. |
19 | */ |
20 | |
21 | |
22 | // Pseudo-instructions and macros. |
23 | template <typename DT = void> |
24 | void min_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
25 | sel(mod | lt | f0[0], dst, src0, src1); |
26 | } |
27 | template <typename DT = void> |
28 | void min_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
29 | sel(mod | lt | f0[0], dst, src0, src1); |
30 | } |
31 | #ifndef NGEN_WINDOWS_COMPAT |
32 | template <typename DT = void> |
33 | void min(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
34 | sel(mod | lt | f0[0], dst, src0, src1); |
35 | } |
36 | template <typename DT = void> |
37 | void min(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
38 | sel(mod | lt | f0[0], dst, src0, src1); |
39 | } |
40 | #endif |
41 | template <typename DT = void> |
42 | void max_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
43 | sel(mod | ge | f0[0], dst, src0, src1); |
44 | } |
45 | template <typename DT = void> |
46 | void max_(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
47 | sel(mod | ge | f0[0], dst, src0, src1); |
48 | } |
49 | #ifndef NGEN_WINDOWS_COMPAT |
50 | template <typename DT = void> |
51 | void max(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
52 | sel(mod | ge | f0[0], dst, src0, src1); |
53 | } |
54 | template <typename DT = void> |
55 | void max(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
56 | sel(mod | ge | f0[0], dst, src0, src1); |
57 | } |
58 | #endif |
59 | |
60 | template <typename DT = void> |
61 | void bfi(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1, const RegData &src2, const RegData &src3) { |
62 | bfi1(mod, dst, src0, src1); |
63 | bfi2(mod, dst, dst, src2, src3); |
64 | } |
65 | |
66 | // Brief compare instructions. |
67 | template <typename DT = void> |
68 | void cmp(const InstructionModifier &mod, const RegData &src0, const RegData &src1) { |
69 | auto dt = getDataType<DT>(); |
70 | if (dt == DataType::invalid) |
71 | dt = src0.getType(); |
72 | cmp<DT>(mod, null.retype(dt), src0, src1); |
73 | } |
74 | template <typename DT = void> |
75 | void cmp(const InstructionModifier &mod, const RegData &src0, const Immediate &src1) { |
76 | auto dt = getDataType<DT>(); |
77 | if (dt == DataType::invalid) |
78 | dt = src0.getType(); |
79 | cmp<DT>(mod, null.retype(dt), src0, src1); |
80 | } |
81 | |
82 | // Brief math instructions. |
83 | template <typename DT = void> |
84 | void cos(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
85 | math<DT>(mod, MathFunction::cos, dst, src0); |
86 | } |
87 | template <typename DT = void> |
88 | void exp(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
89 | math<DT>(mod, MathFunction::exp, dst, src0); |
90 | } |
91 | template <typename DT = void> |
92 | void fdiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
93 | math<DT>(mod, MathFunction::fdiv, dst, src0, src1); |
94 | } |
95 | template <typename DT = void> |
96 | void fdiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
97 | math<DT>(mod, MathFunction::fdiv, dst, src0, src1); |
98 | } |
99 | template <typename DT = void> |
100 | void idiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
101 | math<DT>(mod, MathFunction::idiv, dst, src0, src1); |
102 | } |
103 | template <typename DT = void> |
104 | void idiv(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
105 | math<DT>(mod, MathFunction::idiv, dst, src0, src1); |
106 | } |
107 | template <typename DT = void> |
108 | void inv(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
109 | math<DT>(mod, MathFunction::inv, dst, src0); |
110 | } |
111 | template <typename DT = void> |
112 | void invm(const InstructionModifier &mod, const ExtendedReg &dst, const ExtendedReg &src0, const ExtendedReg &src1) { |
113 | math<DT>(mod, MathFunction::invm, dst, src0, src1); |
114 | } |
115 | template <typename DT = void> |
116 | void iqot(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
117 | math<DT>(mod, MathFunction::iqot, dst, src0, src1); |
118 | } |
119 | template <typename DT = void> |
120 | void iqot(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
121 | math<DT>(mod, MathFunction::iqot, dst, src0, src1); |
122 | } |
123 | template <typename DT = void> |
124 | void irem(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
125 | math<DT>(mod, MathFunction::irem, dst, src0, src1); |
126 | } |
127 | template <typename DT = void> |
128 | void irem(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
129 | math<DT>(mod, MathFunction::irem, dst, src0, src1); |
130 | } |
131 | template <typename DT = void> |
132 | void log(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
133 | math<DT>(mod, MathFunction::log, dst, src0); |
134 | } |
135 | template <typename DT = void> |
136 | void pow(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const RegData &src1) { |
137 | math<DT>(mod, MathFunction::pow, dst, src0, src1); |
138 | } |
139 | template <typename DT = void> |
140 | void pow(const InstructionModifier &mod, const RegData &dst, const RegData &src0, const Immediate &src1) { |
141 | math<DT>(mod, MathFunction::pow, dst, src0, src1); |
142 | } |
143 | template <typename DT = void> |
144 | void rsqt(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
145 | math<DT>(mod, MathFunction::rsqt, dst, src0); |
146 | } |
147 | template <typename DT = void> |
148 | void rsqtm(const InstructionModifier &mod, const ExtendedReg &dst, const ExtendedReg &src0) { |
149 | math<DT>(mod, MathFunction::rsqtm, dst, src0); |
150 | } |
151 | template <typename DT = void> |
152 | void sin(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
153 | math<DT>(mod, MathFunction::sin, dst, src0); |
154 | } |
155 | template <typename DT = void> |
156 | void sqt(const InstructionModifier &mod, const RegData &dst, const RegData &src0) { |
157 | math<DT>(mod, MathFunction::sqt, dst, src0); |
158 | } |
159 | |
160 | #define TMP(n) tmp[n].retype(dst.getType()) |
161 | |
162 | // IEEE 754-compliant divide math macro sequence. |
163 | // Requires GRFs initialized with 0.0 and 1.0, as well as temporary GRFs (4 for single precision, 5 for double precision). |
164 | // dst, num, denom must be distinct GRFs. |
165 | template <typename DT = void, typename A> |
166 | void fdiv_ieee(const InstructionModifier &mod, FlagRegister flag, RegData dst, RegData num, RegData denom, |
167 | RegData zero, RegData one, const A &tmp, InstructionModifier cfmod = InstructionModifier()) |
168 | { |
169 | DataType dt = getDataType<DT>(); |
170 | if (dt == DataType::invalid) |
171 | dt = dst.getType(); |
172 | if (cfmod.getExecSize() == 0) |
173 | cfmod = mod; |
174 | |
175 | Label labelSkip; |
176 | |
177 | switch (dt) { |
178 | case DataType::hf: |
179 | fdiv<DT>(mod, dst, num, denom); |
180 | break; |
181 | case DataType::f: |
182 | invm<DT>(mod | eo | flag, dst | mme0, num | nomme, denom | nomme); |
183 | if_(cfmod | ~flag, labelSkip); |
184 | |
185 | madm<DT>(mod, TMP(0) | mme1, zero | nomme, num | nomme, dst | mme0); |
186 | madm<DT>(mod, TMP(1) | mme2, one | nomme, -denom | nomme, dst | mme0); |
187 | madm<DT>(mod, TMP(2) | mme3, dst | mme0, TMP(1) | mme2, dst | mme0); |
188 | madm<DT>(mod, TMP(3) | mme4, num | nomme, -denom | nomme, TMP(0) | mme1); |
189 | madm<DT>(mod, TMP(0) | mme5, TMP(0) | mme1, TMP(3) | mme4, TMP(2) | mme3); |
190 | madm<DT>(mod, TMP(1) | mme6, num | nomme, -denom | nomme, TMP(0) | mme5); |
191 | madm<DT>(mod, dst | nomme, TMP(0) | mme5, TMP(1) | mme6, TMP(2) | mme3); |
192 | |
193 | mark(labelSkip); |
194 | endif(cfmod); |
195 | break; |
196 | case DataType::df: |
197 | invm<DT>(mod | eo | flag, dst | mme0, num | nomme, denom | nomme); |
198 | if_(cfmod | ~flag, labelSkip); |
199 | |
200 | madm<DT>(mod, TMP(0) | mme1, zero | nomme, num | nomme, dst | mme0); |
201 | madm<DT>(mod, TMP(1) | mme2, one | nomme, -denom | nomme, dst | mme0); |
202 | madm<DT>(mod, TMP(2) | mme3, num | nomme, -denom | nomme, TMP(0) | mme1); |
203 | madm<DT>(mod, TMP(3) | mme4, dst | mme0, TMP(1) | mme2, dst | mme0); |
204 | madm<DT>(mod, TMP(4) | mme5, one | nomme, -denom | nomme, TMP(3) | mme4); |
205 | madm<DT>(mod, dst | mme6, dst | mme0, TMP(1) | mme2, TMP(3) | mme4); |
206 | madm<DT>(mod, TMP(0) | mme7, TMP(0) | mme1, TMP(2) | mme3, TMP(3) | mme4); |
207 | madm<DT>(mod, TMP(3) | mme0, TMP(3) | mme4, dst | mme6, TMP(4) | mme5); |
208 | madm<DT>(mod, TMP(2) | mme1, num | nomme, -denom | nomme, TMP(0) | mme7); |
209 | madm<DT>(mod, dst | nomme, TMP(0) | mme7, TMP(2) | mme1, TMP(3) | mme0); |
210 | |
211 | mark(labelSkip); |
212 | endif(cfmod); |
213 | break; |
214 | default: |
215 | #ifdef NGEN_SAFE |
216 | throw invalid_type_exception(); |
217 | #endif |
218 | break; |
219 | } |
220 | } |
221 | |
222 | // IEEE 754-compliant reciprocal math macro sequence. |
223 | // Requires GRF initialized with 1.0, as well as 3 temporary GRFs. |
224 | // dst and src must be distinct GRFs. |
225 | template <typename DT = void, typename A> |
226 | void inv_ieee(const InstructionModifier &mod, FlagRegister flag, RegData dst, RegData src, RegData one, |
227 | const A &tmp, InstructionModifier cfmod = InstructionModifier()) |
228 | { |
229 | DataType dt = getDataType<DT>(); |
230 | if (dt == DataType::invalid) |
231 | dt = dst.getType(); |
232 | if (cfmod.getExecSize() == 0) |
233 | cfmod = mod; |
234 | |
235 | Label labelSkip; |
236 | |
237 | switch (dt) { |
238 | case DataType::hf: |
239 | inv<DT>(mod, dst, src); |
240 | break; |
241 | case DataType::f: |
242 | invm<DT>(mod | eo | flag, dst | mme0, one | nomme, src | nomme); |
243 | if_(cfmod | ~flag, labelSkip); |
244 | |
245 | madm<DT>(mod, TMP(1) | mme2, one | nomme, -src | nomme, dst | mme0); |
246 | madm<DT>(mod, TMP(2) | mme3, dst | mme0, TMP(1) | mme2, dst | mme0); |
247 | madm<DT>(mod, TMP(0) | mme5, dst | mme0, TMP(1) | mme2, TMP(2) | mme3); |
248 | madm<DT>(mod, TMP(1) | mme6, one | nomme, -src | nomme, TMP(0) | mme5); |
249 | madm<DT>(mod, dst | nomme, TMP(0) | mme5, TMP(1) | mme6, TMP(2) | mme3); |
250 | |
251 | mark(labelSkip); |
252 | endif(cfmod); |
253 | break; |
254 | case DataType::df: |
255 | invm<DT>(mod | eo | flag, dst | mme0, one | nomme, src | nomme); |
256 | if_(cfmod | ~flag, labelSkip); |
257 | |
258 | madm<DT>(mod, TMP(0) | mme2, one | nomme, -src | nomme, dst | mme0); |
259 | madm<DT>(mod, TMP(1) | mme4, dst | mme0, TMP(0) | mme2, dst | mme0); |
260 | madm<DT>(mod, TMP(2) | mme5, one | nomme, -src | nomme, TMP(1) | mme4); |
261 | madm<DT>(mod, dst | mme6, dst | mme0, TMP(0) | mme2, TMP(1) | mme4); |
262 | madm<DT>(mod, TMP(1) | mme0, TMP(1) | mme4, dst | mme6, TMP(2) | mme5); |
263 | madm<DT>(mod, TMP(0) | mme1, one | nomme, -src | nomme, dst | mme6); |
264 | madm<DT>(mod, dst | nomme, dst | mme6, TMP(0) | mme1, TMP(1) | mme0); |
265 | |
266 | mark(labelSkip); |
267 | endif(cfmod); |
268 | break; |
269 | default: |
270 | #ifdef NGEN_SAFE |
271 | throw invalid_type_exception(); |
272 | #endif |
273 | break; |
274 | } |
275 | } |
276 | |
277 | // IEEE 754-compliant square root macro sequence. |
278 | // Requires GRFs initialized with 0.0 and 0.5 (also 1.0 for double precision), |
279 | // and temporary GRFs (3 for single precision, 4 for double precision). |
280 | // dst and src must be distinct GRFs. |
281 | template <typename DT = void, typename A> |
282 | void sqt_ieee(const InstructionModifier &mod, FlagRegister flag, RegData dst, RegData src, |
283 | RegData zero, RegData oneHalf, RegData one, const A &tmp, InstructionModifier cfmod = InstructionModifier()) |
284 | { |
285 | DataType dt = getDataType<DT>(); |
286 | if (dt == DataType::invalid) |
287 | dt = dst.getType(); |
288 | if (cfmod.getExecSize() == 0) |
289 | cfmod = mod; |
290 | |
291 | Label labelSkip; |
292 | |
293 | switch (dt) { |
294 | case DataType::hf: |
295 | sqt<DT>(mod, dst, src); |
296 | break; |
297 | case DataType::f: |
298 | rsqtm<DT>(mod | eo | flag, dst | mme0, src | nomme); |
299 | if_(cfmod | ~flag, labelSkip); |
300 | |
301 | madm<DT>(mod, TMP(0) | mme1, zero | nomme, oneHalf | nomme, dst | mme0); |
302 | madm<DT>(mod, TMP(1) | mme2, zero | nomme, src | nomme, dst | mme0); |
303 | madm<DT>(mod, TMP(2) | mme3, oneHalf | nomme, -TMP(1) | mme2, TMP(0) | mme1); |
304 | madm<DT>(mod, TMP(0) | mme4, TMP(0) | mme1, TMP(2) | mme3, TMP(0) | mme1); |
305 | madm<DT>(mod, dst | mme5, TMP(1) | mme2, TMP(2) | mme3, TMP(1) | mme2); |
306 | madm<DT>(mod, TMP(2) | mme6, src | nomme, -dst | mme5, dst | mme5); |
307 | madm<DT>(mod, dst | nomme, dst | mme5, TMP(0) | mme4, TMP(2) | mme6); |
308 | |
309 | mark(labelSkip); |
310 | endif(cfmod); |
311 | break; |
312 | case DataType::df: |
313 | rsqtm<DT>(mod | eo | flag, dst | mme0, src | nomme); |
314 | if_(cfmod | ~flag, labelSkip); |
315 | |
316 | madm<DT>(mod, TMP(0) | mme1, zero | mme0, oneHalf | nomme, dst | mme0); |
317 | madm<DT>(mod, TMP(1) | mme2, zero | mme0, src | nomme, dst | mme0); |
318 | madm<DT>(mod, TMP(2) | mme3, oneHalf | nomme, -TMP(1) | mme2, TMP(0) | mme1); |
319 | madm<DT>(mod, TMP(3) | mme4, one | nomme, oneHalf | nomme, dst | nomme); |
320 | madm<DT>(mod, TMP(3) | mme5, one | nomme, TMP(3) | mme4, TMP(2) | mme3); |
321 | madm<DT>(mod, dst | mme6, zero | mme0, TMP(2) | mme3, TMP(1) | mme2); |
322 | madm<DT>(mod, TMP(2) | mme7, zero | mme0, TMP(2) | mme3, TMP(0) | mme1); |
323 | madm<DT>(mod, dst | mme6, TMP(1) | mme2, TMP(3) | mme5, dst | mme6); |
324 | madm<DT>(mod, TMP(3) | mme5, TMP(0) | mme1, TMP(3) | mme5, TMP(2) | mme7); |
325 | madm<DT>(mod, TMP(0) | mme1, src | nomme, -dst | mme6, dst | mme6); |
326 | madm<DT>(mod, dst | nomme, dst | mme6, TMP(0) | mme1, TMP(3) | mme5); |
327 | |
328 | mark(labelSkip); |
329 | endif(cfmod); |
330 | break; |
331 | default: |
332 | #ifdef NGEN_SAFE |
333 | throw invalid_type_exception(); |
334 | #endif |
335 | break; |
336 | } |
337 | } |
338 | |
339 | #undef TMP |
340 | |
341 | // Thread spawner messages. |
342 | void threadend(const InstructionModifier &mod, const RegData &r0_info) { |
343 | auto sf = (hardware <= HW::XeHP) ? SharedFunction::ts |
344 | : SharedFunction::gtwy; |
345 | uint32_t exdesc = 0x20 | (static_cast<int>(sf) & 0xF); |
346 | send(8 | EOT | mod | NoMask, null, r0_info, exdesc, 0x2000010); |
347 | } |
348 | |
349 | void threadend(const RegData &r0_info) { threadend(InstructionModifier(), r0_info); } |
350 | |
351 | // Gateway messages. |
352 | void barriermsg(const InstructionModifier &mod, const GRF &) |
353 | { |
354 | uint32_t exdesc = static_cast<int>(SharedFunction::gtwy) & 0xF; |
355 | send(1 | mod | NoMask, null, header, exdesc, 0x2000004); |
356 | } |
357 | |
358 | void barriermsg(const GRF &) { barriermsg(InstructionModifier(), header); } |
359 | |
360 | // Prepare barrier header. |
361 | void (const GRF &, const GRF &r0_info = r0) { |
362 | if (hardware >= HW::XeHPG) { |
363 | mov(1 | NoMask, header.hf(4), Immediate::hf(0)); |
364 | mov(2 | NoMask, header.ub(10)(1), r0_info.ub(11)(0)); |
365 | } else |
366 | and_(8 | NoMask, header.ud(), r0_info.ud(2), uint32_t((hardware >= HW::Gen11) ? 0x7F000000 : 0x8F000000)); |
367 | } |
368 | |
369 | void barriersignal(const InstructionModifier &mod, const GRF &temp, const GRF &r0_info = r0) |
370 | { |
371 | barrierheader(temp, r0_info); |
372 | barriermsg(mod, temp); |
373 | } |
374 | |
375 | void barriersignal(const InstructionModifier &mod, const GRF &temp, uint32_t threadCount, const GRF &r0_info = r0) |
376 | { |
377 | if (hardware >= HW::XeHPG) |
378 | mov(1 | NoMask, temp.ud(2), (threadCount << 24) | (threadCount << 16)); |
379 | else |
380 | { |
381 | and_(8 | NoMask, temp.ud(), r0_info.ud(2), uint32_t((hardware >= HW::Gen11) ? 0x7F000000 : 0x8F000000)); |
382 | mov(1 | NoMask, temp.ub(9), 0x80 | (threadCount & 0x7F)); |
383 | } |
384 | barriermsg(mod, temp); |
385 | } |
386 | |
387 | void barriersignal(const GRF &temp, const GRF &r0_info = r0) { barriersignal(InstructionModifier(), temp, r0_info); } |
388 | void barriersignal(const GRF &temp, uint32_t threadCount, const GRF &r0_info = r0) { barriersignal(InstructionModifier(), temp, threadCount, r0_info); } |
389 | |
390 | // Named barriers. |
391 | void barriersignal(const InstructionModifier &mod, uint32_t barrierID, const GRF &temp, const GRF &r0_info = r0) |
392 | { |
393 | #ifdef NGEN_SAFE |
394 | if (hardware != HW::XeHPC) |
395 | throw unsupported_message(); |
396 | #endif |
397 | mov(1 | NoMask, temp.uw(4), uint8_t(barrierID)); |
398 | mov(2 | NoMask, temp.ub(10)(1), r0_info.ub(11)(0)); |
399 | barriermsg(mod, temp); |
400 | } |
401 | |
402 | void barriersignal(const InstructionModifier &mod, uint32_t barrierID, const GRF &temp, BarrierType barrierType, uint32_t producers, uint32_t consumers) |
403 | { |
404 | #ifdef NGEN_SAFE |
405 | if (hardware != HW::XeHPC) |
406 | throw unsupported_message(); |
407 | #endif |
408 | mov(1 | NoMask, temp.ud(2), (barrierID & 0xFF) | (static_cast<uint32_t>(barrierType) << 14) | ((producers & 0xFF) << 16) | ((consumers & 0xFF) << 24)); |
409 | barriermsg(mod, temp); |
410 | } |
411 | |
412 | void barriersignal(uint32_t barrierID, const GRF &temp, const GRF &r0_info = r0) { barriersignal(InstructionModifier(), barrierID, temp, r0_info); } |
413 | void barriersignal(uint32_t barrierID, const GRF &temp, BarrierType barrierType, uint32_t producers, uint32_t consumers) { barriersignal(InstructionModifier(), barrierID, temp, barrierType, producers, consumers); } |
414 | |
415 | void barrierwait() |
416 | { |
417 | if (isGen12) |
418 | sync.bar(NoMask); |
419 | else |
420 | wait(NoMask, n0[0]); |
421 | } |
422 | |
423 | template <typename... Targs> |
424 | void barrier(const Targs &...barrierArgs) |
425 | { |
426 | barriersignal(barrierArgs...); |
427 | barrierwait(); |
428 | } |
429 | |
430 | // Global memory fence. |
431 | void memfence(const InstructionModifier &mod, const RegData &dst, const RegData & = GRF(0)) |
432 | { |
433 | if (hardware <= HW::XeHP) { |
434 | const uint32_t exdesc = static_cast<int>(SharedFunction::dc0) & 0xF; |
435 | send(8 | mod | NoMask, dst, header, exdesc, 0x219E000); |
436 | } else |
437 | send(1 | mod | NoMask, SharedFunction::ugm, dst, header, null, 0, 0x214031F); |
438 | } |
439 | |
440 | void memfence(const RegData &dst, const RegData & = GRF(0)) { memfence(InstructionModifier(), dst, header); } |
441 | |
442 | // SLM-only memory fence. |
443 | void slmfence(const InstructionModifier &mod, const RegData &dst, const RegData & = GRF(0)) |
444 | { |
445 | if (hardware <= HW::XeHP) { |
446 | const uint32_t exdesc = static_cast<int>(SharedFunction::dc0) & 0xF; |
447 | send(8 | mod | NoMask, dst, header, exdesc, 0x219E0FE); |
448 | } else |
449 | send(1 | mod | NoMask, SharedFunction::slm, dst, header, null, 0, 0x210011F); |
450 | } |
451 | |
452 | void slmfence(const RegData &dst, const RegData & = GRF(0)) { slmfence(InstructionModifier(), dst, header); } |
453 | |
454 | // XeHP+ prologues. |
455 | void loadlid(int argBytes, int dims = 3, int simd = 8, const GRF &temp = GRF(127), int paddedSize = 0) |
456 | { |
457 | if (hardware >= HW::XeHP) { |
458 | const int grfSize = GRF::bytes(hardware); |
459 | const int grfOW = grfSize / 16; |
460 | int simdGRFs = (simd > 16 && grfSize < 64) ? 2 : 1; |
461 | int insns = 0; |
462 | |
463 | if (dims > 0) { |
464 | auto dmSave = defaultModifier; |
465 | defaultModifier |= NoMask | AutoSWSB; |
466 | |
467 | mov<uint32_t>(8, temp, uint16_t(0)); |
468 | and_<uint32_t>(1, temp[2], r0[0], uint32_t(~0x1F)); |
469 | and_<uint16_t>(1, temp[0], r0[4], uint16_t(0xFF)); |
470 | add<uint32_t>(1, temp[2], temp[2], uint16_t(argBytes)); |
471 | if (simd == 1) { |
472 | mad<uint32_t>(1, temp[2], temp[2], temp.uw(0), uint16_t(grfSize)); |
473 | load(8, r1, aligned_block_oword(1), A32NC, temp); |
474 | } else { |
475 | mad<uint32_t>(1, temp[2], temp[2], temp.uw(0), uint16_t(3 * simdGRFs * grfSize)); |
476 | load(8, r1, aligned_block_oword(simdGRFs * ((dims == 1) ? 1 : 2) * grfOW), A32NC, temp); |
477 | insns += 6; |
478 | if (dims == 3) { |
479 | add<uint32_t>(1, temp[2], temp[2], uint16_t(2 * simdGRFs * grfSize)); |
480 | load(8, GRF(1 + 2 * simdGRFs), aligned_block_oword(grfOW * simdGRFs), A32NC, temp); |
481 | insns += 2; |
482 | } |
483 | } |
484 | |
485 | defaultModifier = dmSave; |
486 | } |
487 | |
488 | if (paddedSize > 0) { |
489 | int nops = (paddedSize >> 4) - insns; |
490 | #ifdef NGEN_SAFE |
491 | if (paddedSize & 0xF) throw invalid_operand_exception(); |
492 | if (nops < 0) throw invalid_operand_exception(); |
493 | #endif |
494 | for (int i = 0; i < nops; i++) |
495 | nop(); |
496 | } |
497 | |
498 | if (!_labelLocalIDsLoaded.defined(labelManager)) |
499 | mark(_labelLocalIDsLoaded); |
500 | } |
501 | } |
502 | |
503 | void loadargs(const GRF &base, int argGRFs, const GRF &temp = GRF(127)) |
504 | { |
505 | if (hardware >= HW::XeHP) { |
506 | if (argGRFs > 0) { |
507 | auto dst = base; |
508 | auto dmSave = defaultModifier; |
509 | defaultModifier |= NoMask | AutoSWSB; |
510 | |
511 | mov<uint32_t>(8, temp, uint16_t(0)); |
512 | and_<uint32_t>(1, temp[2], r0[0], uint32_t(~0x1F)); |
513 | while (argGRFs > 0) { |
514 | int nload = std::min(utils::rounddown_pow2(argGRFs), 4); |
515 | load(8, dst, aligned_block_oword(GRF::bytes(hardware) * nload / 16), A32NC, temp); |
516 | argGRFs -= nload; |
517 | dst += nload; |
518 | if (argGRFs > 0) |
519 | add<uint32_t>(1, temp[2], temp[2], uint32_t(GRF::bytes(hardware) * nload)); |
520 | } |
521 | |
522 | defaultModifier = dmSave; |
523 | } |
524 | |
525 | if (!_labelArgsLoaded.defined(labelManager)) |
526 | mark(_labelArgsLoaded); |
527 | } |
528 | } |
529 | |
530 | void epilogue(int GRFCount, bool hasSLM, const RegData &r0_info) |
531 | { |
532 | GRF tmp0(GRFCount - 3); |
533 | GRF tmp1(GRFCount - 2); |
534 | GRF lastReg(GRFCount - 1); |
535 | |
536 | bool doMemFence = false; |
537 | bool doSLMFence = false; |
538 | bool setAccToZero = false; |
539 | |
540 | switch (hardware) { |
541 | case HW::XeLP: |
542 | case HW::XeHP: |
543 | case HW::XeHPG: |
544 | doMemFence = true; |
545 | doSLMFence = true; |
546 | setAccToZero = true; |
547 | break; |
548 | default: break; |
549 | } |
550 | |
551 | if (!hasSLM) doSLMFence = false; |
552 | |
553 | int dwordsPerReg = GRF::bytes(hardware) / sizeof(uint32_t); |
554 | mov<uint32_t>(dwordsPerReg, lastReg, r0_info); |
555 | |
556 | if (doMemFence) memfence(tmp0, r0_info); |
557 | if (doSLMFence) slmfence(tmp1, r0_info); |
558 | |
559 | if (setAccToZero) { |
560 | mov(16, acc0.f(), 0.f); |
561 | if (hardware == HW::XeHP) mov(16, acc2.f(), 0.f); |
562 | } |
563 | |
564 | if (doMemFence) wrdep(tmp0); |
565 | if (doSLMFence) wrdep(tmp1); |
566 | |
567 | threadend(lastReg); |
568 | } |
569 | |