1/*******************************************************************************
2* Copyright 2020-2022 Intel Corporation
3*
4* Licensed under the Apache License, Version 2.0 (the "License");
5* you may not use this file except in compliance with the License.
6* You may obtain a copy of the License at
7*
8* http://www.apache.org/licenses/LICENSE-2.0
9*
10* Unless required by applicable law or agreed to in writing, software
11* distributed under the License is distributed on an "AS IS" BASIS,
12* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13* See the License for the specific language governing permissions and
14* limitations under the License.
15*******************************************************************************/
16
17#ifndef EMULATION_HPP
18#define EMULATION_HPP
19
20#include <exception>
21
22namespace dnnl {
23namespace impl {
24namespace gpu {
25namespace jit {
26
27struct EmulationStrategy {
28 // Emulate 64-bit arithmetic (required for GenXLP)
29 bool emulate64 = false;
30 // Emulate DW x DW -> DW multiplication (required for Gen12)
31 bool emulateDWxDW = false;
32 // Use 32-bit adds for 64-bit arithmetic, assuming no 2^32 boundaries crossed.
33 bool emulate64_add32 = false;
34 // Emulate DW x DW -> QW multiplication (XeHPC)
35 bool emulate64_mul = false;
36 // Emulate QW and/or/xor operations (XeHPC)
37 bool emulate64_logic = false;
38 // Don't emulate QW shl/shr (XeHPC)
39 bool noemulate64_shift = false;
40
41 EmulationStrategy() = default;
42 EmulationStrategy(ngen::HW hw_, int stepping = 0) {
43 using namespace ngen;
44 if (hw_ == HW::Gen11) emulate64 = true;
45 if (hw_ >= HW::Gen11) emulateDWxDW = true;
46 if (hw_ == HW::Gen12LP) emulate64 = true;
47 if (hw_ == HW::XeHPG) emulate64 = true;
48 if (hw_ == HW::XeHPC) {
49 if (stepping >= SteppingPVCXTB0)
50 emulate64_mul = emulate64_logic = true;
51 else
52 emulate64 = noemulate64_shift = true;
53 }
54 }
55};
56
57struct EmulationState {
58 ngen::GRF temp[2]; // Temporary GRFs for use in emulation sequences
59 ngen::FlagRegister
60 flag; // Flag register for use in emulating 64-bit adds (optional, avoids temporary registers/acc)
61 int flagOffset = 0; // Channel offset to use with flag register.
62};
63
64// Implementation wrapped as static methods in non-instantiated class.
65// Clients should declare EmulationImplementation as a friend.
66struct EmulationImplementation {
67 [[noreturn]] static void stub() {
68 throw std::runtime_error("Unimplemented");
69 }
70
71 template <typename DT, typename O>
72 static void applyDefaultType(O &op) {
73 using namespace ngen;
74 if (op.getType() == DataType::invalid) op.setType(getDataType<DT>());
75 }
76
77 template <typename O>
78 static bool isQW(const O &op) {
79 using namespace ngen;
80 using dnnl::impl::utils::one_of;
81 return one_of(op.getType(), DataType::q, DataType::uq);
82 }
83
84 template <typename O>
85 static bool isDW(const O &op) {
86 using namespace ngen;
87 using dnnl::impl::utils::one_of;
88 return one_of(op.getType(), DataType::d, DataType::ud);
89 }
90
91 template <typename O>
92 static bool isW(const O &op) {
93 using namespace ngen;
94 using dnnl::impl::utils::one_of;
95 return one_of(op.getType(), DataType::w, DataType::uw);
96 }
97
98 template <typename T1, typename T2>
99 static bool equal(const T1 &o1, const T2 &o2) {
100 return o1 == o2;
101 }
102 static bool equal(const ngen::RegData &o1, const ngen::Immediate &o2) {
103 return false;
104 }
105
106 static void downgradeToDW(ngen::RegData &op) {
107 using namespace ngen;
108 if (isQW(op)) {
109 op.setType(
110 (op.getType() == DataType::q) ? DataType::d : DataType::ud);
111 op.setOffset(op.getOffset() * 2);
112 }
113 }
114
115 static void downgradeToDW(ngen::Immediate &op) {
116 using namespace ngen;
117 if (isQW(op))
118 op.setType(
119 (op.getType() == DataType::q) ? DataType::d : DataType::ud);
120 }
121
122 // Get the DW equivalent of a QW region.
123 static void makeDWPair(ngen::RegData &op, int esize) {
124 if (isQW(op)) {
125 downgradeToDW(op);
126 if (op.getHS() > 1) {
127 if (op.getVS() != op.getHS() * op.getWidth()) stub();
128 op.setRegion(op.getHS() * 2, 2, 1);
129 } else {
130 auto newVS = op.getVS() * 2;
131 if (esize == op.getWidth()) newVS = esize * 2;
132 op.setRegion(newVS, op.getWidth() * 2, 1);
133 }
134 }
135 }
136
137 // Split a register into DW pairs.
138 static void splitToDW(
139 ngen::RegData in, ngen::RegData &outLo, ngen::RegData &outHi) {
140 using namespace ngen;
141 bool isQ = (in.getType() == DataType::q);
142 bool isUQ = (in.getType() == DataType::uq);
143
144 if (isQ || isUQ) {
145 outLo = in;
146 outLo.setRegion(in.getVS() * 2, in.getWidth(), in.getHS() * 2);
147 outLo.setOffset(in.getOffset() * 2);
148 outLo.setType(DataType::ud);
149
150 outHi = outLo;
151 outHi.setOffset(in.getOffset() * 2 + 1);
152 outHi.setType(isQ ? DataType::d : DataType::ud);
153 } else {
154 outLo = in;
155 outHi = Subregister {}; // invalid
156 }
157 }
158
159 // Split an ngen::Immediate into DW pairs.
160 static void splitToDW(const ngen::Immediate &in, ngen::Immediate &outLo,
161 ngen::Immediate &outHi) {
162 using namespace ngen;
163 bool isQ = (in.getType() == DataType::q);
164 bool isUQ = (in.getType() == DataType::uq);
165
166 if (isQ || isUQ) {
167 outLo = uint32_t(static_cast<uint64_t>(in));
168 outLo.setType(DataType::ud);
169
170 outHi = uint32_t(static_cast<uint64_t>(in) >> 32);
171 outHi.setType(isQ ? DataType::d : DataType::ud);
172 } else {
173 outLo = in;
174 outHi = uint16_t(0);
175 }
176 }
177
178 static ngen::RegData lowWord(ngen::RegData in) {
179 using namespace ngen;
180 if (isW(in)) return in;
181
182 auto outLo = in;
183 outLo.setRegion(in.getVS() * 2, in.getWidth(), in.getHS() * 2);
184 outLo.setOffset(in.getOffset() * 2);
185 outLo.setType(DataType::uw);
186
187 return outLo;
188 }
189
190 static ngen::Immediate lowWord(const ngen::Immediate &in) {
191 return uint16_t(static_cast<uint64_t>(in) & 0xffff);
192 }
193
194 static bool isUnitStride(const ngen::RegData &rd) {
195 return (rd.getHS() == 1 && rd.getVS() == rd.getWidth());
196 }
197
198 // Move, emulating 64-bit moves with 32-bit (generally a good idea).
199 template <typename DT = void, typename Generator>
200 static void emov(Generator &g, const ngen::InstructionModifier &mod,
201 ngen::RegData dst, ngen::RegData src0,
202 const EmulationStrategy &strategy) {
203 using namespace ngen;
204 applyDefaultType<DT>(dst);
205 applyDefaultType<DT>(src0);
206
207 bool dstQ = isQW(dst);
208 bool s0Q = isQW(src0);
209 bool s0D = isDW(src0);
210 bool isDF = (src0.getType() == DataType::df
211 && dst.getType() == DataType::df);
212 bool unaligned = (mod.getExecSize() > 1 && src0.getHS() != 0
213 && src0.getOffset() != dst.getOffset());
214
215 if ((dstQ && s0D) && strategy.emulate64) {
216 if (src0.getNeg()) stub();
217 bool s0Signed = isSigned(src0.getType());
218 RegData dstHi, dstLo;
219 splitToDW(dst, dstLo, dstHi);
220 g.mov(mod, dstLo, src0);
221 if (!s0Signed) {
222 g.mov(mod, dstHi, 0);
223 } else {
224 g.asr(mod, dstHi, dstLo, uint16_t(31));
225 }
226 } else if ((isDF && unaligned && g.hardware >= ngen::HW::XeHP)
227 || ((dstQ || s0Q) && (strategy.emulate64))) {
228 if (dstQ != s0Q) stub();
229
230 auto mod2x = mod;
231 mod2x.setExecSize(mod.getExecSize() * 2);
232
233 makeDWPair(dst, mod.getExecSize());
234 makeDWPair(src0, mod.getExecSize());
235 g.mov(mod2x, dst, src0);
236 } else if (dst.getType() == DataType::f
237 && src0.getType() == DataType::bf
238 && (src0.getHS() != 1 || mod.getExecSize() == 1)) {
239 // Emulate bf16->f32 upconversion
240 dst.setType(DataType::ud);
241 src0.setType(DataType::uw);
242 g.shl(mod, dst, src0, 16);
243 } else
244 g.mov(mod, dst, src0);
245 }
246
247 template <typename DT = void, typename Generator>
248 static void emov(Generator &g, const ngen::InstructionModifier &mod,
249 ngen::RegData dst, ngen::Immediate src0,
250 const EmulationStrategy &strategy) {
251 using namespace ngen;
252 applyDefaultType<DT>(dst);
253 applyDefaultType<DT>(src0);
254
255 bool dstQ = isQW(dst);
256 bool s0Q = isQW(src0);
257
258 if ((dstQ || s0Q) && strategy.emulate64) {
259 if (!dstQ) stub();
260
261 RegData dstHi, dstLo;
262 Immediate s0Hi = 0, s0Lo = 0;
263
264 splitToDW(src0, s0Lo, s0Hi);
265
266 if (static_cast<uint64_t>(s0Lo) == static_cast<uint64_t>(s0Hi)
267 && dst.getHS() <= 1) {
268 auto mod2x = mod;
269 mod2x.setExecSize(mod.getExecSize() * 2);
270
271 downgradeToDW(dst);
272 dst.setRegion(0, 0, 1);
273 g.mov(mod2x, dst, s0Lo);
274 } else {
275 splitToDW(dst, dstLo, dstHi);
276 g.mov(mod, dstLo, s0Lo);
277 g.mov(mod, dstHi, s0Hi);
278 }
279 } else
280 g.mov(mod, dst, src0);
281 }
282
283 template <typename Generator>
284 static void eaddSignExtend1(Generator &g,
285 const ngen::InstructionModifier &mod, bool &doSub,
286 const ngen::Immediate &src1, ngen::Immediate &s1LoPos,
287 const ngen::Immediate &s1Lo, const ngen::Immediate &s1Hi, bool &s1Q,
288 const ngen::GRF (&temp)[2]) {
289 using namespace ngen;
290 uint64_t raw = static_cast<uint64_t>(src1);
291 if (src1.getType() == DataType::d) {
292 int32_t val = raw;
293 s1LoPos = uint32_t(std::abs(val));
294 doSub = (val < 0);
295 } else if (src1.getType() == DataType::w) {
296 int16_t val = raw;
297 s1LoPos = uint16_t(std::abs(val));
298 doSub = (val < 0);
299 }
300 }
301
302 template <typename Generator>
303 static void eaddSignExtend1(Generator &g,
304 const ngen::InstructionModifier &mod, bool &doSub,
305 const ngen::RegData &src1, ngen::RegData &s1LoPos,
306 ngen::RegData &s1Lo, ngen::RegData &s1Hi, bool &s1Q,
307 const ngen::GRF (&temp)[2]) {
308 using namespace ngen;
309 s1Q = true;
310 s1Hi = temp[0].d();
311 if (s1Lo.getNeg()) {
312 g.asr(mod, s1Hi, -s1Lo, uint16_t(31));
313 s1Hi = -s1Hi;
314 } else
315 g.asr(mod, s1Hi, s1Lo, uint16_t(31));
316 s1Lo.setType(DataType::ud);
317 }
318
319 static void eaddHandleS1Neg(
320 bool &doSub, ngen::RegData &s1LoPos, const ngen::RegData &s1Lo) {
321 if (isSigned(s1Lo.getType())) stub();
322 doSub = s1Lo.getNeg();
323 s1LoPos = -s1Lo;
324 }
325
326 static void eaddHandleS1Neg(bool &doSub, const ngen::Immediate &s1LoPos,
327 const ngen::Immediate &s1Lo) {
328 /* no-op */
329 }
330
331 template <typename Generator>
332 static void eaddFixupQD(Generator &g, const ngen::InstructionModifier &mod,
333 const ngen::FlagRegister &flag, const ngen::RegData &dstHi,
334 const ngen::RegData &src1) {
335 if ((src1.getBytes() < 8) && isSigned(src1.getType())) {
336 // Add sign extension of src1 to high 32 bits of dst (inefficient but rarely used path).
337 g.cmp(mod | (src1.getNeg() ? g.gt : g.lt) | flag, src1, 0);
338 g.add(mod | flag, dstHi, dstHi, -1);
339 }
340 }
341
342 template <typename Generator>
343 static void eaddFixupQD(Generator &g, const ngen::InstructionModifier &mod,
344 const ngen::FlagRegister &flag, const ngen::RegData &dstHi,
345 const ngen::Immediate &src1) {
346 /* no-op */
347 }
348
349 static bool eaddIsNegative(const ngen::RegData &r) { return r.getNeg(); }
350
351 static bool eaddIsNegative(const ngen::Immediate &i) {
352 return int32_t(uint64_t(i)) < 0;
353 }
354
    // Integer addition, emulating 64-bit arithmetic if configured.
    //
    // Emits dst = src0 + src1, where src1 (type S1) is either a register or
    // an immediate. Strategy selection, in order:
    //   1. QW dst with strategy.emulate64_add32: independent 32-bit adds on
    //      the two halves, with no carry propagation.
    //   2. !strategy.emulate64: a single native add.
    //   3. emulate64 with a non-QW dst: sources are narrowed to DW and a
    //      single add is emitted.
    //   4. emulate64 with a QW dst: full emulation, using either the flag
    //      register (when state.flag is valid and src0 is not negated) or
    //      the addc/subb + accumulator sequence with state.temp as scratch.
    // Unsupported operand combinations end up in stub().
    template <typename DT = void, typename S1, typename Generator>
    static void eaddInternal(Generator &g, const ngen::InstructionModifier &mod,
            ngen::RegData dst, ngen::RegData src0, S1 src1,
            const EmulationStrategy &strategy, const EmulationState &state) {
        using namespace ngen;
        const auto &temp = state.temp;

        applyDefaultType<DT>(dst);
        applyDefaultType<DT>(src0);
        applyDefaultType<DT>(src1);

        bool dstQ = isQW(dst);
        bool s0Q = isQW(src0);
        bool s1Q = isQW(src1);

        if (dstQ && strategy.emulate64_add32) {
            // Case 1: halves are added independently (no carry).
            RegData dstHi, dstLo, s0Hi, s0Lo;
            S1 s1Hi, s1Lo;

            splitToDW(dst, dstLo, dstHi);
            splitToDW(src0, s0Lo, s0Hi);
            splitToDW(src1, s1Lo, s1Hi);
            g.add(mod, dstLo, s0Lo, s1Lo);

            // High half: sum of the available high halves, skipping the
            // copy when the single QW source already lives in dstHi.
            if (s0Q && s1Q)
                g.add(mod, dstHi, s0Hi, s1Hi);
            else if (s0Q) {
                if (!equal(dstHi, s0Hi)) g.mov(mod, dstHi, s0Hi);
            } else if (s1Q) {
                if (!equal(dstHi, s1Hi)) g.mov(mod, dstHi, s1Hi);
            } else
                g.mov(mod, dstHi, uint16_t(0));
        } else if (!strategy.emulate64)
            // Case 2: no emulation requested.
            g.add(mod, dst, src0, src1);
        else {
            if (!dstQ) {
                // Case 3: result is not QW, so only the DW view of the
                // sources is used.
                downgradeToDW(src0);
                downgradeToDW(src1);
                g.add(mod, dst, src0, src1);
            } else {
                // Case 4: full 64-bit emulation.
                RegData dstHi, dstLo, s0Hi, s0Lo;
                S1 s1Hi, s1Lo, s1LoPos;
                FlagRegister flag = state.flag;

                splitToDW(dst, dstLo, dstHi);
                splitToDW(src0, s0Lo, s0Hi);
                splitToDW(src1, s1Lo, s1Hi);
                s1LoPos = s1Lo;

                bool s0Signed = isSigned(s0Lo.getType());
                bool s1Signed = isSigned(s1Lo.getType());

                if (flag.isValid() && !eaddIsNegative(s0Lo)) {
                    // Use flag register + ov.
                    auto Mx = g.ExecutionOffset(state.flagOffset);
                    bool neg = eaddIsNegative(s1Lo);

                    // The low add is done on :ud views; a signed register
                    // src1 is retyped too, and its sign is handled later by
                    // eaddFixupQD instead of by `neg`.
                    auto s0LoUD = s0Lo;
                    auto s1LoMod = s1Lo;
                    s0LoUD.setType(DataType::ud);
                    if (s1Signed
                            && !std::is_base_of<ngen::Immediate, S1>::value) {
                        s1LoMod.setType(DataType::ud);
                        neg = false;
                    }

                    // Low add with the ov condition written to `flag`.
                    g.add(mod | Mx | g.ov | flag, dstLo, s0LoUD, s1LoMod);
                    // Seed dstHi with the sum of the available high halves
                    // (nothing to do if the lone QW source is dstHi itself).
                    if (s0Q && s1Q)
                        g.add(mod, dstHi, s0Hi, s1Hi);
                    else if (s0Q && !equal(dstHi, s0Hi))
                        g.mov(mod, dstHi, s0Hi);
                    else if (s1Q && !equal(dstHi, s1Hi))
                        g.mov(mod, dstHi, s1Hi);
                    else if (!s0Q && !s1Q)
                        g.mov(mod, dstHi, 0);
                    // Apply the carry (+1) or borrow (-1) under `flag`.
                    g.add(mod | Mx | flag, dstHi, dstHi, neg ? -1 : +1);
                    eaddFixupQD(g, mod | Mx, flag, dstHi, src1);
                } else {
                    // Slow path: addc/subb + acc.
                    RegData carry = temp[0].ud();
                    bool lateCarry = false;
                    RegData subDstLo;
                    bool doSub = false;

                    // For :uq + :d or :q + :ud, sign extend 32-bit input to 64 bits.
                    if (s0Signed != s1Signed) {
                        if (s0Signed) {
                            s0Q = true;
                            s0Hi = temp[0].d();
                            g.asr(mod, s0Hi, s0Lo, uint16_t(31));
                            s0Lo.setType(DataType::ud);
                            if (s0Lo.getNeg()) s0Hi = -s0Hi;
                        } else
                            eaddSignExtend1(g, mod, doSub, src1, s1LoPos, s1Lo,
                                    s1Hi, s1Q, temp);
                        // temp[0] may now hold a sign extension, so the
                        // carry goes to temp[1] and is read from the
                        // accumulator only after temp[1] is free.
                        carry = temp[1].ud();
                        lateCarry = true;
                    }

                    // Handle modifiers.
                    if (s0Lo.getNeg()) stub();
                    eaddHandleS1Neg(doSub, s1LoPos, s1Lo);

                    // Compute low 32 bits, saving carry/borrow.
                    if (dstLo.getOffset() != 0) {
                        // addc/subb goes to null (only the carry/borrow is
                        // wanted) and the low result comes from a plain add.
                        doSub ? g.subb(mod, g.null.retype(s0Lo.getType()), s0Lo,
                                s1LoPos)
                              : g.addc(mod, g.null.retype(s0Lo.getType()), s0Lo,
                                      s1Lo);
                        g.add(mod, dstLo, s0Lo, s1Lo);
                    } else if ((mod.getExecSize() > 1)
                            && !isUnitStride(dstLo)) {
                        // Strided destination: compute into temp[1] first
                        // and copy to dstLo below.
                        subDstLo = temp[1].ud();
                        doSub ? g.subb(mod, subDstLo, s0Lo, s1LoPos)
                              : g.addc(mod, subDstLo, s0Lo, s1Lo);
                    } else {
                        doSub ? g.subb(mod, dstLo, s0Lo, s1LoPos)
                              : g.addc(mod, dstLo, s0Lo, s1Lo);
                    }

                    // Retrieve carry from accumulator, unless it conflicts with subDstLo.
                    if (!lateCarry) g.mov(mod, carry, g.acc0.ud());

                    // Move low 32-bits to final resting place, if needed.
                    if (subDstLo.isValid()) g.mov(mod, dstLo, subDstLo);

                    // Retrieve carry from accumulator once subDstLo isn't needed.
                    if (lateCarry) g.mov(mod, carry, g.acc0.ud());

                    // A borrow is subtracted from the high half.
                    if (doSub) carry = -carry;

                    // Compute high 32 bits of sum.
                    if (s0Q && s1Q) {
                        g.add(mod, dstHi, s0Hi, s1Hi);
                        g.add(mod, dstHi, carry, dstHi);
                    } else if (s0Q)
                        g.add(mod, dstHi, carry, s0Hi);
                    else if (s1Q)
                        g.add(mod, dstHi, carry, s1Hi);
                    else
                        g.mov(mod, dstHi, carry);
                }
            }
        }
    }
501
502 template <typename DT = void, typename Generator>
503 static void eadd(Generator &g, const ngen::InstructionModifier &mod,
504 const ngen::RegData &dst, const ngen::RegData &src0,
505 const ngen::RegData &src1, const EmulationStrategy &strategy,
506 const EmulationState &state) {
507 eaddInternal<DT>(g, mod, dst, src0, src1, strategy, state);
508 }
509
510 template <typename DT = void, typename Generator>
511 static void eadd(Generator &g, const ngen::InstructionModifier &mod,
512 const ngen::RegData &dst, const ngen::RegData &src0,
513 ngen::Immediate src1, const EmulationStrategy &strategy,
514 const EmulationState &state) {
515 eaddInternal<DT>(g, mod, dst, src0, src1, strategy, state);
516 }
517
518 // Integer multiplication, emulating 32x32 multiplication as configured.
519 template <typename DT = void, typename S1, typename Generator>
520 static void emulInternal(Generator &g, const ngen::InstructionModifier &mod,
521 ngen::RegData dst, ngen::RegData src0, S1 src1,
522 const EmulationStrategy &strategy, const EmulationState &state) {
523 using namespace ngen;
524 applyDefaultType<DT>(dst);
525 applyDefaultType<DT>(src0);
526 applyDefaultType<DT>(src1);
527
528 bool dstD = isDW(dst);
529 bool dstQ = isQW(dst);
530 bool s0W = isW(src0);
531 bool s0D = isDW(src0);
532 bool s0Q = isQW(src0);
533 bool s1W = isW(src1);
534 bool s1D = isDW(src1);
535 bool s1Q = isQW(src1);
536 bool s1Immed = std::is_base_of<ngen::Immediate, S1>::value;
537
538 bool s0Signed = isSigned(src0.getType());
539 bool s1Signed = isSigned(src1.getType());
540 auto mulHiType = (s0Signed || s1Signed) ? DataType::d : DataType::ud;
541
542 bool emulate64 = strategy.emulate64;
543 emulate64 |= strategy.emulate64_mul;
544
545 if (s0Q || s1Q) {
546 stub();
547 } else if (dstQ && s0W && s1W) {
548 RegData dstLo, dstHi;
549 splitToDW(dst, dstLo, dstHi);
550
551 g.mul(mod, dstLo, src0, src1);
552
553 dstHi.setType(mulHiType);
554 dstLo.setType(mulHiType);
555
556 if (s0Signed || s1Signed)
557 g.asr(mod, dstHi, dstLo, 31);
558 else
559 g.mov(mod, dstHi, 0);
560 } else if (dstQ && s0W && s1D) {
561 stub();
562 } else if (dstQ && s0D
563 && ((s1W && !s1Immed) || ((s1W || s1D) && emulate64))) {
564 RegData dstLo, dstHi;
565 splitToDW(dst, dstLo, dstHi);
566
567 auto acc = g.acc0.retype(mulHiType)[dstLo.getOffset()](
568 dstLo.getHS());
569
570 g.mul(mod, acc, src0, lowWord(src1));
571 if (s1D)
572 g.mach(mod, dstLo, src0, src1);
573 else
574 g.mach(mod, dstLo, src0, int32_t(0));
575 g.mov(mod, dstHi, dstLo);
576 g.mov(mod, dstLo, acc);
577 } else if (dstD && s0D && s1D && strategy.emulateDWxDW) {
578 auto acc = g.acc0.retype(mulHiType)[dst.getOffset()](dst.getHS());
579 auto dummy = g.null.retype(mulHiType)[dst.getOffset()](dst.getHS());
580
581 g.mul(mod, acc, src0, lowWord(src1));
582
583 if (g.hardware < HW::Gen10) {
584 g.mach(mod, dummy, src0, src1);
585 g.mov(mod, dst, acc);
586 } else {
587 g.macl(mod, dst, src0, src1);
588 }
589 } else
590 g.mul(mod, dst, src0, src1);
591 }
592
593 template <typename DT = void, typename Generator>
594 static void emul(Generator &g, const ngen::InstructionModifier &mod,
595 const ngen::RegData &dst, const ngen::RegData &src0,
596 const ngen::RegData &src1, const EmulationStrategy &strategy,
597 const EmulationState &state) {
598 emulInternal<DT>(g, mod, dst, src0, src1, strategy, state);
599 }
600
601 template <typename DT = void, typename Generator>
602 static void emul(Generator &g, const ngen::InstructionModifier &mod,
603 const ngen::RegData &dst, const ngen::RegData &src0,
604 ngen::Immediate src1, const EmulationStrategy &strategy,
605 const EmulationState &state) {
606 emulInternal<DT>(g, mod, dst, src0, src1, strategy, state);
607 }
608
609 template <typename S1, typename Generator>
610 static void emul32High(Generator &g, const ngen::InstructionModifier &mod,
611 const ngen::RegData &dstHi, const ngen::RegData &src0,
612 const S1 &src1) {
613 g.mul(mod, g.acc0.ud(dstHi.getOffset()), src0, lowWord(src1));
614 g.mach(mod, dstHi, src0, src1);
615 }
616
617 // Shift left, emulating 64-bit arithmetic if configured.
618 template <typename DT = void, typename Generator>
619 static void eshl(Generator &g, const ngen::InstructionModifier &mod,
620 ngen::RegData dst, ngen::RegData src0, uint16_t src1,
621 const EmulationStrategy &strategy, const EmulationState &state) {
622 using namespace ngen;
623 const auto &temp = state.temp;
624
625 applyDefaultType<DT>(dst);
626 applyDefaultType<DT>(src0);
627
628 bool dstQ = isQW(dst);
629 bool s0Q = isQW(src0);
630
631 if (src1 == 0) {
632 emov<DT, Generator>(g, mod, dst, src0, strategy);
633 return;
634 }
635
636 if (dstQ && strategy.emulate64 && !strategy.noemulate64_shift) {
637 if (src1 >= 32) stub();
638
639 RegData dstHi, dstLo, s0Hi, s0Lo;
640
641 auto acc = temp[0].ud();
642
643 splitToDW(dst, dstLo, dstHi);
644
645 if (s0Q) {
646 splitToDW(dst, s0Lo, s0Hi);
647
648 g.shr(mod, acc, s0Lo, uint16_t(32 - src1));
649 g.shl(mod, dstHi, s0Hi, src1);
650 g.shl(mod, dstLo, s0Lo, src1);
651 g.or_(mod, dstHi, acc, dstHi);
652 } else {
653 dstHi.setType(DataType::ud);
654 g.shl(mod, dstLo, src0, src1);
655 g.shr(mod, dstHi, src0, uint16_t(32 - src1));
656 }
657 } else {
658 if (s0Q && !dstQ) downgradeToDW(src0);
659 g.shl(mod, dst, src0, src1);
660 }
661 }
662
663 // Shift right, emulating 64-bit arithmetic if configured.
664 template <typename DT = void, typename Generator>
665 static void eshr(Generator &g, const ngen::InstructionModifier &mod,
666 ngen::RegData dst, ngen::RegData src0, uint16_t src1,
667 const EmulationStrategy &strategy, const EmulationState &state) {
668 using namespace ngen;
669 const auto &temp = state.temp;
670
671 applyDefaultType<DT>(dst);
672 applyDefaultType<DT>(src0);
673
674 bool dstQ = isQW(dst);
675 bool s0Q = isQW(src0);
676
677 if (src1 == 0) {
678 emov<DT, Generator>(g, mod, dst, src0, strategy);
679 return;
680 }
681
682 if (dstQ && strategy.emulate64 && !strategy.noemulate64_shift) {
683 if (src1 >= 32) stub();
684
685 RegData dstHi, dstLo, s0Hi, s0Lo;
686
687 auto acc = temp[0].ud();
688
689 splitToDW(dst, dstLo, dstHi);
690
691 if (s0Q) {
692 splitToDW(dst, s0Lo, s0Hi);
693
694 g.shl(mod, acc, s0Lo, uint16_t(32 - src1));
695 g.shr(mod, dstLo, s0Lo, src1);
696 isSigned(src0.getType()) ? g.asr(mod, dstHi, s0Hi, src1)
697 : g.shr(mod, dstHi, s0Hi, src1);
698 g.or_(mod, dstLo, acc, dstLo);
699 } else {
700 dstLo.setType(dstHi.getType());
701 isSigned(src0.getType()) ? g.asr(mod, dstLo, src0, src1)
702 : g.shr(mod, dstLo, src0, src1);
703 g.mov(mod, dstHi, uint16_t(0));
704 }
705 } else {
706 if (s0Q && !dstQ) downgradeToDW(src0);
707 isSigned(src0.getType()) ? g.asr(mod, dst, src0, src1)
708 : g.shr(mod, dst, src0, src1);
709 }
710 }
711
712 // Multiply by a constant, optimizing for power-of-2 constants and emulating 64-bit arithmetic if configured.
713 template <typename DT = void, typename Generator>
714 static void emulConstant(Generator &g, const ngen::InstructionModifier &mod,
715 const ngen::RegData &dst, const ngen::RegData &src0, int32_t src1,
716 const EmulationStrategy &strategy, const EmulationState &state) {
717 if (src1 == 0)
718 emov<DT>(g, mod, dst, uint16_t(0), strategy);
719 else if (src1 == 1) {
720 if (dst != src0) emov<DT>(g, mod, dst, src0, strategy);
721 } else if (ngen::utils::is_zero_or_pow2(src1))
722 eshl<DT>(g, mod, dst, src0, uint16_t(ngen::utils::log2(src1)),
723 strategy, state);
724 else if (src1 > 0)
725 emul<DT>(g, mod, dst, src0, uint32_t(src1), strategy, state);
726 else
727 emul<DT>(g, mod, dst, src0, int32_t(src1), strategy, state);
728 }
}; // struct EmulationImplementation
730
731} // namespace jit
732} // namespace gpu
733} // namespace impl
734} // namespace dnnl
735
// EMULATION_FORWARD expands to a set of member function templates (emov,
// eadd, emul, eshl, eshr, emulConstant, emul32High) that forward to the
// static EmulationImplementation methods of the same name, passing *this as
// the generator argument. Because the expansion uses *this and refers to
// EmulationStrategy, EmulationState and EmulationImplementation without
// qualification, it has to be expanded inside a class body where those
// names are visible.
#define EMULATION_FORWARD \
    template <typename DT = void> \
    void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::RegData src0, const EmulationStrategy &strategy) { \
        EmulationImplementation::emov<DT>(*this, mod, dst, src0, strategy); \
    } \
    template <typename DT = void> \
    void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::Immediate src0, const EmulationStrategy &strategy) { \
        EmulationImplementation::emov<DT>(*this, mod, dst, src0, strategy); \
    } \
    template <typename DT = void> \
    void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, const ngen::RegData &src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eadd<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, ngen::Immediate src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eadd<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, const ngen::RegData &src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::emul<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, ngen::Immediate src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::emul<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void eshl(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::RegData src0, uint16_t src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eshl<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void eshr(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::RegData src0, uint16_t src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eshr<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void emulConstant(const ngen::InstructionModifier &mod, \
            const ngen::RegData &dst, const ngen::RegData &src0, int32_t src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::emulConstant<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename S1> \
    void emul32High(const ngen::InstructionModifier &mod, \
            const ngen::RegData &dstHi, const ngen::RegData &src0, \
            const S1 &src1) { \
        EmulationImplementation::emul32High(*this, mod, dstHi, src0, src1); \
    }
802
803#endif
804