1 | /******************************************************************************* |
2 | * Copyright 2020-2022 Intel Corporation |
3 | * |
4 | * Licensed under the Apache License, Version 2.0 (the "License"); |
5 | * you may not use this file except in compliance with the License. |
6 | * You may obtain a copy of the License at |
7 | * |
8 | * http://www.apache.org/licenses/LICENSE-2.0 |
9 | * |
10 | * Unless required by applicable law or agreed to in writing, software |
11 | * distributed under the License is distributed on an "AS IS" BASIS, |
12 | * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
13 | * See the License for the specific language governing permissions and |
14 | * limitations under the License. |
15 | *******************************************************************************/ |
16 | |
17 | #ifndef EMULATION_HPP |
18 | #define EMULATION_HPP |
19 | |
#include <cstdint>
#include <cstdlib>
#include <exception>
#include <stdexcept>
#include <type_traits>
21 | |
22 | namespace dnnl { |
23 | namespace impl { |
24 | namespace gpu { |
25 | namespace jit { |
26 | |
27 | struct EmulationStrategy { |
28 | // Emulate 64-bit arithmetic (required for GenXLP) |
29 | bool emulate64 = false; |
30 | // Emulate DW x DW -> DW multiplication (required for Gen12) |
31 | bool emulateDWxDW = false; |
32 | // Use 32-bit adds for 64-bit arithmetic, assuming no 2^32 boundaries crossed. |
33 | bool emulate64_add32 = false; |
34 | // Emulate DW x DW -> QW multiplication (XeHPC) |
35 | bool emulate64_mul = false; |
36 | // Emulate QW and/or/xor operations (XeHPC) |
37 | bool emulate64_logic = false; |
38 | // Don't emulate QW shl/shr (XeHPC) |
39 | bool noemulate64_shift = false; |
40 | |
41 | EmulationStrategy() = default; |
42 | EmulationStrategy(ngen::HW hw_, int stepping = 0) { |
43 | using namespace ngen; |
44 | if (hw_ == HW::Gen11) emulate64 = true; |
45 | if (hw_ >= HW::Gen11) emulateDWxDW = true; |
46 | if (hw_ == HW::Gen12LP) emulate64 = true; |
47 | if (hw_ == HW::XeHPG) emulate64 = true; |
48 | if (hw_ == HW::XeHPC) { |
49 | if (stepping >= SteppingPVCXTB0) |
50 | emulate64_mul = emulate64_logic = true; |
51 | else |
52 | emulate64 = noemulate64_shift = true; |
53 | } |
54 | } |
55 | }; |
56 | |
// Scratch resources handed to the emulation routines.
struct EmulationState {
    // Temporary GRFs for use in emulation sequences.
    ngen::GRF temp[2];
    // Flag register for use in emulating 64-bit adds (optional; when valid it
    // avoids temporary registers/acc).
    ngen::FlagRegister flag;
    // Channel offset to use with flag register.
    int flagOffset = 0;
};
63 | |
64 | // Implementation wrapped as static methods in non-instantiated class. |
65 | // Clients should declare EmulationImplementation as a friend. |
66 | struct EmulationImplementation { |
// Reports an operand combination that has no emulation sequence by throwing
// std::runtime_error("Unimplemented"). Never returns.
[[noreturn]] static void stub() {
    throw std::runtime_error {"Unimplemented"};
}
70 | |
71 | template <typename DT, typename O> |
72 | static void applyDefaultType(O &op) { |
73 | using namespace ngen; |
74 | if (op.getType() == DataType::invalid) op.setType(getDataType<DT>()); |
75 | } |
76 | |
77 | template <typename O> |
78 | static bool isQW(const O &op) { |
79 | using namespace ngen; |
80 | using dnnl::impl::utils::one_of; |
81 | return one_of(op.getType(), DataType::q, DataType::uq); |
82 | } |
83 | |
84 | template <typename O> |
85 | static bool isDW(const O &op) { |
86 | using namespace ngen; |
87 | using dnnl::impl::utils::one_of; |
88 | return one_of(op.getType(), DataType::d, DataType::ud); |
89 | } |
90 | |
91 | template <typename O> |
92 | static bool isW(const O &op) { |
93 | using namespace ngen; |
94 | using dnnl::impl::utils::one_of; |
95 | return one_of(op.getType(), DataType::w, DataType::uw); |
96 | } |
97 | |
// Generic operand comparison: defers to operator== of the two types.
template <typename T1, typename T2>
static bool equal(const T1 &o1, const T2 &o2) {
    const bool same = (o1 == o2);
    return same;
}
102 | static bool equal(const ngen::RegData &o1, const ngen::Immediate &o2) { |
103 | return false; |
104 | } |
105 | |
106 | static void downgradeToDW(ngen::RegData &op) { |
107 | using namespace ngen; |
108 | if (isQW(op)) { |
109 | op.setType( |
110 | (op.getType() == DataType::q) ? DataType::d : DataType::ud); |
111 | op.setOffset(op.getOffset() * 2); |
112 | } |
113 | } |
114 | |
115 | static void downgradeToDW(ngen::Immediate &op) { |
116 | using namespace ngen; |
117 | if (isQW(op)) |
118 | op.setType( |
119 | (op.getType() == DataType::q) ? DataType::d : DataType::ud); |
120 | } |
121 | |
122 | // Get the DW equivalent of a QW region. |
123 | static void makeDWPair(ngen::RegData &op, int esize) { |
124 | if (isQW(op)) { |
125 | downgradeToDW(op); |
126 | if (op.getHS() > 1) { |
127 | if (op.getVS() != op.getHS() * op.getWidth()) stub(); |
128 | op.setRegion(op.getHS() * 2, 2, 1); |
129 | } else { |
130 | auto newVS = op.getVS() * 2; |
131 | if (esize == op.getWidth()) newVS = esize * 2; |
132 | op.setRegion(newVS, op.getWidth() * 2, 1); |
133 | } |
134 | } |
135 | } |
136 | |
137 | // Split a register into DW pairs. |
138 | static void splitToDW( |
139 | ngen::RegData in, ngen::RegData &outLo, ngen::RegData &outHi) { |
140 | using namespace ngen; |
141 | bool isQ = (in.getType() == DataType::q); |
142 | bool isUQ = (in.getType() == DataType::uq); |
143 | |
144 | if (isQ || isUQ) { |
145 | outLo = in; |
146 | outLo.setRegion(in.getVS() * 2, in.getWidth(), in.getHS() * 2); |
147 | outLo.setOffset(in.getOffset() * 2); |
148 | outLo.setType(DataType::ud); |
149 | |
150 | outHi = outLo; |
151 | outHi.setOffset(in.getOffset() * 2 + 1); |
152 | outHi.setType(isQ ? DataType::d : DataType::ud); |
153 | } else { |
154 | outLo = in; |
155 | outHi = Subregister {}; // invalid |
156 | } |
157 | } |
158 | |
159 | // Split an ngen::Immediate into DW pairs. |
160 | static void splitToDW(const ngen::Immediate &in, ngen::Immediate &outLo, |
161 | ngen::Immediate &outHi) { |
162 | using namespace ngen; |
163 | bool isQ = (in.getType() == DataType::q); |
164 | bool isUQ = (in.getType() == DataType::uq); |
165 | |
166 | if (isQ || isUQ) { |
167 | outLo = uint32_t(static_cast<uint64_t>(in)); |
168 | outLo.setType(DataType::ud); |
169 | |
170 | outHi = uint32_t(static_cast<uint64_t>(in) >> 32); |
171 | outHi.setType(isQ ? DataType::d : DataType::ud); |
172 | } else { |
173 | outLo = in; |
174 | outHi = uint16_t(0); |
175 | } |
176 | } |
177 | |
178 | static ngen::RegData lowWord(ngen::RegData in) { |
179 | using namespace ngen; |
180 | if (isW(in)) return in; |
181 | |
182 | auto outLo = in; |
183 | outLo.setRegion(in.getVS() * 2, in.getWidth(), in.getHS() * 2); |
184 | outLo.setOffset(in.getOffset() * 2); |
185 | outLo.setType(DataType::uw); |
186 | |
187 | return outLo; |
188 | } |
189 | |
190 | static ngen::Immediate lowWord(const ngen::Immediate &in) { |
191 | return uint16_t(static_cast<uint64_t>(in) & 0xffff); |
192 | } |
193 | |
194 | static bool isUnitStride(const ngen::RegData &rd) { |
195 | return (rd.getHS() == 1 && rd.getVS() == rd.getWidth()); |
196 | } |
197 | |
198 | // Move, emulating 64-bit moves with 32-bit (generally a good idea). |
199 | template <typename DT = void, typename Generator> |
200 | static void emov(Generator &g, const ngen::InstructionModifier &mod, |
201 | ngen::RegData dst, ngen::RegData src0, |
202 | const EmulationStrategy &strategy) { |
203 | using namespace ngen; |
204 | applyDefaultType<DT>(dst); |
205 | applyDefaultType<DT>(src0); |
206 | |
207 | bool dstQ = isQW(dst); |
208 | bool s0Q = isQW(src0); |
209 | bool s0D = isDW(src0); |
210 | bool isDF = (src0.getType() == DataType::df |
211 | && dst.getType() == DataType::df); |
212 | bool unaligned = (mod.getExecSize() > 1 && src0.getHS() != 0 |
213 | && src0.getOffset() != dst.getOffset()); |
214 | |
215 | if ((dstQ && s0D) && strategy.emulate64) { |
216 | if (src0.getNeg()) stub(); |
217 | bool s0Signed = isSigned(src0.getType()); |
218 | RegData dstHi, dstLo; |
219 | splitToDW(dst, dstLo, dstHi); |
220 | g.mov(mod, dstLo, src0); |
221 | if (!s0Signed) { |
222 | g.mov(mod, dstHi, 0); |
223 | } else { |
224 | g.asr(mod, dstHi, dstLo, uint16_t(31)); |
225 | } |
226 | } else if ((isDF && unaligned && g.hardware >= ngen::HW::XeHP) |
227 | || ((dstQ || s0Q) && (strategy.emulate64))) { |
228 | if (dstQ != s0Q) stub(); |
229 | |
230 | auto mod2x = mod; |
231 | mod2x.setExecSize(mod.getExecSize() * 2); |
232 | |
233 | makeDWPair(dst, mod.getExecSize()); |
234 | makeDWPair(src0, mod.getExecSize()); |
235 | g.mov(mod2x, dst, src0); |
236 | } else if (dst.getType() == DataType::f |
237 | && src0.getType() == DataType::bf |
238 | && (src0.getHS() != 1 || mod.getExecSize() == 1)) { |
239 | // Emulate bf16->f32 upconversion |
240 | dst.setType(DataType::ud); |
241 | src0.setType(DataType::uw); |
242 | g.shl(mod, dst, src0, 16); |
243 | } else |
244 | g.mov(mod, dst, src0); |
245 | } |
246 | |
247 | template <typename DT = void, typename Generator> |
248 | static void emov(Generator &g, const ngen::InstructionModifier &mod, |
249 | ngen::RegData dst, ngen::Immediate src0, |
250 | const EmulationStrategy &strategy) { |
251 | using namespace ngen; |
252 | applyDefaultType<DT>(dst); |
253 | applyDefaultType<DT>(src0); |
254 | |
255 | bool dstQ = isQW(dst); |
256 | bool s0Q = isQW(src0); |
257 | |
258 | if ((dstQ || s0Q) && strategy.emulate64) { |
259 | if (!dstQ) stub(); |
260 | |
261 | RegData dstHi, dstLo; |
262 | Immediate s0Hi = 0, s0Lo = 0; |
263 | |
264 | splitToDW(src0, s0Lo, s0Hi); |
265 | |
266 | if (static_cast<uint64_t>(s0Lo) == static_cast<uint64_t>(s0Hi) |
267 | && dst.getHS() <= 1) { |
268 | auto mod2x = mod; |
269 | mod2x.setExecSize(mod.getExecSize() * 2); |
270 | |
271 | downgradeToDW(dst); |
272 | dst.setRegion(0, 0, 1); |
273 | g.mov(mod2x, dst, s0Lo); |
274 | } else { |
275 | splitToDW(dst, dstLo, dstHi); |
276 | g.mov(mod, dstLo, s0Lo); |
277 | g.mov(mod, dstHi, s0Hi); |
278 | } |
279 | } else |
280 | g.mov(mod, dst, src0); |
281 | } |
282 | |
283 | template <typename Generator> |
284 | static void eaddSignExtend1(Generator &g, |
285 | const ngen::InstructionModifier &mod, bool &doSub, |
286 | const ngen::Immediate &src1, ngen::Immediate &s1LoPos, |
287 | const ngen::Immediate &s1Lo, const ngen::Immediate &s1Hi, bool &s1Q, |
288 | const ngen::GRF (&temp)[2]) { |
289 | using namespace ngen; |
290 | uint64_t raw = static_cast<uint64_t>(src1); |
291 | if (src1.getType() == DataType::d) { |
292 | int32_t val = raw; |
293 | s1LoPos = uint32_t(std::abs(val)); |
294 | doSub = (val < 0); |
295 | } else if (src1.getType() == DataType::w) { |
296 | int16_t val = raw; |
297 | s1LoPos = uint16_t(std::abs(val)); |
298 | doSub = (val < 0); |
299 | } |
300 | } |
301 | |
302 | template <typename Generator> |
303 | static void eaddSignExtend1(Generator &g, |
304 | const ngen::InstructionModifier &mod, bool &doSub, |
305 | const ngen::RegData &src1, ngen::RegData &s1LoPos, |
306 | ngen::RegData &s1Lo, ngen::RegData &s1Hi, bool &s1Q, |
307 | const ngen::GRF (&temp)[2]) { |
308 | using namespace ngen; |
309 | s1Q = true; |
310 | s1Hi = temp[0].d(); |
311 | if (s1Lo.getNeg()) { |
312 | g.asr(mod, s1Hi, -s1Lo, uint16_t(31)); |
313 | s1Hi = -s1Hi; |
314 | } else |
315 | g.asr(mod, s1Hi, s1Lo, uint16_t(31)); |
316 | s1Lo.setType(DataType::ud); |
317 | } |
318 | |
319 | static void eaddHandleS1Neg( |
320 | bool &doSub, ngen::RegData &s1LoPos, const ngen::RegData &s1Lo) { |
321 | if (isSigned(s1Lo.getType())) stub(); |
322 | doSub = s1Lo.getNeg(); |
323 | s1LoPos = -s1Lo; |
324 | } |
325 | |
326 | static void eaddHandleS1Neg(bool &doSub, const ngen::Immediate &s1LoPos, |
327 | const ngen::Immediate &s1Lo) { |
328 | /* no-op */ |
329 | } |
330 | |
331 | template <typename Generator> |
332 | static void eaddFixupQD(Generator &g, const ngen::InstructionModifier &mod, |
333 | const ngen::FlagRegister &flag, const ngen::RegData &dstHi, |
334 | const ngen::RegData &src1) { |
335 | if ((src1.getBytes() < 8) && isSigned(src1.getType())) { |
336 | // Add sign extension of src1 to high 32 bits of dst (inefficient but rarely used path). |
337 | g.cmp(mod | (src1.getNeg() ? g.gt : g.lt) | flag, src1, 0); |
338 | g.add(mod | flag, dstHi, dstHi, -1); |
339 | } |
340 | } |
341 | |
342 | template <typename Generator> |
343 | static void eaddFixupQD(Generator &g, const ngen::InstructionModifier &mod, |
344 | const ngen::FlagRegister &flag, const ngen::RegData &dstHi, |
345 | const ngen::Immediate &src1) { |
346 | /* no-op */ |
347 | } |
348 | |
349 | static bool eaddIsNegative(const ngen::RegData &r) { return r.getNeg(); } |
350 | |
351 | static bool eaddIsNegative(const ngen::Immediate &i) { |
352 | return int32_t(uint64_t(i)) < 0; |
353 | } |
354 | |
// Integer addition, emulating 64-bit arithmetic if configured.
//
// Emits dst = src0 + src1, where src1 is a register or an immediate (S1).
// Untyped operands first receive DT as their type. Paths, in priority order:
//  - QW dst with strategy.emulate64_add32: independent 32-bit adds on the low
//    and high halves, with no carry between them.
//  - !strategy.emulate64: one native add.
//  - strategy.emulate64, non-QW dst: sources are downgraded to DW and added.
//  - strategy.emulate64, QW dst: the sum is assembled from DW halves, using
//    either state.flag (when valid and src0 is not negated) or addc/subb with
//    the accumulator and the temporaries in state.temp.
template <typename DT = void, typename S1, typename Generator>
static void eaddInternal(Generator &g, const ngen::InstructionModifier &mod,
        ngen::RegData dst, ngen::RegData src0, S1 src1,
        const EmulationStrategy &strategy, const EmulationState &state) {
    using namespace ngen;
    const auto &temp = state.temp;

    applyDefaultType<DT>(dst);
    applyDefaultType<DT>(src0);
    applyDefaultType<DT>(src1);

    bool dstQ = isQW(dst);
    bool s0Q = isQW(src0);
    bool s1Q = isQW(src1);

    if (dstQ && strategy.emulate64_add32) {
        // Cheap path: low and high halves are handled independently.
        RegData dstHi, dstLo, s0Hi, s0Lo;
        S1 s1Hi, s1Lo;

        splitToDW(dst, dstLo, dstHi);
        splitToDW(src0, s0Lo, s0Hi);
        splitToDW(src1, s1Lo, s1Hi);
        g.add(mod, dstLo, s0Lo, s1Lo);

        // High half: sum of the high halves that exist; a lone high half is
        // copied (skipped if already in place); zero if neither source is QW.
        if (s0Q && s1Q)
            g.add(mod, dstHi, s0Hi, s1Hi);
        else if (s0Q) {
            if (!equal(dstHi, s0Hi)) g.mov(mod, dstHi, s0Hi);
        } else if (s1Q) {
            if (!equal(dstHi, s1Hi)) g.mov(mod, dstHi, s1Hi);
        } else
            g.mov(mod, dstHi, uint16_t(0));
    } else if (!strategy.emulate64)
        g.add(mod, dst, src0, src1);
    else {
        if (!dstQ) {
            // Only the low DW of any QW source contributes to a non-QW dst.
            downgradeToDW(src0);
            downgradeToDW(src1);
            g.add(mod, dst, src0, src1);
        } else {
            RegData dstHi, dstLo, s0Hi, s0Lo;
            S1 s1Hi, s1Lo, s1LoPos;
            FlagRegister flag = state.flag;

            splitToDW(dst, dstLo, dstHi);
            splitToDW(src0, s0Lo, s0Hi);
            splitToDW(src1, s1Lo, s1Hi);
            s1LoPos = s1Lo;

            // Signedness of the low halves. splitToDW types the low half of
            // a QW operand as ud, so these are only true for signed non-QW
            // operands.
            bool s0Signed = isSigned(s0Lo.getType());
            bool s1Signed = isSigned(s1Lo.getType());

            if (flag.isValid() && !eaddIsNegative(s0Lo)) {
                // Use flag register + ov.
                auto Mx = g.ExecutionOffset(state.flagOffset);
                bool neg = eaddIsNegative(s1Lo);

                auto s0LoUD = s0Lo;
                auto s1LoMod = s1Lo;
                s0LoUD.setType(DataType::ud);
                // A signed register src1 is added as unsigned here; its sign
                // is accounted for afterwards by eaddFixupQD.
                if (s1Signed
                        && !std::is_base_of<ngen::Immediate, S1>::value) {
                    s1LoMod.setType(DataType::ud);
                    neg = false;
                }

                // Low halves, recording overflow in the flag register.
                g.add(mod | Mx | g.ov | flag, dstLo, s0LoUD, s1LoMod);
                if (s0Q && s1Q)
                    g.add(mod, dstHi, s0Hi, s1Hi);
                else if (s0Q && !equal(dstHi, s0Hi))
                    g.mov(mod, dstHi, s0Hi);
                else if (s1Q && !equal(dstHi, s1Hi))
                    g.mov(mod, dstHi, s1Hi);
                else if (!s0Q && !s1Q)
                    g.mov(mod, dstHi, 0);
                // Apply the carry (+1) or borrow (-1) under the flag.
                g.add(mod | Mx | flag, dstHi, dstHi, neg ? -1 : +1);
                eaddFixupQD(g, mod | Mx, flag, dstHi, src1);
            } else {
                // Slow path: addc/subb + acc.
                RegData carry = temp[0].ud();
                bool lateCarry = false;
                RegData subDstLo;
                bool doSub = false;

                // For :uq + :d or :q + :ud, sign extend 32-bit input to 64 bits.
                if (s0Signed != s1Signed) {
                    if (s0Signed) {
                        s0Q = true;
                        s0Hi = temp[0].d();
                        g.asr(mod, s0Hi, s0Lo, uint16_t(31));
                        s0Lo.setType(DataType::ud);
                        if (s0Lo.getNeg()) s0Hi = -s0Hi;
                    } else
                        eaddSignExtend1(g, mod, doSub, src1, s1LoPos, s1Lo,
                                s1Hi, s1Q, temp);
                    // temp[0] may now hold a sign-extended high half, so the
                    // carry goes to temp[1] and is read from the accumulator
                    // late (temp[1] may also stage the low result below).
                    carry = temp[1].ud();
                    lateCarry = true;
                }

                // Handle modifiers.
                if (s0Lo.getNeg()) stub();
                eaddHandleS1Neg(doSub, s1LoPos, s1Lo);

                // Compute low 32 bits, saving carry/borrow.
                if (dstLo.getOffset() != 0) {
                    // Nonzero destination offset: addc/subb targets null and
                    // is used only for its carry/borrow; the low result is
                    // produced by a separate add.
                    doSub ? g.subb(mod, g.null.retype(s0Lo.getType()), s0Lo,
                            s1LoPos)
                          : g.addc(mod, g.null.retype(s0Lo.getType()), s0Lo,
                                  s1Lo);
                    g.add(mod, dstLo, s0Lo, s1Lo);
                } else if ((mod.getExecSize() > 1)
                        && !isUnitStride(dstLo)) {
                    // Non-unit-stride destination: stage the low result in
                    // temp[1] and move it to dstLo afterwards.
                    subDstLo = temp[1].ud();
                    doSub ? g.subb(mod, subDstLo, s0Lo, s1LoPos)
                          : g.addc(mod, subDstLo, s0Lo, s1Lo);
                } else {
                    doSub ? g.subb(mod, dstLo, s0Lo, s1LoPos)
                          : g.addc(mod, dstLo, s0Lo, s1Lo);
                }

                // Retrieve carry from accumulator, unless it conflicts with subDstLo.
                if (!lateCarry) g.mov(mod, carry, g.acc0.ud());

                // Move low 32-bits to final resting place, if needed.
                if (subDstLo.isValid()) g.mov(mod, dstLo, subDstLo);

                // Retrieve carry from accumulator once subDstLo isn't needed.
                if (lateCarry) g.mov(mod, carry, g.acc0.ud());

                // After a subtraction the saved value is a borrow: negate it.
                if (doSub) carry = -carry;

                // Compute high 32 bits of sum.
                if (s0Q && s1Q) {
                    g.add(mod, dstHi, s0Hi, s1Hi);
                    g.add(mod, dstHi, carry, dstHi);
                } else if (s0Q)
                    g.add(mod, dstHi, carry, s0Hi);
                else if (s1Q)
                    g.add(mod, dstHi, carry, s1Hi);
                else
                    g.mov(mod, dstHi, carry);
            }
        }
    }
}
501 | |
// Integer addition with a register second source; see eaddInternal for the
// emulation paths. Operands are copied into eaddInternal, so the caller's
// registers objects are not modified.
template <typename DT = void, typename Generator>
static void eadd(Generator &g, const ngen::InstructionModifier &mod,
        const ngen::RegData &dst, const ngen::RegData &src0,
        const ngen::RegData &src1, const EmulationStrategy &strategy,
        const EmulationState &state) {
    eaddInternal<DT>(g, mod, dst, src0, src1, strategy, state);
}

// Integer addition with an immediate second source; see eaddInternal for the
// emulation paths.
template <typename DT = void, typename Generator>
static void eadd(Generator &g, const ngen::InstructionModifier &mod,
        const ngen::RegData &dst, const ngen::RegData &src0,
        ngen::Immediate src1, const EmulationStrategy &strategy,
        const EmulationState &state) {
    eaddInternal<DT>(g, mod, dst, src0, src1, strategy, state);
}
517 | |
518 | // Integer multiplication, emulating 32x32 multiplication as configured. |
519 | template <typename DT = void, typename S1, typename Generator> |
520 | static void emulInternal(Generator &g, const ngen::InstructionModifier &mod, |
521 | ngen::RegData dst, ngen::RegData src0, S1 src1, |
522 | const EmulationStrategy &strategy, const EmulationState &state) { |
523 | using namespace ngen; |
524 | applyDefaultType<DT>(dst); |
525 | applyDefaultType<DT>(src0); |
526 | applyDefaultType<DT>(src1); |
527 | |
528 | bool dstD = isDW(dst); |
529 | bool dstQ = isQW(dst); |
530 | bool s0W = isW(src0); |
531 | bool s0D = isDW(src0); |
532 | bool s0Q = isQW(src0); |
533 | bool s1W = isW(src1); |
534 | bool s1D = isDW(src1); |
535 | bool s1Q = isQW(src1); |
536 | bool s1Immed = std::is_base_of<ngen::Immediate, S1>::value; |
537 | |
538 | bool s0Signed = isSigned(src0.getType()); |
539 | bool s1Signed = isSigned(src1.getType()); |
540 | auto mulHiType = (s0Signed || s1Signed) ? DataType::d : DataType::ud; |
541 | |
542 | bool emulate64 = strategy.emulate64; |
543 | emulate64 |= strategy.emulate64_mul; |
544 | |
545 | if (s0Q || s1Q) { |
546 | stub(); |
547 | } else if (dstQ && s0W && s1W) { |
548 | RegData dstLo, dstHi; |
549 | splitToDW(dst, dstLo, dstHi); |
550 | |
551 | g.mul(mod, dstLo, src0, src1); |
552 | |
553 | dstHi.setType(mulHiType); |
554 | dstLo.setType(mulHiType); |
555 | |
556 | if (s0Signed || s1Signed) |
557 | g.asr(mod, dstHi, dstLo, 31); |
558 | else |
559 | g.mov(mod, dstHi, 0); |
560 | } else if (dstQ && s0W && s1D) { |
561 | stub(); |
562 | } else if (dstQ && s0D |
563 | && ((s1W && !s1Immed) || ((s1W || s1D) && emulate64))) { |
564 | RegData dstLo, dstHi; |
565 | splitToDW(dst, dstLo, dstHi); |
566 | |
567 | auto acc = g.acc0.retype(mulHiType)[dstLo.getOffset()]( |
568 | dstLo.getHS()); |
569 | |
570 | g.mul(mod, acc, src0, lowWord(src1)); |
571 | if (s1D) |
572 | g.mach(mod, dstLo, src0, src1); |
573 | else |
574 | g.mach(mod, dstLo, src0, int32_t(0)); |
575 | g.mov(mod, dstHi, dstLo); |
576 | g.mov(mod, dstLo, acc); |
577 | } else if (dstD && s0D && s1D && strategy.emulateDWxDW) { |
578 | auto acc = g.acc0.retype(mulHiType)[dst.getOffset()](dst.getHS()); |
579 | auto dummy = g.null.retype(mulHiType)[dst.getOffset()](dst.getHS()); |
580 | |
581 | g.mul(mod, acc, src0, lowWord(src1)); |
582 | |
583 | if (g.hardware < HW::Gen10) { |
584 | g.mach(mod, dummy, src0, src1); |
585 | g.mov(mod, dst, acc); |
586 | } else { |
587 | g.macl(mod, dst, src0, src1); |
588 | } |
589 | } else |
590 | g.mul(mod, dst, src0, src1); |
591 | } |
592 | |
// Integer multiplication with a register second source; see emulInternal
// for the emulation cases.
template <typename DT = void, typename Generator>
static void emul(Generator &g, const ngen::InstructionModifier &mod,
        const ngen::RegData &dst, const ngen::RegData &src0,
        const ngen::RegData &src1, const EmulationStrategy &strategy,
        const EmulationState &state) {
    emulInternal<DT>(g, mod, dst, src0, src1, strategy, state);
}

// Integer multiplication with an immediate second source; see emulInternal
// for the emulation cases.
template <typename DT = void, typename Generator>
static void emul(Generator &g, const ngen::InstructionModifier &mod,
        const ngen::RegData &dst, const ngen::RegData &src0,
        ngen::Immediate src1, const EmulationStrategy &strategy,
        const EmulationState &state) {
    emulInternal<DT>(g, mod, dst, src0, src1, strategy, state);
}
608 | |
609 | template <typename S1, typename Generator> |
610 | static void emul32High(Generator &g, const ngen::InstructionModifier &mod, |
611 | const ngen::RegData &dstHi, const ngen::RegData &src0, |
612 | const S1 &src1) { |
613 | g.mul(mod, g.acc0.ud(dstHi.getOffset()), src0, lowWord(src1)); |
614 | g.mach(mod, dstHi, src0, src1); |
615 | } |
616 | |
617 | // Shift left, emulating 64-bit arithmetic if configured. |
618 | template <typename DT = void, typename Generator> |
619 | static void eshl(Generator &g, const ngen::InstructionModifier &mod, |
620 | ngen::RegData dst, ngen::RegData src0, uint16_t src1, |
621 | const EmulationStrategy &strategy, const EmulationState &state) { |
622 | using namespace ngen; |
623 | const auto &temp = state.temp; |
624 | |
625 | applyDefaultType<DT>(dst); |
626 | applyDefaultType<DT>(src0); |
627 | |
628 | bool dstQ = isQW(dst); |
629 | bool s0Q = isQW(src0); |
630 | |
631 | if (src1 == 0) { |
632 | emov<DT, Generator>(g, mod, dst, src0, strategy); |
633 | return; |
634 | } |
635 | |
636 | if (dstQ && strategy.emulate64 && !strategy.noemulate64_shift) { |
637 | if (src1 >= 32) stub(); |
638 | |
639 | RegData dstHi, dstLo, s0Hi, s0Lo; |
640 | |
641 | auto acc = temp[0].ud(); |
642 | |
643 | splitToDW(dst, dstLo, dstHi); |
644 | |
645 | if (s0Q) { |
646 | splitToDW(dst, s0Lo, s0Hi); |
647 | |
648 | g.shr(mod, acc, s0Lo, uint16_t(32 - src1)); |
649 | g.shl(mod, dstHi, s0Hi, src1); |
650 | g.shl(mod, dstLo, s0Lo, src1); |
651 | g.or_(mod, dstHi, acc, dstHi); |
652 | } else { |
653 | dstHi.setType(DataType::ud); |
654 | g.shl(mod, dstLo, src0, src1); |
655 | g.shr(mod, dstHi, src0, uint16_t(32 - src1)); |
656 | } |
657 | } else { |
658 | if (s0Q && !dstQ) downgradeToDW(src0); |
659 | g.shl(mod, dst, src0, src1); |
660 | } |
661 | } |
662 | |
663 | // Shift right, emulating 64-bit arithmetic if configured. |
664 | template <typename DT = void, typename Generator> |
665 | static void eshr(Generator &g, const ngen::InstructionModifier &mod, |
666 | ngen::RegData dst, ngen::RegData src0, uint16_t src1, |
667 | const EmulationStrategy &strategy, const EmulationState &state) { |
668 | using namespace ngen; |
669 | const auto &temp = state.temp; |
670 | |
671 | applyDefaultType<DT>(dst); |
672 | applyDefaultType<DT>(src0); |
673 | |
674 | bool dstQ = isQW(dst); |
675 | bool s0Q = isQW(src0); |
676 | |
677 | if (src1 == 0) { |
678 | emov<DT, Generator>(g, mod, dst, src0, strategy); |
679 | return; |
680 | } |
681 | |
682 | if (dstQ && strategy.emulate64 && !strategy.noemulate64_shift) { |
683 | if (src1 >= 32) stub(); |
684 | |
685 | RegData dstHi, dstLo, s0Hi, s0Lo; |
686 | |
687 | auto acc = temp[0].ud(); |
688 | |
689 | splitToDW(dst, dstLo, dstHi); |
690 | |
691 | if (s0Q) { |
692 | splitToDW(dst, s0Lo, s0Hi); |
693 | |
694 | g.shl(mod, acc, s0Lo, uint16_t(32 - src1)); |
695 | g.shr(mod, dstLo, s0Lo, src1); |
696 | isSigned(src0.getType()) ? g.asr(mod, dstHi, s0Hi, src1) |
697 | : g.shr(mod, dstHi, s0Hi, src1); |
698 | g.or_(mod, dstLo, acc, dstLo); |
699 | } else { |
700 | dstLo.setType(dstHi.getType()); |
701 | isSigned(src0.getType()) ? g.asr(mod, dstLo, src0, src1) |
702 | : g.shr(mod, dstLo, src0, src1); |
703 | g.mov(mod, dstHi, uint16_t(0)); |
704 | } |
705 | } else { |
706 | if (s0Q && !dstQ) downgradeToDW(src0); |
707 | isSigned(src0.getType()) ? g.asr(mod, dst, src0, src1) |
708 | : g.shr(mod, dst, src0, src1); |
709 | } |
710 | } |
711 | |
712 | // Multiply by a constant, optimizing for power-of-2 constants and emulating 64-bit arithmetic if configured. |
713 | template <typename DT = void, typename Generator> |
714 | static void emulConstant(Generator &g, const ngen::InstructionModifier &mod, |
715 | const ngen::RegData &dst, const ngen::RegData &src0, int32_t src1, |
716 | const EmulationStrategy &strategy, const EmulationState &state) { |
717 | if (src1 == 0) |
718 | emov<DT>(g, mod, dst, uint16_t(0), strategy); |
719 | else if (src1 == 1) { |
720 | if (dst != src0) emov<DT>(g, mod, dst, src0, strategy); |
721 | } else if (ngen::utils::is_zero_or_pow2(src1)) |
722 | eshl<DT>(g, mod, dst, src0, uint16_t(ngen::utils::log2(src1)), |
723 | strategy, state); |
724 | else if (src1 > 0) |
725 | emul<DT>(g, mod, dst, src0, uint32_t(src1), strategy, state); |
726 | else |
727 | emul<DT>(g, mod, dst, src0, int32_t(src1), strategy, state); |
728 | } |
}; // struct EmulationImplementation
730 | |
731 | } // namespace jit |
732 | } // namespace gpu |
733 | } // namespace impl |
734 | } // namespace dnnl |
735 | |
// Declares member-function forwarders for the emulation routines.
//
// Expand this macro inside the body of a code-generator class. Each generated
// member (emov, eadd, emul, eshl, eshr, emulConstant, emul32High) calls the
// EmulationImplementation routine of the same name, passing *this as the
// generator. EmulationStrategy, EmulationState and EmulationImplementation
// are referenced unqualified, so they must be visible where the macro is
// expanded.
#define EMULATION_FORWARD \
    template <typename DT = void> \
    void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::RegData src0, const EmulationStrategy &strategy) { \
        EmulationImplementation::emov<DT>(*this, mod, dst, src0, strategy); \
    } \
    template <typename DT = void> \
    void emov(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::Immediate src0, const EmulationStrategy &strategy) { \
        EmulationImplementation::emov<DT>(*this, mod, dst, src0, strategy); \
    } \
    template <typename DT = void> \
    void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, const ngen::RegData &src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eadd<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void eadd(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, ngen::Immediate src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eadd<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, const ngen::RegData &src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::emul<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void emul(const ngen::InstructionModifier &mod, const ngen::RegData &dst, \
            const ngen::RegData &src0, ngen::Immediate src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::emul<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void eshl(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::RegData src0, uint16_t src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eshl<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void eshr(const ngen::InstructionModifier &mod, ngen::RegData dst, \
            ngen::RegData src0, uint16_t src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::eshr<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename DT = void> \
    void emulConstant(const ngen::InstructionModifier &mod, \
            const ngen::RegData &dst, const ngen::RegData &src0, int32_t src1, \
            const EmulationStrategy &strategy, const EmulationState &state) { \
        EmulationImplementation::emulConstant<DT>( \
                *this, mod, dst, src0, src1, strategy, state); \
    } \
    template <typename S1> \
    void emul32High(const ngen::InstructionModifier &mod, \
            const ngen::RegData &dstHi, const ngen::RegData &src0, \
            const S1 &src1) { \
        EmulationImplementation::emul32High(*this, mod, dstHi, src0, src1); \
    }
802 | |
803 | #endif |
804 | |