1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 * This source code is licensed under the BSD-style license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7#include "./FbgemmFP16UKernelsAvx512.h"
8#include "./InlineAsmDefines.h"
9
10namespace fbgemm {
11
12void NOINLINE gemmkernel_1x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
13 asm volatile(
14#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
15 "mov %[gp], %%r14\t\n"
16 ".intel_syntax noprefix\t\n"
17#else
18 "mov r14, %[gp]\t\n"
19#endif
20
21 // Copy parameters
22 // k
23 "mov r8, [r14 + 0]\t\n"
24 "dec r8\t\n"
25 // A
26 "mov r9, [r14 + 8]\t\n"
27 // B
28 "mov r10, [r14 + 16]\t\n"
29 // beta
30 "lea r15, [r14 + 24]\t\n"
31 // C
32 "mov r12, [r14 + 32]\t\n"
33 // ldc
34 "mov r13, [r14 + 40]\t\n"
35 // b_block_cols
36 "mov rdi, [r14 + 48]\t\n"
37 // b_block_size
38 "mov rsi, [r14 + 56]\t\n"
39
40 // Make copies of A and C
41 "mov rax, r9\t\n"
42 "mov rcx, r12\t\n"
43
44 "xor ebx, ebx\t\n"
45 "loop_outter%=:\t\n"
46 "mov r14, r8\t\n"
47 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
48 "vcvtph2ps zmm3,YMMWORD PTR [r10 + 0]\t\n"
49 "vcvtph2ps zmm4,YMMWORD PTR [r10 + 32]\t\n"
50 "vxorps xmm0, xmm0, xmm0\t\n"
51 "vcomiss xmm31, xmm0\t\n"
52 "jz zero_regs%=\t\n"
53
54 // Setup values with beta multiplication
55 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
56 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
57 "test r14,r14\t\n"
58 "jz skip_preload%=\t\n"
59 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
60 "skip_preload%=:\t\n"
61 "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n"
62 "vfmadd231ps zmm0,zmm3,zmm2\t\n"
63 "vfmadd231ps zmm1,zmm4,zmm2\t\n"
64 "test r14,r14\t\n"
65 "jnz next_inner%=\t\n"
66 "add r10,64\t\n"
67 "jmp dump_C%=\t\n"
68
69 "zero_regs%=:\t\n"
70
71 "test r14,r14\t\n"
72 "jz skip_preload_b_zero%=\t\n"
73 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
74 "skip_preload_b_zero%=:\t\n"
75 "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n"
76 "vmulps zmm0,zmm3,zmm2\t\n"
77 "vmulps zmm1,zmm4,zmm2\t\n"
78 "test r14,r14\t\n"
79 "jnz next_inner%=\t\n"
80 "add r10,64\t\n"
81 "jmp dump_C%=\t\n"
82
83 "loop_inner%=:\t\n"
84
85 "vmovaps zmm3,zmm31\t\n"
86 "vcvtph2ps zmm4,YMMWORD PTR [r10 + 32]\t\n"
87 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
88 "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n"
89 "vfmadd231ps zmm0,zmm3,zmm2\t\n"
90 "vfmadd231ps zmm1,zmm4,zmm2\t\n"
91
92 "next_inner%=:\t\n"
93 "add r9,4\t\n"
94 "add r10,64\t\n"
95 "dec r14\t\n"
96 "jnz loop_inner%=\t\n"
97
98 "vmovaps zmm3,zmm31\t\n"
99 "vcvtph2ps zmm4,YMMWORD PTR [r10 + 32]\t\n"
100 "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n"
101 "vfmadd231ps zmm0,zmm3,zmm2\t\n"
102 "vfmadd231ps zmm1,zmm4,zmm2\t\n"
103 "add r9,4\t\n"
104 "add r10,64\t\n"
105 // Dump C
106 "dump_C%=:\t\n"
107 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
108 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
109
110 // next outer iteration
111 "add rcx, 128\t\n"
112 "mov r12, rcx\t\n"
113 "mov r9, rax\t\n"
114 "inc rbx\t\n"
115 "cmp rbx, rdi\t\n"
116 "jl loop_outter%=\t\n"
117 :
118 : [gp] "rm"(gp)
119 : "r8",
120 "r9",
121 "r10",
122 "r11",
123 "r13",
124 "r14",
125 "rax",
126 "rcx",
127 "rsi",
128 "rdi",
129 "rbx",
130 "r12",
131 "r15",
132 "memory");
133}
134void NOINLINE gemmkernel_2x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
135 asm volatile(
136#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
137 "mov %[gp], %%r14\t\n"
138 ".intel_syntax noprefix\t\n"
139#else
140 "mov r14, %[gp]\t\n"
141#endif
142
143 // Copy parameters
144 // k
145 "mov r8, [r14 + 0]\t\n"
146 "dec r8\t\n"
147 // A
148 "mov r9, [r14 + 8]\t\n"
149 // B
150 "mov r10, [r14 + 16]\t\n"
151 // beta
152 "lea r15, [r14 + 24]\t\n"
153 // C
154 "mov r12, [r14 + 32]\t\n"
155 // ldc
156 "mov r13, [r14 + 40]\t\n"
157 // b_block_cols
158 "mov rdi, [r14 + 48]\t\n"
159 // b_block_size
160 "mov rsi, [r14 + 56]\t\n"
161
162 // Make copies of A and C
163 "mov rax, r9\t\n"
164 "mov rcx, r12\t\n"
165
166 "xor ebx, ebx\t\n"
167 "loop_outter%=:\t\n"
168 "mov r14, r8\t\n"
169 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
170 "vcvtph2ps zmm5,YMMWORD PTR [r10 + 0]\t\n"
171 "vcvtph2ps zmm6,YMMWORD PTR [r10 + 32]\t\n"
172 "vxorps xmm0, xmm0, xmm0\t\n"
173 "vcomiss xmm31, xmm0\t\n"
174 "jz zero_regs%=\t\n"
175
176 // Setup values with beta multiplication
177 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
178 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
179 "add r12, r13\t\n"
180 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
181 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
182 "test r14,r14\t\n"
183 "jz skip_preload%=\t\n"
184 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
185 "skip_preload%=:\t\n"
186 "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n"
187 "vfmadd231ps zmm0,zmm5,zmm4\t\n"
188 "vfmadd231ps zmm1,zmm6,zmm4\t\n"
189 "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n"
190 "vfmadd231ps zmm2,zmm5,zmm4\t\n"
191 "vfmadd231ps zmm3,zmm6,zmm4\t\n"
192 "mov r12, rcx\t\n"
193 "test r14,r14\t\n"
194 "jnz next_inner%=\t\n"
195 "add r10,64\t\n"
196 "jmp dump_C%=\t\n"
197
198 "zero_regs%=:\t\n"
199
200 "test r14,r14\t\n"
201 "jz skip_preload_b_zero%=\t\n"
202 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
203 "skip_preload_b_zero%=:\t\n"
204 "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n"
205 "vmulps zmm0,zmm5,zmm4\t\n"
206 "vmulps zmm1,zmm6,zmm4\t\n"
207 "add r12, r13\t\n"
208 "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n"
209 "vmulps zmm2,zmm5,zmm4\t\n"
210 "vmulps zmm3,zmm6,zmm4\t\n"
211 "mov r12, rcx\t\n"
212 "test r14,r14\t\n"
213 "jnz next_inner%=\t\n"
214 "add r10,64\t\n"
215 "jmp dump_C%=\t\n"
216
217 "loop_inner%=:\t\n"
218
219 "vmovaps zmm5,zmm31\t\n"
220 "vcvtph2ps zmm6,YMMWORD PTR [r10 + 32]\t\n"
221 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
222 "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n"
223 "vfmadd231ps zmm0,zmm5,zmm4\t\n"
224 "vfmadd231ps zmm1,zmm6,zmm4\t\n"
225 "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n"
226 "vfmadd231ps zmm2,zmm5,zmm4\t\n"
227 "vfmadd231ps zmm3,zmm6,zmm4\t\n"
228
229 "next_inner%=:\t\n"
230 "add r9,8\t\n"
231 "add r10,64\t\n"
232 "dec r14\t\n"
233 "jnz loop_inner%=\t\n"
234
235 "vmovaps zmm5,zmm31\t\n"
236 "vcvtph2ps zmm6,YMMWORD PTR [r10 + 32]\t\n"
237 "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n"
238 "vfmadd231ps zmm0,zmm5,zmm4\t\n"
239 "vfmadd231ps zmm1,zmm6,zmm4\t\n"
240 "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n"
241 "vfmadd231ps zmm2,zmm5,zmm4\t\n"
242 "vfmadd231ps zmm3,zmm6,zmm4\t\n"
243 "add r9,8\t\n"
244 "add r10,64\t\n"
245 // Dump C
246 "dump_C%=:\t\n"
247 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
248 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
249 "add r12, r13\t\n"
250 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
251 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
252
253 // next outer iteration
254 "add rcx, 128\t\n"
255 "mov r12, rcx\t\n"
256 "mov r9, rax\t\n"
257 "inc rbx\t\n"
258 "cmp rbx, rdi\t\n"
259 "jl loop_outter%=\t\n"
260 :
261 : [gp] "rm"(gp)
262 : "r8",
263 "r9",
264 "r10",
265 "r11",
266 "r13",
267 "r14",
268 "rax",
269 "rcx",
270 "rsi",
271 "rdi",
272 "rbx",
273 "r12",
274 "r15",
275 "memory");
276}
277void NOINLINE gemmkernel_3x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
278 asm volatile(
279#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
280 "mov %[gp], %%r14\t\n"
281 ".intel_syntax noprefix\t\n"
282#else
283 "mov r14, %[gp]\t\n"
284#endif
285
286 // Copy parameters
287 // k
288 "mov r8, [r14 + 0]\t\n"
289 "dec r8\t\n"
290 // A
291 "mov r9, [r14 + 8]\t\n"
292 // B
293 "mov r10, [r14 + 16]\t\n"
294 // beta
295 "lea r15, [r14 + 24]\t\n"
296 // C
297 "mov r12, [r14 + 32]\t\n"
298 // ldc
299 "mov r13, [r14 + 40]\t\n"
300 // b_block_cols
301 "mov rdi, [r14 + 48]\t\n"
302 // b_block_size
303 "mov rsi, [r14 + 56]\t\n"
304
305 // Make copies of A and C
306 "mov rax, r9\t\n"
307 "mov rcx, r12\t\n"
308
309 "xor ebx, ebx\t\n"
310 "loop_outter%=:\t\n"
311 "mov r14, r8\t\n"
312 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
313 "vcvtph2ps zmm7,YMMWORD PTR [r10 + 0]\t\n"
314 "vcvtph2ps zmm8,YMMWORD PTR [r10 + 32]\t\n"
315 "vxorps xmm0, xmm0, xmm0\t\n"
316 "vcomiss xmm31, xmm0\t\n"
317 "jz zero_regs%=\t\n"
318
319 // Setup values with beta multiplication
320 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
321 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
322 "add r12, r13\t\n"
323 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
324 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
325 "add r12, r13\t\n"
326 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
327 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
328 "test r14,r14\t\n"
329 "jz skip_preload%=\t\n"
330 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
331 "skip_preload%=:\t\n"
332 "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n"
333 "vfmadd231ps zmm0,zmm7,zmm6\t\n"
334 "vfmadd231ps zmm1,zmm8,zmm6\t\n"
335 "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n"
336 "vfmadd231ps zmm2,zmm7,zmm6\t\n"
337 "vfmadd231ps zmm3,zmm8,zmm6\t\n"
338 "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n"
339 "vfmadd231ps zmm4,zmm7,zmm6\t\n"
340 "vfmadd231ps zmm5,zmm8,zmm6\t\n"
341 "mov r12, rcx\t\n"
342 "test r14,r14\t\n"
343 "jnz next_inner%=\t\n"
344 "add r10,64\t\n"
345 "jmp dump_C%=\t\n"
346
347 "zero_regs%=:\t\n"
348
349 "test r14,r14\t\n"
350 "jz skip_preload_b_zero%=\t\n"
351 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
352 "skip_preload_b_zero%=:\t\n"
353 "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n"
354 "vmulps zmm0,zmm7,zmm6\t\n"
355 "vmulps zmm1,zmm8,zmm6\t\n"
356 "add r12, r13\t\n"
357 "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n"
358 "vmulps zmm2,zmm7,zmm6\t\n"
359 "vmulps zmm3,zmm8,zmm6\t\n"
360 "add r12, r13\t\n"
361 "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n"
362 "vmulps zmm4,zmm7,zmm6\t\n"
363 "vmulps zmm5,zmm8,zmm6\t\n"
364 "mov r12, rcx\t\n"
365 "test r14,r14\t\n"
366 "jnz next_inner%=\t\n"
367 "add r10,64\t\n"
368 "jmp dump_C%=\t\n"
369
370 "loop_inner%=:\t\n"
371
372 "vmovaps zmm7,zmm31\t\n"
373 "vcvtph2ps zmm8,YMMWORD PTR [r10 + 32]\t\n"
374 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
375 "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n"
376 "vfmadd231ps zmm0,zmm7,zmm6\t\n"
377 "vfmadd231ps zmm1,zmm8,zmm6\t\n"
378 "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n"
379 "vfmadd231ps zmm2,zmm7,zmm6\t\n"
380 "vfmadd231ps zmm3,zmm8,zmm6\t\n"
381 "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n"
382 "vfmadd231ps zmm4,zmm7,zmm6\t\n"
383 "vfmadd231ps zmm5,zmm8,zmm6\t\n"
384
385 "next_inner%=:\t\n"
386 "add r9,12\t\n"
387 "add r10,64\t\n"
388 "dec r14\t\n"
389 "jnz loop_inner%=\t\n"
390
391 "vmovaps zmm7,zmm31\t\n"
392 "vcvtph2ps zmm8,YMMWORD PTR [r10 + 32]\t\n"
393 "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n"
394 "vfmadd231ps zmm0,zmm7,zmm6\t\n"
395 "vfmadd231ps zmm1,zmm8,zmm6\t\n"
396 "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n"
397 "vfmadd231ps zmm2,zmm7,zmm6\t\n"
398 "vfmadd231ps zmm3,zmm8,zmm6\t\n"
399 "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n"
400 "vfmadd231ps zmm4,zmm7,zmm6\t\n"
401 "vfmadd231ps zmm5,zmm8,zmm6\t\n"
402 "add r9,12\t\n"
403 "add r10,64\t\n"
404 // Dump C
405 "dump_C%=:\t\n"
406 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
407 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
408 "add r12, r13\t\n"
409 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
410 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
411 "add r12, r13\t\n"
412 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
413 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
414
415 // next outer iteration
416 "add rcx, 128\t\n"
417 "mov r12, rcx\t\n"
418 "mov r9, rax\t\n"
419 "inc rbx\t\n"
420 "cmp rbx, rdi\t\n"
421 "jl loop_outter%=\t\n"
422 :
423 : [gp] "rm"(gp)
424 : "r8",
425 "r9",
426 "r10",
427 "r11",
428 "r13",
429 "r14",
430 "rax",
431 "rcx",
432 "rsi",
433 "rdi",
434 "rbx",
435 "r12",
436 "r15",
437 "memory");
438}
439void NOINLINE gemmkernel_4x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
440 asm volatile(
441#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
442 "mov %[gp], %%r14\t\n"
443 ".intel_syntax noprefix\t\n"
444#else
445 "mov r14, %[gp]\t\n"
446#endif
447
448 // Copy parameters
449 // k
450 "mov r8, [r14 + 0]\t\n"
451 "dec r8\t\n"
452 // A
453 "mov r9, [r14 + 8]\t\n"
454 // B
455 "mov r10, [r14 + 16]\t\n"
456 // beta
457 "lea r15, [r14 + 24]\t\n"
458 // C
459 "mov r12, [r14 + 32]\t\n"
460 // ldc
461 "mov r13, [r14 + 40]\t\n"
462 // b_block_cols
463 "mov rdi, [r14 + 48]\t\n"
464 // b_block_size
465 "mov rsi, [r14 + 56]\t\n"
466
467 // Make copies of A and C
468 "mov rax, r9\t\n"
469 "mov rcx, r12\t\n"
470
471 "xor ebx, ebx\t\n"
472 "loop_outter%=:\t\n"
473 "mov r14, r8\t\n"
474 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
475 "vcvtph2ps zmm9,YMMWORD PTR [r10 + 0]\t\n"
476 "vcvtph2ps zmm10,YMMWORD PTR [r10 + 32]\t\n"
477 "vxorps xmm0, xmm0, xmm0\t\n"
478 "vcomiss xmm31, xmm0\t\n"
479 "jz zero_regs%=\t\n"
480
481 // Setup values with beta multiplication
482 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
483 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
484 "add r12, r13\t\n"
485 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
486 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
487 "add r12, r13\t\n"
488 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
489 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
490 "add r12, r13\t\n"
491 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
492 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
493 "test r14,r14\t\n"
494 "jz skip_preload%=\t\n"
495 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
496 "skip_preload%=:\t\n"
497 "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n"
498 "vfmadd231ps zmm0,zmm9,zmm8\t\n"
499 "vfmadd231ps zmm1,zmm10,zmm8\t\n"
500 "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n"
501 "vfmadd231ps zmm2,zmm9,zmm8\t\n"
502 "vfmadd231ps zmm3,zmm10,zmm8\t\n"
503 "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n"
504 "vfmadd231ps zmm4,zmm9,zmm8\t\n"
505 "vfmadd231ps zmm5,zmm10,zmm8\t\n"
506 "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n"
507 "vfmadd231ps zmm6,zmm9,zmm8\t\n"
508 "vfmadd231ps zmm7,zmm10,zmm8\t\n"
509 "mov r12, rcx\t\n"
510 "test r14,r14\t\n"
511 "jnz next_inner%=\t\n"
512 "add r10,64\t\n"
513 "jmp dump_C%=\t\n"
514
515 "zero_regs%=:\t\n"
516
517 "test r14,r14\t\n"
518 "jz skip_preload_b_zero%=\t\n"
519 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
520 "skip_preload_b_zero%=:\t\n"
521 "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n"
522 "vmulps zmm0,zmm9,zmm8\t\n"
523 "vmulps zmm1,zmm10,zmm8\t\n"
524 "add r12, r13\t\n"
525 "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n"
526 "vmulps zmm2,zmm9,zmm8\t\n"
527 "vmulps zmm3,zmm10,zmm8\t\n"
528 "add r12, r13\t\n"
529 "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n"
530 "vmulps zmm4,zmm9,zmm8\t\n"
531 "vmulps zmm5,zmm10,zmm8\t\n"
532 "add r12, r13\t\n"
533 "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n"
534 "vmulps zmm6,zmm9,zmm8\t\n"
535 "vmulps zmm7,zmm10,zmm8\t\n"
536 "mov r12, rcx\t\n"
537 "test r14,r14\t\n"
538 "jnz next_inner%=\t\n"
539 "add r10,64\t\n"
540 "jmp dump_C%=\t\n"
541
542 "loop_inner%=:\t\n"
543
544 "vmovaps zmm9,zmm31\t\n"
545 "vcvtph2ps zmm10,YMMWORD PTR [r10 + 32]\t\n"
546 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
547 "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n"
548 "vfmadd231ps zmm0,zmm9,zmm8\t\n"
549 "vfmadd231ps zmm1,zmm10,zmm8\t\n"
550 "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n"
551 "vfmadd231ps zmm2,zmm9,zmm8\t\n"
552 "vfmadd231ps zmm3,zmm10,zmm8\t\n"
553 "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n"
554 "vfmadd231ps zmm4,zmm9,zmm8\t\n"
555 "vfmadd231ps zmm5,zmm10,zmm8\t\n"
556 "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n"
557 "vfmadd231ps zmm6,zmm9,zmm8\t\n"
558 "vfmadd231ps zmm7,zmm10,zmm8\t\n"
559
560 "next_inner%=:\t\n"
561 "add r9,16\t\n"
562 "add r10,64\t\n"
563 "dec r14\t\n"
564 "jnz loop_inner%=\t\n"
565
566 "vmovaps zmm9,zmm31\t\n"
567 "vcvtph2ps zmm10,YMMWORD PTR [r10 + 32]\t\n"
568 "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n"
569 "vfmadd231ps zmm0,zmm9,zmm8\t\n"
570 "vfmadd231ps zmm1,zmm10,zmm8\t\n"
571 "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n"
572 "vfmadd231ps zmm2,zmm9,zmm8\t\n"
573 "vfmadd231ps zmm3,zmm10,zmm8\t\n"
574 "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n"
575 "vfmadd231ps zmm4,zmm9,zmm8\t\n"
576 "vfmadd231ps zmm5,zmm10,zmm8\t\n"
577 "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n"
578 "vfmadd231ps zmm6,zmm9,zmm8\t\n"
579 "vfmadd231ps zmm7,zmm10,zmm8\t\n"
580 "add r9,16\t\n"
581 "add r10,64\t\n"
582 // Dump C
583 "dump_C%=:\t\n"
584 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
585 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
586 "add r12, r13\t\n"
587 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
588 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
589 "add r12, r13\t\n"
590 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
591 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
592 "add r12, r13\t\n"
593 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
594 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
595
596 // next outer iteration
597 "add rcx, 128\t\n"
598 "mov r12, rcx\t\n"
599 "mov r9, rax\t\n"
600 "inc rbx\t\n"
601 "cmp rbx, rdi\t\n"
602 "jl loop_outter%=\t\n"
603 :
604 : [gp] "rm"(gp)
605 : "r8",
606 "r9",
607 "r10",
608 "r11",
609 "r13",
610 "r14",
611 "rax",
612 "rcx",
613 "rsi",
614 "rdi",
615 "rbx",
616 "r12",
617 "r15",
618 "memory");
619}
620void NOINLINE gemmkernel_5x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
621 asm volatile(
622#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
623 "mov %[gp], %%r14\t\n"
624 ".intel_syntax noprefix\t\n"
625#else
626 "mov r14, %[gp]\t\n"
627#endif
628
629 // Copy parameters
630 // k
631 "mov r8, [r14 + 0]\t\n"
632 "dec r8\t\n"
633 // A
634 "mov r9, [r14 + 8]\t\n"
635 // B
636 "mov r10, [r14 + 16]\t\n"
637 // beta
638 "lea r15, [r14 + 24]\t\n"
639 // C
640 "mov r12, [r14 + 32]\t\n"
641 // ldc
642 "mov r13, [r14 + 40]\t\n"
643 // b_block_cols
644 "mov rdi, [r14 + 48]\t\n"
645 // b_block_size
646 "mov rsi, [r14 + 56]\t\n"
647
648 // Make copies of A and C
649 "mov rax, r9\t\n"
650 "mov rcx, r12\t\n"
651
652 "xor ebx, ebx\t\n"
653 "loop_outter%=:\t\n"
654 "mov r14, r8\t\n"
655 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
656 "vcvtph2ps zmm11,YMMWORD PTR [r10 + 0]\t\n"
657 "vcvtph2ps zmm12,YMMWORD PTR [r10 + 32]\t\n"
658 "vxorps xmm0, xmm0, xmm0\t\n"
659 "vcomiss xmm31, xmm0\t\n"
660 "jz zero_regs%=\t\n"
661
662 // Setup values with beta multiplication
663 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
664 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
665 "add r12, r13\t\n"
666 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
667 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
668 "add r12, r13\t\n"
669 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
670 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
671 "add r12, r13\t\n"
672 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
673 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
674 "add r12, r13\t\n"
675 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
676 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
677 "test r14,r14\t\n"
678 "jz skip_preload%=\t\n"
679 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
680 "skip_preload%=:\t\n"
681 "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n"
682 "vfmadd231ps zmm0,zmm11,zmm10\t\n"
683 "vfmadd231ps zmm1,zmm12,zmm10\t\n"
684 "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n"
685 "vfmadd231ps zmm2,zmm11,zmm10\t\n"
686 "vfmadd231ps zmm3,zmm12,zmm10\t\n"
687 "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n"
688 "vfmadd231ps zmm4,zmm11,zmm10\t\n"
689 "vfmadd231ps zmm5,zmm12,zmm10\t\n"
690 "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n"
691 "vfmadd231ps zmm6,zmm11,zmm10\t\n"
692 "vfmadd231ps zmm7,zmm12,zmm10\t\n"
693 "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n"
694 "vfmadd231ps zmm8,zmm11,zmm10\t\n"
695 "vfmadd231ps zmm9,zmm12,zmm10\t\n"
696 "mov r12, rcx\t\n"
697 "test r14,r14\t\n"
698 "jnz next_inner%=\t\n"
699 "add r10,64\t\n"
700 "jmp dump_C%=\t\n"
701
702 "zero_regs%=:\t\n"
703
704 "test r14,r14\t\n"
705 "jz skip_preload_b_zero%=\t\n"
706 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
707 "skip_preload_b_zero%=:\t\n"
708 "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n"
709 "vmulps zmm0,zmm11,zmm10\t\n"
710 "vmulps zmm1,zmm12,zmm10\t\n"
711 "add r12, r13\t\n"
712 "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n"
713 "vmulps zmm2,zmm11,zmm10\t\n"
714 "vmulps zmm3,zmm12,zmm10\t\n"
715 "add r12, r13\t\n"
716 "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n"
717 "vmulps zmm4,zmm11,zmm10\t\n"
718 "vmulps zmm5,zmm12,zmm10\t\n"
719 "add r12, r13\t\n"
720 "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n"
721 "vmulps zmm6,zmm11,zmm10\t\n"
722 "vmulps zmm7,zmm12,zmm10\t\n"
723 "add r12, r13\t\n"
724 "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n"
725 "vmulps zmm8,zmm11,zmm10\t\n"
726 "vmulps zmm9,zmm12,zmm10\t\n"
727 "mov r12, rcx\t\n"
728 "test r14,r14\t\n"
729 "jnz next_inner%=\t\n"
730 "add r10,64\t\n"
731 "jmp dump_C%=\t\n"
732
733 "loop_inner%=:\t\n"
734
735 "vmovaps zmm11,zmm31\t\n"
736 "vcvtph2ps zmm12,YMMWORD PTR [r10 + 32]\t\n"
737 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
738 "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n"
739 "vfmadd231ps zmm0,zmm11,zmm10\t\n"
740 "vfmadd231ps zmm1,zmm12,zmm10\t\n"
741 "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n"
742 "vfmadd231ps zmm2,zmm11,zmm10\t\n"
743 "vfmadd231ps zmm3,zmm12,zmm10\t\n"
744 "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n"
745 "vfmadd231ps zmm4,zmm11,zmm10\t\n"
746 "vfmadd231ps zmm5,zmm12,zmm10\t\n"
747 "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n"
748 "vfmadd231ps zmm6,zmm11,zmm10\t\n"
749 "vfmadd231ps zmm7,zmm12,zmm10\t\n"
750 "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n"
751 "vfmadd231ps zmm8,zmm11,zmm10\t\n"
752 "vfmadd231ps zmm9,zmm12,zmm10\t\n"
753
754 "next_inner%=:\t\n"
755 "add r9,20\t\n"
756 "add r10,64\t\n"
757 "dec r14\t\n"
758 "jnz loop_inner%=\t\n"
759
760 "vmovaps zmm11,zmm31\t\n"
761 "vcvtph2ps zmm12,YMMWORD PTR [r10 + 32]\t\n"
762 "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n"
763 "vfmadd231ps zmm0,zmm11,zmm10\t\n"
764 "vfmadd231ps zmm1,zmm12,zmm10\t\n"
765 "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n"
766 "vfmadd231ps zmm2,zmm11,zmm10\t\n"
767 "vfmadd231ps zmm3,zmm12,zmm10\t\n"
768 "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n"
769 "vfmadd231ps zmm4,zmm11,zmm10\t\n"
770 "vfmadd231ps zmm5,zmm12,zmm10\t\n"
771 "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n"
772 "vfmadd231ps zmm6,zmm11,zmm10\t\n"
773 "vfmadd231ps zmm7,zmm12,zmm10\t\n"
774 "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n"
775 "vfmadd231ps zmm8,zmm11,zmm10\t\n"
776 "vfmadd231ps zmm9,zmm12,zmm10\t\n"
777 "add r9,20\t\n"
778 "add r10,64\t\n"
779 // Dump C
780 "dump_C%=:\t\n"
781 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
782 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
783 "add r12, r13\t\n"
784 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
785 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
786 "add r12, r13\t\n"
787 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
788 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
789 "add r12, r13\t\n"
790 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
791 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
792 "add r12, r13\t\n"
793 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
794 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
795
796 // next outer iteration
797 "add rcx, 128\t\n"
798 "mov r12, rcx\t\n"
799 "mov r9, rax\t\n"
800 "inc rbx\t\n"
801 "cmp rbx, rdi\t\n"
802 "jl loop_outter%=\t\n"
803 :
804 : [gp] "rm"(gp)
805 : "r8",
806 "r9",
807 "r10",
808 "r11",
809 "r13",
810 "r14",
811 "rax",
812 "rcx",
813 "rsi",
814 "rdi",
815 "rbx",
816 "r12",
817 "r15",
818 "memory");
819}
820void NOINLINE gemmkernel_6x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
821 asm volatile(
822#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
823 "mov %[gp], %%r14\t\n"
824 ".intel_syntax noprefix\t\n"
825#else
826 "mov r14, %[gp]\t\n"
827#endif
828
829 // Copy parameters
830 // k
831 "mov r8, [r14 + 0]\t\n"
832 "dec r8\t\n"
833 // A
834 "mov r9, [r14 + 8]\t\n"
835 // B
836 "mov r10, [r14 + 16]\t\n"
837 // beta
838 "lea r15, [r14 + 24]\t\n"
839 // C
840 "mov r12, [r14 + 32]\t\n"
841 // ldc
842 "mov r13, [r14 + 40]\t\n"
843 // b_block_cols
844 "mov rdi, [r14 + 48]\t\n"
845 // b_block_size
846 "mov rsi, [r14 + 56]\t\n"
847
848 // Make copies of A and C
849 "mov rax, r9\t\n"
850 "mov rcx, r12\t\n"
851
852 "xor ebx, ebx\t\n"
853 "loop_outter%=:\t\n"
854 "mov r14, r8\t\n"
855 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
856 "vcvtph2ps zmm13,YMMWORD PTR [r10 + 0]\t\n"
857 "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n"
858 "vxorps xmm0, xmm0, xmm0\t\n"
859 "vcomiss xmm31, xmm0\t\n"
860 "jz zero_regs%=\t\n"
861
862 // Setup values with beta multiplication
863 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
864 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
865 "add r12, r13\t\n"
866 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
867 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
868 "add r12, r13\t\n"
869 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
870 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
871 "add r12, r13\t\n"
872 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
873 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
874 "add r12, r13\t\n"
875 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
876 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
877 "add r12, r13\t\n"
878 "vmulps zmm10, zmm31, [r12 + 0]\t\n"
879 "vmulps zmm11, zmm31, [r12 + 64]\t\n"
880 "test r14,r14\t\n"
881 "jz skip_preload%=\t\n"
882 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
883 "skip_preload%=:\t\n"
884 "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
885 "vfmadd231ps zmm0,zmm13,zmm12\t\n"
886 "vfmadd231ps zmm1,zmm14,zmm12\t\n"
887 "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
888 "vfmadd231ps zmm2,zmm13,zmm12\t\n"
889 "vfmadd231ps zmm3,zmm14,zmm12\t\n"
890 "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
891 "vfmadd231ps zmm4,zmm13,zmm12\t\n"
892 "vfmadd231ps zmm5,zmm14,zmm12\t\n"
893 "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
894 "vfmadd231ps zmm6,zmm13,zmm12\t\n"
895 "vfmadd231ps zmm7,zmm14,zmm12\t\n"
896 "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
897 "vfmadd231ps zmm8,zmm13,zmm12\t\n"
898 "vfmadd231ps zmm9,zmm14,zmm12\t\n"
899 "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
900 "vfmadd231ps zmm10,zmm13,zmm12\t\n"
901 "vfmadd231ps zmm11,zmm14,zmm12\t\n"
902 "mov r12, rcx\t\n"
903 "test r14,r14\t\n"
904 "jnz next_inner%=\t\n"
905 "add r10,64\t\n"
906 "jmp dump_C%=\t\n"
907
908 "zero_regs%=:\t\n"
909
910 "test r14,r14\t\n"
911 "jz skip_preload_b_zero%=\t\n"
912 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
913 "skip_preload_b_zero%=:\t\n"
914 "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
915 "vmulps zmm0,zmm13,zmm12\t\n"
916 "vmulps zmm1,zmm14,zmm12\t\n"
917 "add r12, r13\t\n"
918 "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
919 "vmulps zmm2,zmm13,zmm12\t\n"
920 "vmulps zmm3,zmm14,zmm12\t\n"
921 "add r12, r13\t\n"
922 "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
923 "vmulps zmm4,zmm13,zmm12\t\n"
924 "vmulps zmm5,zmm14,zmm12\t\n"
925 "add r12, r13\t\n"
926 "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
927 "vmulps zmm6,zmm13,zmm12\t\n"
928 "vmulps zmm7,zmm14,zmm12\t\n"
929 "add r12, r13\t\n"
930 "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
931 "vmulps zmm8,zmm13,zmm12\t\n"
932 "vmulps zmm9,zmm14,zmm12\t\n"
933 "add r12, r13\t\n"
934 "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
935 "vmulps zmm10,zmm13,zmm12\t\n"
936 "vmulps zmm11,zmm14,zmm12\t\n"
937 "mov r12, rcx\t\n"
938 "test r14,r14\t\n"
939 "jnz next_inner%=\t\n"
940 "add r10,64\t\n"
941 "jmp dump_C%=\t\n"
942
943 "loop_inner%=:\t\n"
944
945 "vmovaps zmm13,zmm31\t\n"
946 "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n"
947 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
948 "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
949 "vfmadd231ps zmm0,zmm13,zmm12\t\n"
950 "vfmadd231ps zmm1,zmm14,zmm12\t\n"
951 "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
952 "vfmadd231ps zmm2,zmm13,zmm12\t\n"
953 "vfmadd231ps zmm3,zmm14,zmm12\t\n"
954 "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
955 "vfmadd231ps zmm4,zmm13,zmm12\t\n"
956 "vfmadd231ps zmm5,zmm14,zmm12\t\n"
957 "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
958 "vfmadd231ps zmm6,zmm13,zmm12\t\n"
959 "vfmadd231ps zmm7,zmm14,zmm12\t\n"
960 "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
961 "vfmadd231ps zmm8,zmm13,zmm12\t\n"
962 "vfmadd231ps zmm9,zmm14,zmm12\t\n"
963 "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
964 "vfmadd231ps zmm10,zmm13,zmm12\t\n"
965 "vfmadd231ps zmm11,zmm14,zmm12\t\n"
966
967 "next_inner%=:\t\n"
968 "add r9,24\t\n"
969 "add r10,64\t\n"
970 "dec r14\t\n"
971 "jnz loop_inner%=\t\n"
972
973 "vmovaps zmm13,zmm31\t\n"
974 "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n"
975 "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n"
976 "vfmadd231ps zmm0,zmm13,zmm12\t\n"
977 "vfmadd231ps zmm1,zmm14,zmm12\t\n"
978 "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n"
979 "vfmadd231ps zmm2,zmm13,zmm12\t\n"
980 "vfmadd231ps zmm3,zmm14,zmm12\t\n"
981 "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n"
982 "vfmadd231ps zmm4,zmm13,zmm12\t\n"
983 "vfmadd231ps zmm5,zmm14,zmm12\t\n"
984 "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n"
985 "vfmadd231ps zmm6,zmm13,zmm12\t\n"
986 "vfmadd231ps zmm7,zmm14,zmm12\t\n"
987 "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n"
988 "vfmadd231ps zmm8,zmm13,zmm12\t\n"
989 "vfmadd231ps zmm9,zmm14,zmm12\t\n"
990 "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n"
991 "vfmadd231ps zmm10,zmm13,zmm12\t\n"
992 "vfmadd231ps zmm11,zmm14,zmm12\t\n"
993 "add r9,24\t\n"
994 "add r10,64\t\n"
995 // Dump C
996 "dump_C%=:\t\n"
997 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
998 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
999 "add r12, r13\t\n"
1000 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
1001 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
1002 "add r12, r13\t\n"
1003 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
1004 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
1005 "add r12, r13\t\n"
1006 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
1007 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
1008 "add r12, r13\t\n"
1009 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
1010 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
1011 "add r12, r13\t\n"
1012 "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
1013 "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
1014
1015 // next outer iteration
1016 "add rcx, 128\t\n"
1017 "mov r12, rcx\t\n"
1018 "mov r9, rax\t\n"
1019 "inc rbx\t\n"
1020 "cmp rbx, rdi\t\n"
1021 "jl loop_outter%=\t\n"
1022 :
1023 : [gp] "rm"(gp)
1024 : "r8",
1025 "r9",
1026 "r10",
1027 "r11",
1028 "r13",
1029 "r14",
1030 "rax",
1031 "rcx",
1032 "rsi",
1033 "rdi",
1034 "rbx",
1035 "r12",
1036 "r15",
1037 "memory");
1038}
1039void NOINLINE gemmkernel_7x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
1040 asm volatile(
1041#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
1042 "mov %[gp], %%r14\t\n"
1043 ".intel_syntax noprefix\t\n"
1044#else
1045 "mov r14, %[gp]\t\n"
1046#endif
1047
1048 // Copy parameters
1049 // k
1050 "mov r8, [r14 + 0]\t\n"
1051 "dec r8\t\n"
1052 // A
1053 "mov r9, [r14 + 8]\t\n"
1054 // B
1055 "mov r10, [r14 + 16]\t\n"
1056 // beta
1057 "lea r15, [r14 + 24]\t\n"
1058 // C
1059 "mov r12, [r14 + 32]\t\n"
1060 // ldc
1061 "mov r13, [r14 + 40]\t\n"
1062 // b_block_cols
1063 "mov rdi, [r14 + 48]\t\n"
1064 // b_block_size
1065 "mov rsi, [r14 + 56]\t\n"
1066
1067 // Make copies of A and C
1068 "mov rax, r9\t\n"
1069 "mov rcx, r12\t\n"
1070
1071 "xor ebx, ebx\t\n"
1072 "loop_outter%=:\t\n"
1073 "mov r14, r8\t\n"
1074 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
1075 "vcvtph2ps zmm15,YMMWORD PTR [r10 + 0]\t\n"
1076 "vcvtph2ps zmm16,YMMWORD PTR [r10 + 32]\t\n"
1077 "vxorps xmm0, xmm0, xmm0\t\n"
1078 "vcomiss xmm31, xmm0\t\n"
1079 "jz zero_regs%=\t\n"
1080
1081 // Setup values with beta multiplication
1082 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
1083 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
1084 "add r12, r13\t\n"
1085 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
1086 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
1087 "add r12, r13\t\n"
1088 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
1089 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
1090 "add r12, r13\t\n"
1091 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
1092 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
1093 "add r12, r13\t\n"
1094 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
1095 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
1096 "add r12, r13\t\n"
1097 "vmulps zmm10, zmm31, [r12 + 0]\t\n"
1098 "vmulps zmm11, zmm31, [r12 + 64]\t\n"
1099 "add r12, r13\t\n"
1100 "vmulps zmm12, zmm31, [r12 + 0]\t\n"
1101 "vmulps zmm13, zmm31, [r12 + 64]\t\n"
1102 "test r14,r14\t\n"
1103 "jz skip_preload%=\t\n"
1104 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
1105 "skip_preload%=:\t\n"
1106 "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n"
1107 "vfmadd231ps zmm0,zmm15,zmm14\t\n"
1108 "vfmadd231ps zmm1,zmm16,zmm14\t\n"
1109 "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n"
1110 "vfmadd231ps zmm2,zmm15,zmm14\t\n"
1111 "vfmadd231ps zmm3,zmm16,zmm14\t\n"
1112 "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n"
1113 "vfmadd231ps zmm4,zmm15,zmm14\t\n"
1114 "vfmadd231ps zmm5,zmm16,zmm14\t\n"
1115 "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n"
1116 "vfmadd231ps zmm6,zmm15,zmm14\t\n"
1117 "vfmadd231ps zmm7,zmm16,zmm14\t\n"
1118 "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n"
1119 "vfmadd231ps zmm8,zmm15,zmm14\t\n"
1120 "vfmadd231ps zmm9,zmm16,zmm14\t\n"
1121 "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n"
1122 "vfmadd231ps zmm10,zmm15,zmm14\t\n"
1123 "vfmadd231ps zmm11,zmm16,zmm14\t\n"
1124 "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n"
1125 "vfmadd231ps zmm12,zmm15,zmm14\t\n"
1126 "vfmadd231ps zmm13,zmm16,zmm14\t\n"
1127 "mov r12, rcx\t\n"
1128 "test r14,r14\t\n"
1129 "jnz next_inner%=\t\n"
1130 "add r10,64\t\n"
1131 "jmp dump_C%=\t\n"
1132
1133 "zero_regs%=:\t\n"
1134
1135 "test r14,r14\t\n"
1136 "jz skip_preload_b_zero%=\t\n"
1137 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
1138 "skip_preload_b_zero%=:\t\n"
1139 "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n"
1140 "vmulps zmm0,zmm15,zmm14\t\n"
1141 "vmulps zmm1,zmm16,zmm14\t\n"
1142 "add r12, r13\t\n"
1143 "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n"
1144 "vmulps zmm2,zmm15,zmm14\t\n"
1145 "vmulps zmm3,zmm16,zmm14\t\n"
1146 "add r12, r13\t\n"
1147 "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n"
1148 "vmulps zmm4,zmm15,zmm14\t\n"
1149 "vmulps zmm5,zmm16,zmm14\t\n"
1150 "add r12, r13\t\n"
1151 "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n"
1152 "vmulps zmm6,zmm15,zmm14\t\n"
1153 "vmulps zmm7,zmm16,zmm14\t\n"
1154 "add r12, r13\t\n"
1155 "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n"
1156 "vmulps zmm8,zmm15,zmm14\t\n"
1157 "vmulps zmm9,zmm16,zmm14\t\n"
1158 "add r12, r13\t\n"
1159 "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n"
1160 "vmulps zmm10,zmm15,zmm14\t\n"
1161 "vmulps zmm11,zmm16,zmm14\t\n"
1162 "add r12, r13\t\n"
1163 "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n"
1164 "vmulps zmm12,zmm15,zmm14\t\n"
1165 "vmulps zmm13,zmm16,zmm14\t\n"
1166 "mov r12, rcx\t\n"
1167 "test r14,r14\t\n"
1168 "jnz next_inner%=\t\n"
1169 "add r10,64\t\n"
1170 "jmp dump_C%=\t\n"
1171
1172 "loop_inner%=:\t\n"
1173
1174 "vmovaps zmm15,zmm31\t\n"
1175 "vcvtph2ps zmm16,YMMWORD PTR [r10 + 32]\t\n"
1176 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
1177 "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n"
1178 "vfmadd231ps zmm0,zmm15,zmm14\t\n"
1179 "vfmadd231ps zmm1,zmm16,zmm14\t\n"
1180 "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n"
1181 "vfmadd231ps zmm2,zmm15,zmm14\t\n"
1182 "vfmadd231ps zmm3,zmm16,zmm14\t\n"
1183 "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n"
1184 "vfmadd231ps zmm4,zmm15,zmm14\t\n"
1185 "vfmadd231ps zmm5,zmm16,zmm14\t\n"
1186 "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n"
1187 "vfmadd231ps zmm6,zmm15,zmm14\t\n"
1188 "vfmadd231ps zmm7,zmm16,zmm14\t\n"
1189 "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n"
1190 "vfmadd231ps zmm8,zmm15,zmm14\t\n"
1191 "vfmadd231ps zmm9,zmm16,zmm14\t\n"
1192 "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n"
1193 "vfmadd231ps zmm10,zmm15,zmm14\t\n"
1194 "vfmadd231ps zmm11,zmm16,zmm14\t\n"
1195 "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n"
1196 "vfmadd231ps zmm12,zmm15,zmm14\t\n"
1197 "vfmadd231ps zmm13,zmm16,zmm14\t\n"
1198
1199 "next_inner%=:\t\n"
1200 "add r9,28\t\n"
1201 "add r10,64\t\n"
1202 "dec r14\t\n"
1203 "jnz loop_inner%=\t\n"
1204
1205 "vmovaps zmm15,zmm31\t\n"
1206 "vcvtph2ps zmm16,YMMWORD PTR [r10 + 32]\t\n"
1207 "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n"
1208 "vfmadd231ps zmm0,zmm15,zmm14\t\n"
1209 "vfmadd231ps zmm1,zmm16,zmm14\t\n"
1210 "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n"
1211 "vfmadd231ps zmm2,zmm15,zmm14\t\n"
1212 "vfmadd231ps zmm3,zmm16,zmm14\t\n"
1213 "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n"
1214 "vfmadd231ps zmm4,zmm15,zmm14\t\n"
1215 "vfmadd231ps zmm5,zmm16,zmm14\t\n"
1216 "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n"
1217 "vfmadd231ps zmm6,zmm15,zmm14\t\n"
1218 "vfmadd231ps zmm7,zmm16,zmm14\t\n"
1219 "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n"
1220 "vfmadd231ps zmm8,zmm15,zmm14\t\n"
1221 "vfmadd231ps zmm9,zmm16,zmm14\t\n"
1222 "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n"
1223 "vfmadd231ps zmm10,zmm15,zmm14\t\n"
1224 "vfmadd231ps zmm11,zmm16,zmm14\t\n"
1225 "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n"
1226 "vfmadd231ps zmm12,zmm15,zmm14\t\n"
1227 "vfmadd231ps zmm13,zmm16,zmm14\t\n"
1228 "add r9,28\t\n"
1229 "add r10,64\t\n"
1230 // Dump C
1231 "dump_C%=:\t\n"
1232 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
1233 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
1234 "add r12, r13\t\n"
1235 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
1236 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
1237 "add r12, r13\t\n"
1238 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
1239 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
1240 "add r12, r13\t\n"
1241 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
1242 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
1243 "add r12, r13\t\n"
1244 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
1245 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
1246 "add r12, r13\t\n"
1247 "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
1248 "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
1249 "add r12, r13\t\n"
1250 "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
1251 "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
1252
1253 // next outer iteration
1254 "add rcx, 128\t\n"
1255 "mov r12, rcx\t\n"
1256 "mov r9, rax\t\n"
1257 "inc rbx\t\n"
1258 "cmp rbx, rdi\t\n"
1259 "jl loop_outter%=\t\n"
1260 :
1261 : [gp] "rm"(gp)
1262 : "r8",
1263 "r9",
1264 "r10",
1265 "r11",
1266 "r13",
1267 "r14",
1268 "rax",
1269 "rcx",
1270 "rsi",
1271 "rdi",
1272 "rbx",
1273 "r12",
1274 "r15",
1275 "memory");
1276}
// Micro-kernel implemented as a single inline-asm block (Intel syntax).
// Every pass of the outer loop builds an 8-row x 32-float tile of C in
// zmm0-zmm15 (two 16-float registers per row) and writes it back with
// unaligned stores.
//
// Per k step the kernel reads:
//   - B: two groups of 16 fp16 values at [r10 + 0] and [r10 + 32], widened
//     to fp32 with vcvtph2ps; r10 then advances by 64 bytes.
//   - A: 8 fp32 scalars at [r9 + 0] .. [r9 + 28], each broadcast to a full
//     zmm register; r9 then advances by 32 bytes.
//
// Fields of *gp are read at fixed byte offsets (0: k, 8: A, 16: B, 24: beta,
// 32: C, 40: ldc, 48: b_block_cols, 56: b_block_size). The field names are
// taken from the comments in the asm; the struct is declared elsewhere.
// ldc is added directly to the C pointer, i.e. it is used as a byte stride.
//
// The k loop is peeled: the first step is fused with the accumulator setup,
// "loop_inner" handles the middle steps, and the last step is a copy of the
// loop body without the B preload. zmm31 first carries beta and is then
// reused for the preloaded first half of the next step's B row.
//
// NOTE(review): both loops are bottom-tested, so the code appears to assume
// k >= 1 and b_block_cols >= 1 - confirm against the callers.
void NOINLINE gemmkernel_8x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
  asm volatile(
#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
      // Fetch gp with an AT&T-syntax mov, then switch the assembler to Intel
      // syntax for the rest of the block.
      "mov %[gp], %%r14\t\n"
      ".intel_syntax noprefix\t\n"
#else
      "mov r14, %[gp]\t\n"
#endif

      // Copy parameters
      // k: r8 = k - 1, the number of k steps left after the peeled first one
      "mov r8, [r14 + 0]\t\n"
      "dec r8\t\n"
      // A
      "mov r9, [r14 + 8]\t\n"
      // B
      "mov r10, [r14 + 16]\t\n"
      // beta: r15 receives the address of the field, not its value
      "lea r15, [r14 + 24]\t\n"
      // C
      "mov r12, [r14 + 32]\t\n"
      // ldc: added to r12 as a byte offset between consecutive C rows
      "mov r13, [r14 + 40]\t\n"
      // b_block_cols: trip count of the outer loop
      "mov rdi, [r14 + 48]\t\n"
      // b_block_size: loaded, but rsi is not referenced again in this kernel
      "mov rsi, [r14 + 56]\t\n"

      // Make copies of A and C
      // rax keeps the A base so r9 can be rewound for every column block;
      // rcx tracks the first row of the C tile currently being produced.
      "mov rax, r9\t\n"
      "mov rcx, r12\t\n"

      // rbx = outer-loop counter. The %= suffix on labels expands to a number
      // unique to this asm statement.
      "xor ebx, ebx\t\n"
      "loop_outter%=:\t\n"
      // r14 is no longer needed as the gp pointer; it becomes the k counter.
      "mov r14, r8\t\n"
      // zmm31 = beta in every lane; zmm17/zmm18 = first B row (2 x 16 floats)
      "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
      "vcvtph2ps zmm17,YMMWORD PTR [r10 + 0]\t\n"
      "vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n"
      // Compare beta with 0.0. When ZF is set (equal; vcomiss also sets ZF for
      // an unordered compare such as a NaN beta) C is not read at all.
      "vxorps xmm0, xmm0, xmm0\t\n"
      "vcomiss xmm31, xmm0\t\n"
      "jz zero_regs%=\t\n"

      // Setup values with beta multiplication
      // Row i of the tile: zmm(2i), zmm(2i+1) = beta * C[row i][0..31];
      // r12 walks down the rows by ldc bytes.
      "vmulps zmm0, zmm31, [r12 + 0]\t\n"
      "vmulps zmm1, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm2, zmm31, [r12 + 0]\t\n"
      "vmulps zmm3, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm4, zmm31, [r12 + 0]\t\n"
      "vmulps zmm5, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm6, zmm31, [r12 + 0]\t\n"
      "vmulps zmm7, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm8, zmm31, [r12 + 0]\t\n"
      "vmulps zmm9, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm10, zmm31, [r12 + 0]\t\n"
      "vmulps zmm11, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm12, zmm31, [r12 + 0]\t\n"
      "vmulps zmm13, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm14, zmm31, [r12 + 0]\t\n"
      "vmulps zmm15, zmm31, [r12 + 64]\t\n"
      // beta is no longer needed: if more k steps follow, reuse zmm31 to
      // preload the first half of the next B row.
      "test r14,r14\t\n"
      "jz skip_preload%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload%=:\t\n"
      // First k step: broadcast A[i] from [r9 + 4*i] into zmm16 and
      // accumulate A[i] * B into the two registers of row i.
      "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm17,zmm16\t\n"
      "vfmadd231ps zmm1,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm17,zmm16\t\n"
      "vfmadd231ps zmm3,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm17,zmm16\t\n"
      "vfmadd231ps zmm5,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm17,zmm16\t\n"
      "vfmadd231ps zmm7,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm17,zmm16\t\n"
      "vfmadd231ps zmm9,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm17,zmm16\t\n"
      "vfmadd231ps zmm11,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm17,zmm16\t\n"
      "vfmadd231ps zmm13,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm17,zmm16\t\n"
      "vfmadd231ps zmm15,zmm18,zmm16\t\n"
      // Rewind r12 to the first row of the tile for the store phase.
      "mov r12, rcx\t\n"
      // More k steps left -> enter the loop at its advance/decrement tail.
      // Otherwise (k == 1) step B past the consumed row and store the tile.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Beta compared equal to zero: the accumulators are initialised with
      // the first A * B product instead of beta * C.
      "zero_regs%=:\t\n"

      // Preload the first half of the next B row when more k steps follow.
      "test r14,r14\t\n"
      "jz skip_preload_b_zero%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload_b_zero%=:\t\n"
      // r12 is advanced between rows here as well, although nothing in this
      // path dereferences it before it is reset from rcx below.
      "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
      "vmulps zmm0,zmm17,zmm16\t\n"
      "vmulps zmm1,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
      "vmulps zmm2,zmm17,zmm16\t\n"
      "vmulps zmm3,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
      "vmulps zmm4,zmm17,zmm16\t\n"
      "vmulps zmm5,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
      "vmulps zmm6,zmm17,zmm16\t\n"
      "vmulps zmm7,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
      "vmulps zmm8,zmm17,zmm16\t\n"
      "vmulps zmm9,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
      "vmulps zmm10,zmm17,zmm16\t\n"
      "vmulps zmm11,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
      "vmulps zmm12,zmm17,zmm16\t\n"
      "vmulps zmm13,zmm18,zmm16\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
      "vmulps zmm14,zmm17,zmm16\t\n"
      "vmulps zmm15,zmm18,zmm16\t\n"
      "mov r12, rcx\t\n"
      // Same exit logic as the beta != 0 path above.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Middle k steps. On entry r10 points at the current B row and zmm31
      // already holds that row's first 16 values.
      "loop_inner%=:\t\n"

      // zmm17/zmm18 = current B row; zmm31 = first half of the following row
      "vmovaps zmm17,zmm31\t\n"
      "vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm17,zmm16\t\n"
      "vfmadd231ps zmm1,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm17,zmm16\t\n"
      "vfmadd231ps zmm3,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm17,zmm16\t\n"
      "vfmadd231ps zmm5,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm17,zmm16\t\n"
      "vfmadd231ps zmm7,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm17,zmm16\t\n"
      "vfmadd231ps zmm9,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm17,zmm16\t\n"
      "vfmadd231ps zmm11,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm17,zmm16\t\n"
      "vfmadd231ps zmm13,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm17,zmm16\t\n"
      "vfmadd231ps zmm15,zmm18,zmm16\t\n"

      // Advance A by 8 floats and B by 32 halves; loop while more than one
      // k step remains.
      "next_inner%=:\t\n"
      "add r9,32\t\n"
      "add r10,64\t\n"
      "dec r14\t\n"
      "jnz loop_inner%=\t\n"

      // Last k step: identical to the loop body, minus the preload of a
      // following B row.
      "vmovaps zmm17,zmm31\t\n"
      "vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm17,zmm16\t\n"
      "vfmadd231ps zmm1,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm17,zmm16\t\n"
      "vfmadd231ps zmm3,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm17,zmm16\t\n"
      "vfmadd231ps zmm5,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm17,zmm16\t\n"
      "vfmadd231ps zmm7,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm17,zmm16\t\n"
      "vfmadd231ps zmm9,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm17,zmm16\t\n"
      "vfmadd231ps zmm11,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm17,zmm16\t\n"
      "vfmadd231ps zmm13,zmm18,zmm16\t\n"
      "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm17,zmm16\t\n"
      "vfmadd231ps zmm15,zmm18,zmm16\t\n"
      "add r9,32\t\n"
      "add r10,64\t\n"
      // Dump C
      // Store the 8 accumulated rows; r12 steps down by ldc bytes per row.
      "dump_C%=:\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm15\t\n"

      // next outer iteration
      // Move the tile 128 bytes (32 floats) to the right in C and rewind A.
      // r10 is not rewound, so B keeps advancing across column blocks.
      "add rcx, 128\t\n"
      "mov r12, rcx\t\n"
      "mov r9, rax\t\n"
      "inc rbx\t\n"
      // Signed compare of the block counter against b_block_cols.
      "cmp rbx, rdi\t\n"
      "jl loop_outter%=\t\n"
      :
      : [gp] "rm"(gp)
      // Clobbers: only general-purpose registers and memory are listed. r11
      // is listed although the code above never references it.
      // NOTE(review): the zmm registers written above are not declared here -
      // confirm that this is intentional.
      : "r8",
        "r9",
        "r10",
        "r11",
        "r13",
        "r14",
        "rax",
        "rcx",
        "rsi",
        "rdi",
        "rbx",
        "r12",
        "r15",
        "memory");
}
// Micro-kernel implemented as a single inline-asm block (Intel syntax).
// Every pass of the outer loop builds a 9-row x 32-float tile of C in
// zmm0-zmm17 (two 16-float registers per row) and writes it back with
// unaligned stores.
//
// Per k step the kernel reads:
//   - B: two groups of 16 fp16 values at [r10 + 0] and [r10 + 32], widened
//     to fp32 with vcvtph2ps; r10 then advances by 64 bytes.
//   - A: 9 fp32 scalars at [r9 + 0] .. [r9 + 32], each broadcast to a full
//     zmm register; r9 then advances by 36 bytes.
//
// Fields of *gp are read at fixed byte offsets (0: k, 8: A, 16: B, 24: beta,
// 32: C, 40: ldc, 48: b_block_cols, 56: b_block_size). The field names are
// taken from the comments in the asm; the struct is declared elsewhere.
// ldc is added directly to the C pointer, i.e. it is used as a byte stride.
//
// The k loop is peeled: the first step is fused with the accumulator setup,
// "loop_inner" handles the middle steps, and the last step is a copy of the
// loop body without the B preload. zmm31 first carries beta and is then
// reused for the preloaded first half of the next step's B row.
//
// NOTE(review): both loops are bottom-tested, so the code appears to assume
// k >= 1 and b_block_cols >= 1 - confirm against the callers.
void NOINLINE gemmkernel_9x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
  asm volatile(
#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
      // Fetch gp with an AT&T-syntax mov, then switch the assembler to Intel
      // syntax for the rest of the block.
      "mov %[gp], %%r14\t\n"
      ".intel_syntax noprefix\t\n"
#else
      "mov r14, %[gp]\t\n"
#endif

      // Copy parameters
      // k: r8 = k - 1, the number of k steps left after the peeled first one
      "mov r8, [r14 + 0]\t\n"
      "dec r8\t\n"
      // A
      "mov r9, [r14 + 8]\t\n"
      // B
      "mov r10, [r14 + 16]\t\n"
      // beta: r15 receives the address of the field, not its value
      "lea r15, [r14 + 24]\t\n"
      // C
      "mov r12, [r14 + 32]\t\n"
      // ldc: added to r12 as a byte offset between consecutive C rows
      "mov r13, [r14 + 40]\t\n"
      // b_block_cols: trip count of the outer loop
      "mov rdi, [r14 + 48]\t\n"
      // b_block_size: loaded, but rsi is not referenced again in this kernel
      "mov rsi, [r14 + 56]\t\n"

      // Make copies of A and C
      // rax keeps the A base so r9 can be rewound for every column block;
      // rcx tracks the first row of the C tile currently being produced.
      "mov rax, r9\t\n"
      "mov rcx, r12\t\n"

      // rbx = outer-loop counter. The %= suffix on labels expands to a number
      // unique to this asm statement.
      "xor ebx, ebx\t\n"
      "loop_outter%=:\t\n"
      // r14 is no longer needed as the gp pointer; it becomes the k counter.
      "mov r14, r8\t\n"
      // zmm31 = beta in every lane; zmm19/zmm20 = first B row (2 x 16 floats)
      "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
      "vcvtph2ps zmm19,YMMWORD PTR [r10 + 0]\t\n"
      "vcvtph2ps zmm20,YMMWORD PTR [r10 + 32]\t\n"
      // Compare beta with 0.0. When ZF is set (equal; vcomiss also sets ZF for
      // an unordered compare such as a NaN beta) C is not read at all.
      "vxorps xmm0, xmm0, xmm0\t\n"
      "vcomiss xmm31, xmm0\t\n"
      "jz zero_regs%=\t\n"

      // Setup values with beta multiplication
      // Row i of the tile: zmm(2i), zmm(2i+1) = beta * C[row i][0..31];
      // r12 walks down the rows by ldc bytes.
      "vmulps zmm0, zmm31, [r12 + 0]\t\n"
      "vmulps zmm1, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm2, zmm31, [r12 + 0]\t\n"
      "vmulps zmm3, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm4, zmm31, [r12 + 0]\t\n"
      "vmulps zmm5, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm6, zmm31, [r12 + 0]\t\n"
      "vmulps zmm7, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm8, zmm31, [r12 + 0]\t\n"
      "vmulps zmm9, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm10, zmm31, [r12 + 0]\t\n"
      "vmulps zmm11, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm12, zmm31, [r12 + 0]\t\n"
      "vmulps zmm13, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm14, zmm31, [r12 + 0]\t\n"
      "vmulps zmm15, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm16, zmm31, [r12 + 0]\t\n"
      "vmulps zmm17, zmm31, [r12 + 64]\t\n"
      // beta is no longer needed: if more k steps follow, reuse zmm31 to
      // preload the first half of the next B row.
      "test r14,r14\t\n"
      "jz skip_preload%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload%=:\t\n"
      // First k step: broadcast A[i] from [r9 + 4*i] into zmm18 and
      // accumulate A[i] * B into the two registers of row i.
      "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm19,zmm18\t\n"
      "vfmadd231ps zmm1,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm19,zmm18\t\n"
      "vfmadd231ps zmm3,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm19,zmm18\t\n"
      "vfmadd231ps zmm5,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm19,zmm18\t\n"
      "vfmadd231ps zmm7,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm19,zmm18\t\n"
      "vfmadd231ps zmm9,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm19,zmm18\t\n"
      "vfmadd231ps zmm11,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm19,zmm18\t\n"
      "vfmadd231ps zmm13,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm19,zmm18\t\n"
      "vfmadd231ps zmm15,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm19,zmm18\t\n"
      "vfmadd231ps zmm17,zmm20,zmm18\t\n"
      // Rewind r12 to the first row of the tile for the store phase.
      "mov r12, rcx\t\n"
      // More k steps left -> enter the loop at its advance/decrement tail.
      // Otherwise (k == 1) step B past the consumed row and store the tile.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Beta compared equal to zero: the accumulators are initialised with
      // the first A * B product instead of beta * C.
      "zero_regs%=:\t\n"

      // Preload the first half of the next B row when more k steps follow.
      "test r14,r14\t\n"
      "jz skip_preload_b_zero%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload_b_zero%=:\t\n"
      // r12 is advanced between rows here as well, although nothing in this
      // path dereferences it before it is reset from rcx below.
      "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n"
      "vmulps zmm0,zmm19,zmm18\t\n"
      "vmulps zmm1,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n"
      "vmulps zmm2,zmm19,zmm18\t\n"
      "vmulps zmm3,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n"
      "vmulps zmm4,zmm19,zmm18\t\n"
      "vmulps zmm5,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n"
      "vmulps zmm6,zmm19,zmm18\t\n"
      "vmulps zmm7,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n"
      "vmulps zmm8,zmm19,zmm18\t\n"
      "vmulps zmm9,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n"
      "vmulps zmm10,zmm19,zmm18\t\n"
      "vmulps zmm11,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n"
      "vmulps zmm12,zmm19,zmm18\t\n"
      "vmulps zmm13,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n"
      "vmulps zmm14,zmm19,zmm18\t\n"
      "vmulps zmm15,zmm20,zmm18\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n"
      "vmulps zmm16,zmm19,zmm18\t\n"
      "vmulps zmm17,zmm20,zmm18\t\n"
      "mov r12, rcx\t\n"
      // Same exit logic as the beta != 0 path above.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Middle k steps. On entry r10 points at the current B row and zmm31
      // already holds that row's first 16 values.
      "loop_inner%=:\t\n"

      // zmm19/zmm20 = current B row; zmm31 = first half of the following row
      "vmovaps zmm19,zmm31\t\n"
      "vcvtph2ps zmm20,YMMWORD PTR [r10 + 32]\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm19,zmm18\t\n"
      "vfmadd231ps zmm1,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm19,zmm18\t\n"
      "vfmadd231ps zmm3,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm19,zmm18\t\n"
      "vfmadd231ps zmm5,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm19,zmm18\t\n"
      "vfmadd231ps zmm7,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm19,zmm18\t\n"
      "vfmadd231ps zmm9,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm19,zmm18\t\n"
      "vfmadd231ps zmm11,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm19,zmm18\t\n"
      "vfmadd231ps zmm13,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm19,zmm18\t\n"
      "vfmadd231ps zmm15,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm19,zmm18\t\n"
      "vfmadd231ps zmm17,zmm20,zmm18\t\n"

      // Advance A by 9 floats and B by 32 halves; loop while more than one
      // k step remains.
      "next_inner%=:\t\n"
      "add r9,36\t\n"
      "add r10,64\t\n"
      "dec r14\t\n"
      "jnz loop_inner%=\t\n"

      // Last k step: identical to the loop body, minus the preload of a
      // following B row.
      "vmovaps zmm19,zmm31\t\n"
      "vcvtph2ps zmm20,YMMWORD PTR [r10 + 32]\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm19,zmm18\t\n"
      "vfmadd231ps zmm1,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm19,zmm18\t\n"
      "vfmadd231ps zmm3,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm19,zmm18\t\n"
      "vfmadd231ps zmm5,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm19,zmm18\t\n"
      "vfmadd231ps zmm7,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm19,zmm18\t\n"
      "vfmadd231ps zmm9,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm19,zmm18\t\n"
      "vfmadd231ps zmm11,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm19,zmm18\t\n"
      "vfmadd231ps zmm13,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm19,zmm18\t\n"
      "vfmadd231ps zmm15,zmm20,zmm18\t\n"
      "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm19,zmm18\t\n"
      "vfmadd231ps zmm17,zmm20,zmm18\t\n"
      "add r9,36\t\n"
      "add r10,64\t\n"
      // Dump C
      // Store the 9 accumulated rows; r12 steps down by ldc bytes per row.
      "dump_C%=:\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm17\t\n"

      // next outer iteration
      // Move the tile 128 bytes (32 floats) to the right in C and rewind A.
      // r10 is not rewound, so B keeps advancing across column blocks.
      "add rcx, 128\t\n"
      "mov r12, rcx\t\n"
      "mov r9, rax\t\n"
      "inc rbx\t\n"
      // Signed compare of the block counter against b_block_cols.
      "cmp rbx, rdi\t\n"
      "jl loop_outter%=\t\n"
      :
      : [gp] "rm"(gp)
      // Clobbers: only general-purpose registers and memory are listed. r11
      // is listed although the code above never references it.
      // NOTE(review): the zmm registers written above are not declared here -
      // confirm that this is intentional.
      : "r8",
        "r9",
        "r10",
        "r11",
        "r13",
        "r14",
        "rax",
        "rcx",
        "rsi",
        "rdi",
        "rbx",
        "r12",
        "r15",
        "memory");
}
// Micro-kernel implemented as a single inline-asm block (Intel syntax).
// Every pass of the outer loop builds a 10-row x 32-float tile of C in
// zmm0-zmm19 (two 16-float registers per row) and writes it back with
// unaligned stores.
//
// Per k step the kernel reads:
//   - B: two groups of 16 fp16 values at [r10 + 0] and [r10 + 32], widened
//     to fp32 with vcvtph2ps; r10 then advances by 64 bytes.
//   - A: 10 fp32 scalars at [r9 + 0] .. [r9 + 36], each broadcast to a full
//     zmm register; r9 then advances by 40 bytes.
//
// Fields of *gp are read at fixed byte offsets (0: k, 8: A, 16: B, 24: beta,
// 32: C, 40: ldc, 48: b_block_cols, 56: b_block_size). The field names are
// taken from the comments in the asm; the struct is declared elsewhere.
// ldc is added directly to the C pointer, i.e. it is used as a byte stride.
//
// The k loop is peeled: the first step is fused with the accumulator setup,
// "loop_inner" handles the middle steps, and the last step is a copy of the
// loop body without the B preload. zmm31 first carries beta and is then
// reused for the preloaded first half of the next step's B row.
//
// NOTE(review): both loops are bottom-tested, so the code appears to assume
// k >= 1 and b_block_cols >= 1 - confirm against the callers.
void NOINLINE gemmkernel_10x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
  asm volatile(
#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
      // Fetch gp with an AT&T-syntax mov, then switch the assembler to Intel
      // syntax for the rest of the block.
      "mov %[gp], %%r14\t\n"
      ".intel_syntax noprefix\t\n"
#else
      "mov r14, %[gp]\t\n"
#endif

      // Copy parameters
      // k: r8 = k - 1, the number of k steps left after the peeled first one
      "mov r8, [r14 + 0]\t\n"
      "dec r8\t\n"
      // A
      "mov r9, [r14 + 8]\t\n"
      // B
      "mov r10, [r14 + 16]\t\n"
      // beta: r15 receives the address of the field, not its value
      "lea r15, [r14 + 24]\t\n"
      // C
      "mov r12, [r14 + 32]\t\n"
      // ldc: added to r12 as a byte offset between consecutive C rows
      "mov r13, [r14 + 40]\t\n"
      // b_block_cols: trip count of the outer loop
      "mov rdi, [r14 + 48]\t\n"
      // b_block_size: loaded, but rsi is not referenced again in this kernel
      "mov rsi, [r14 + 56]\t\n"

      // Make copies of A and C
      // rax keeps the A base so r9 can be rewound for every column block;
      // rcx tracks the first row of the C tile currently being produced.
      "mov rax, r9\t\n"
      "mov rcx, r12\t\n"

      // rbx = outer-loop counter. The %= suffix on labels expands to a number
      // unique to this asm statement.
      "xor ebx, ebx\t\n"
      "loop_outter%=:\t\n"
      // r14 is no longer needed as the gp pointer; it becomes the k counter.
      "mov r14, r8\t\n"
      // zmm31 = beta in every lane; zmm21/zmm22 = first B row (2 x 16 floats)
      "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
      "vcvtph2ps zmm21,YMMWORD PTR [r10 + 0]\t\n"
      "vcvtph2ps zmm22,YMMWORD PTR [r10 + 32]\t\n"
      // Compare beta with 0.0. When ZF is set (equal; vcomiss also sets ZF for
      // an unordered compare such as a NaN beta) C is not read at all.
      "vxorps xmm0, xmm0, xmm0\t\n"
      "vcomiss xmm31, xmm0\t\n"
      "jz zero_regs%=\t\n"

      // Setup values with beta multiplication
      // Row i of the tile: zmm(2i), zmm(2i+1) = beta * C[row i][0..31];
      // r12 walks down the rows by ldc bytes.
      "vmulps zmm0, zmm31, [r12 + 0]\t\n"
      "vmulps zmm1, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm2, zmm31, [r12 + 0]\t\n"
      "vmulps zmm3, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm4, zmm31, [r12 + 0]\t\n"
      "vmulps zmm5, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm6, zmm31, [r12 + 0]\t\n"
      "vmulps zmm7, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm8, zmm31, [r12 + 0]\t\n"
      "vmulps zmm9, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm10, zmm31, [r12 + 0]\t\n"
      "vmulps zmm11, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm12, zmm31, [r12 + 0]\t\n"
      "vmulps zmm13, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm14, zmm31, [r12 + 0]\t\n"
      "vmulps zmm15, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm16, zmm31, [r12 + 0]\t\n"
      "vmulps zmm17, zmm31, [r12 + 64]\t\n"
      "add r12, r13\t\n"
      "vmulps zmm18, zmm31, [r12 + 0]\t\n"
      "vmulps zmm19, zmm31, [r12 + 64]\t\n"
      // beta is no longer needed: if more k steps follow, reuse zmm31 to
      // preload the first half of the next B row.
      "test r14,r14\t\n"
      "jz skip_preload%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload%=:\t\n"
      // First k step: broadcast A[i] from [r9 + 4*i] into zmm20 and
      // accumulate A[i] * B into the two registers of row i.
      "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm21,zmm20\t\n"
      "vfmadd231ps zmm1,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm21,zmm20\t\n"
      "vfmadd231ps zmm3,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm21,zmm20\t\n"
      "vfmadd231ps zmm5,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm21,zmm20\t\n"
      "vfmadd231ps zmm7,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm21,zmm20\t\n"
      "vfmadd231ps zmm9,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm21,zmm20\t\n"
      "vfmadd231ps zmm11,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm21,zmm20\t\n"
      "vfmadd231ps zmm13,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm21,zmm20\t\n"
      "vfmadd231ps zmm15,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm21,zmm20\t\n"
      "vfmadd231ps zmm17,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n"
      "vfmadd231ps zmm18,zmm21,zmm20\t\n"
      "vfmadd231ps zmm19,zmm22,zmm20\t\n"
      // Rewind r12 to the first row of the tile for the store phase.
      "mov r12, rcx\t\n"
      // More k steps left -> enter the loop at its advance/decrement tail.
      // Otherwise (k == 1) step B past the consumed row and store the tile.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Beta compared equal to zero: the accumulators are initialised with
      // the first A * B product instead of beta * C.
      "zero_regs%=:\t\n"

      // Preload the first half of the next B row when more k steps follow.
      "test r14,r14\t\n"
      "jz skip_preload_b_zero%=\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "skip_preload_b_zero%=:\t\n"
      // r12 is advanced between rows here as well, although nothing in this
      // path dereferences it before it is reset from rcx below.
      "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n"
      "vmulps zmm0,zmm21,zmm20\t\n"
      "vmulps zmm1,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n"
      "vmulps zmm2,zmm21,zmm20\t\n"
      "vmulps zmm3,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n"
      "vmulps zmm4,zmm21,zmm20\t\n"
      "vmulps zmm5,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n"
      "vmulps zmm6,zmm21,zmm20\t\n"
      "vmulps zmm7,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n"
      "vmulps zmm8,zmm21,zmm20\t\n"
      "vmulps zmm9,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n"
      "vmulps zmm10,zmm21,zmm20\t\n"
      "vmulps zmm11,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n"
      "vmulps zmm12,zmm21,zmm20\t\n"
      "vmulps zmm13,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n"
      "vmulps zmm14,zmm21,zmm20\t\n"
      "vmulps zmm15,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n"
      "vmulps zmm16,zmm21,zmm20\t\n"
      "vmulps zmm17,zmm22,zmm20\t\n"
      "add r12, r13\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n"
      "vmulps zmm18,zmm21,zmm20\t\n"
      "vmulps zmm19,zmm22,zmm20\t\n"
      "mov r12, rcx\t\n"
      // Same exit logic as the beta != 0 path above.
      "test r14,r14\t\n"
      "jnz next_inner%=\t\n"
      "add r10,64\t\n"
      "jmp dump_C%=\t\n"

      // Middle k steps. On entry r10 points at the current B row and zmm31
      // already holds that row's first 16 values.
      "loop_inner%=:\t\n"

      // zmm21/zmm22 = current B row; zmm31 = first half of the following row
      "vmovaps zmm21,zmm31\t\n"
      "vcvtph2ps zmm22,YMMWORD PTR [r10 + 32]\t\n"
      "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm21,zmm20\t\n"
      "vfmadd231ps zmm1,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm21,zmm20\t\n"
      "vfmadd231ps zmm3,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm21,zmm20\t\n"
      "vfmadd231ps zmm5,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm21,zmm20\t\n"
      "vfmadd231ps zmm7,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm21,zmm20\t\n"
      "vfmadd231ps zmm9,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm21,zmm20\t\n"
      "vfmadd231ps zmm11,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm21,zmm20\t\n"
      "vfmadd231ps zmm13,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm21,zmm20\t\n"
      "vfmadd231ps zmm15,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm21,zmm20\t\n"
      "vfmadd231ps zmm17,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n"
      "vfmadd231ps zmm18,zmm21,zmm20\t\n"
      "vfmadd231ps zmm19,zmm22,zmm20\t\n"

      // Advance A by 10 floats and B by 32 halves; loop while more than one
      // k step remains.
      "next_inner%=:\t\n"
      "add r9,40\t\n"
      "add r10,64\t\n"
      "dec r14\t\n"
      "jnz loop_inner%=\t\n"

      // Last k step: identical to the loop body, minus the preload of a
      // following B row.
      "vmovaps zmm21,zmm31\t\n"
      "vcvtph2ps zmm22,YMMWORD PTR [r10 + 32]\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n"
      "vfmadd231ps zmm0,zmm21,zmm20\t\n"
      "vfmadd231ps zmm1,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n"
      "vfmadd231ps zmm2,zmm21,zmm20\t\n"
      "vfmadd231ps zmm3,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n"
      "vfmadd231ps zmm4,zmm21,zmm20\t\n"
      "vfmadd231ps zmm5,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n"
      "vfmadd231ps zmm6,zmm21,zmm20\t\n"
      "vfmadd231ps zmm7,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n"
      "vfmadd231ps zmm8,zmm21,zmm20\t\n"
      "vfmadd231ps zmm9,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n"
      "vfmadd231ps zmm10,zmm21,zmm20\t\n"
      "vfmadd231ps zmm11,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n"
      "vfmadd231ps zmm12,zmm21,zmm20\t\n"
      "vfmadd231ps zmm13,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n"
      "vfmadd231ps zmm14,zmm21,zmm20\t\n"
      "vfmadd231ps zmm15,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n"
      "vfmadd231ps zmm16,zmm21,zmm20\t\n"
      "vfmadd231ps zmm17,zmm22,zmm20\t\n"
      "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n"
      "vfmadd231ps zmm18,zmm21,zmm20\t\n"
      "vfmadd231ps zmm19,zmm22,zmm20\t\n"
      "add r9,40\t\n"
      "add r10,64\t\n"
      // Dump C
      // Store the 10 accumulated rows; r12 steps down by ldc bytes per row.
      "dump_C%=:\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm17\t\n"
      "add r12, r13\t\n"
      "vmovups zmmword PTR [r12 + 0], zmm18\t\n"
      "vmovups zmmword PTR [r12 + 64], zmm19\t\n"

      // next outer iteration
      // Move the tile 128 bytes (32 floats) to the right in C and rewind A.
      // r10 is not rewound, so B keeps advancing across column blocks.
      "add rcx, 128\t\n"
      "mov r12, rcx\t\n"
      "mov r9, rax\t\n"
      "inc rbx\t\n"
      // Signed compare of the block counter against b_block_cols.
      "cmp rbx, rdi\t\n"
      "jl loop_outter%=\t\n"
      :
      : [gp] "rm"(gp)
      // Clobbers: only general-purpose registers and memory are listed. r11
      // is listed although the code above never references it.
      // NOTE(review): the zmm registers written above are not declared here -
      // confirm that this is intentional.
      : "r8",
        "r9",
        "r10",
        "r11",
        "r13",
        "r14",
        "rax",
        "rcx",
        "rsi",
        "rdi",
        "rbx",
        "r12",
        "r15",
        "memory");
}
2105void NOINLINE gemmkernel_11x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
2106 asm volatile(
2107#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
2108 "mov %[gp], %%r14\t\n"
2109 ".intel_syntax noprefix\t\n"
2110#else
2111 "mov r14, %[gp]\t\n"
2112#endif
2113
2114 // Copy parameters
2115 // k
2116 "mov r8, [r14 + 0]\t\n"
2117 "dec r8\t\n"
2118 // A
2119 "mov r9, [r14 + 8]\t\n"
2120 // B
2121 "mov r10, [r14 + 16]\t\n"
2122 // beta
2123 "lea r15, [r14 + 24]\t\n"
2124 // C
2125 "mov r12, [r14 + 32]\t\n"
2126 // ldc
2127 "mov r13, [r14 + 40]\t\n"
2128 // b_block_cols
2129 "mov rdi, [r14 + 48]\t\n"
2130 // b_block_size
2131 "mov rsi, [r14 + 56]\t\n"
2132
2133 // Make copies of A and C
2134 "mov rax, r9\t\n"
2135 "mov rcx, r12\t\n"
2136
2137 "xor ebx, ebx\t\n"
2138 "loop_outter%=:\t\n"
2139 "mov r14, r8\t\n"
2140 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
2141 "vcvtph2ps zmm23,YMMWORD PTR [r10 + 0]\t\n"
2142 "vcvtph2ps zmm24,YMMWORD PTR [r10 + 32]\t\n"
2143 "vxorps xmm0, xmm0, xmm0\t\n"
2144 "vcomiss xmm31, xmm0\t\n"
2145 "jz zero_regs%=\t\n"
2146
2147 // Setup values with beta multiplication
2148 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
2149 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
2150 "add r12, r13\t\n"
2151 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
2152 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
2153 "add r12, r13\t\n"
2154 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
2155 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
2156 "add r12, r13\t\n"
2157 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
2158 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
2159 "add r12, r13\t\n"
2160 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
2161 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
2162 "add r12, r13\t\n"
2163 "vmulps zmm10, zmm31, [r12 + 0]\t\n"
2164 "vmulps zmm11, zmm31, [r12 + 64]\t\n"
2165 "add r12, r13\t\n"
2166 "vmulps zmm12, zmm31, [r12 + 0]\t\n"
2167 "vmulps zmm13, zmm31, [r12 + 64]\t\n"
2168 "add r12, r13\t\n"
2169 "vmulps zmm14, zmm31, [r12 + 0]\t\n"
2170 "vmulps zmm15, zmm31, [r12 + 64]\t\n"
2171 "add r12, r13\t\n"
2172 "vmulps zmm16, zmm31, [r12 + 0]\t\n"
2173 "vmulps zmm17, zmm31, [r12 + 64]\t\n"
2174 "add r12, r13\t\n"
2175 "vmulps zmm18, zmm31, [r12 + 0]\t\n"
2176 "vmulps zmm19, zmm31, [r12 + 64]\t\n"
2177 "add r12, r13\t\n"
2178 "vmulps zmm20, zmm31, [r12 + 0]\t\n"
2179 "vmulps zmm21, zmm31, [r12 + 64]\t\n"
2180 "test r14,r14\t\n"
2181 "jz skip_preload%=\t\n"
2182 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2183 "skip_preload%=:\t\n"
2184 "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n"
2185 "vfmadd231ps zmm0,zmm23,zmm22\t\n"
2186 "vfmadd231ps zmm1,zmm24,zmm22\t\n"
2187 "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n"
2188 "vfmadd231ps zmm2,zmm23,zmm22\t\n"
2189 "vfmadd231ps zmm3,zmm24,zmm22\t\n"
2190 "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n"
2191 "vfmadd231ps zmm4,zmm23,zmm22\t\n"
2192 "vfmadd231ps zmm5,zmm24,zmm22\t\n"
2193 "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n"
2194 "vfmadd231ps zmm6,zmm23,zmm22\t\n"
2195 "vfmadd231ps zmm7,zmm24,zmm22\t\n"
2196 "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n"
2197 "vfmadd231ps zmm8,zmm23,zmm22\t\n"
2198 "vfmadd231ps zmm9,zmm24,zmm22\t\n"
2199 "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n"
2200 "vfmadd231ps zmm10,zmm23,zmm22\t\n"
2201 "vfmadd231ps zmm11,zmm24,zmm22\t\n"
2202 "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n"
2203 "vfmadd231ps zmm12,zmm23,zmm22\t\n"
2204 "vfmadd231ps zmm13,zmm24,zmm22\t\n"
2205 "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n"
2206 "vfmadd231ps zmm14,zmm23,zmm22\t\n"
2207 "vfmadd231ps zmm15,zmm24,zmm22\t\n"
2208 "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n"
2209 "vfmadd231ps zmm16,zmm23,zmm22\t\n"
2210 "vfmadd231ps zmm17,zmm24,zmm22\t\n"
2211 "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n"
2212 "vfmadd231ps zmm18,zmm23,zmm22\t\n"
2213 "vfmadd231ps zmm19,zmm24,zmm22\t\n"
2214 "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n"
2215 "vfmadd231ps zmm20,zmm23,zmm22\t\n"
2216 "vfmadd231ps zmm21,zmm24,zmm22\t\n"
2217 "mov r12, rcx\t\n"
2218 "test r14,r14\t\n"
2219 "jnz next_inner%=\t\n"
2220 "add r10,64\t\n"
2221 "jmp dump_C%=\t\n"
2222
2223 "zero_regs%=:\t\n"
2224
2225 "test r14,r14\t\n"
2226 "jz skip_preload_b_zero%=\t\n"
2227 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2228 "skip_preload_b_zero%=:\t\n"
2229 "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n"
2230 "vmulps zmm0,zmm23,zmm22\t\n"
2231 "vmulps zmm1,zmm24,zmm22\t\n"
2232 "add r12, r13\t\n"
2233 "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n"
2234 "vmulps zmm2,zmm23,zmm22\t\n"
2235 "vmulps zmm3,zmm24,zmm22\t\n"
2236 "add r12, r13\t\n"
2237 "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n"
2238 "vmulps zmm4,zmm23,zmm22\t\n"
2239 "vmulps zmm5,zmm24,zmm22\t\n"
2240 "add r12, r13\t\n"
2241 "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n"
2242 "vmulps zmm6,zmm23,zmm22\t\n"
2243 "vmulps zmm7,zmm24,zmm22\t\n"
2244 "add r12, r13\t\n"
2245 "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n"
2246 "vmulps zmm8,zmm23,zmm22\t\n"
2247 "vmulps zmm9,zmm24,zmm22\t\n"
2248 "add r12, r13\t\n"
2249 "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n"
2250 "vmulps zmm10,zmm23,zmm22\t\n"
2251 "vmulps zmm11,zmm24,zmm22\t\n"
2252 "add r12, r13\t\n"
2253 "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n"
2254 "vmulps zmm12,zmm23,zmm22\t\n"
2255 "vmulps zmm13,zmm24,zmm22\t\n"
2256 "add r12, r13\t\n"
2257 "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n"
2258 "vmulps zmm14,zmm23,zmm22\t\n"
2259 "vmulps zmm15,zmm24,zmm22\t\n"
2260 "add r12, r13\t\n"
2261 "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n"
2262 "vmulps zmm16,zmm23,zmm22\t\n"
2263 "vmulps zmm17,zmm24,zmm22\t\n"
2264 "add r12, r13\t\n"
2265 "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n"
2266 "vmulps zmm18,zmm23,zmm22\t\n"
2267 "vmulps zmm19,zmm24,zmm22\t\n"
2268 "add r12, r13\t\n"
2269 "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n"
2270 "vmulps zmm20,zmm23,zmm22\t\n"
2271 "vmulps zmm21,zmm24,zmm22\t\n"
2272 "mov r12, rcx\t\n"
2273 "test r14,r14\t\n"
2274 "jnz next_inner%=\t\n"
2275 "add r10,64\t\n"
2276 "jmp dump_C%=\t\n"
2277
2278 "loop_inner%=:\t\n"
2279
2280 "vmovaps zmm23,zmm31\t\n"
2281 "vcvtph2ps zmm24,YMMWORD PTR [r10 + 32]\t\n"
2282 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2283 "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n"
2284 "vfmadd231ps zmm0,zmm23,zmm22\t\n"
2285 "vfmadd231ps zmm1,zmm24,zmm22\t\n"
2286 "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n"
2287 "vfmadd231ps zmm2,zmm23,zmm22\t\n"
2288 "vfmadd231ps zmm3,zmm24,zmm22\t\n"
2289 "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n"
2290 "vfmadd231ps zmm4,zmm23,zmm22\t\n"
2291 "vfmadd231ps zmm5,zmm24,zmm22\t\n"
2292 "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n"
2293 "vfmadd231ps zmm6,zmm23,zmm22\t\n"
2294 "vfmadd231ps zmm7,zmm24,zmm22\t\n"
2295 "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n"
2296 "vfmadd231ps zmm8,zmm23,zmm22\t\n"
2297 "vfmadd231ps zmm9,zmm24,zmm22\t\n"
2298 "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n"
2299 "vfmadd231ps zmm10,zmm23,zmm22\t\n"
2300 "vfmadd231ps zmm11,zmm24,zmm22\t\n"
2301 "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n"
2302 "vfmadd231ps zmm12,zmm23,zmm22\t\n"
2303 "vfmadd231ps zmm13,zmm24,zmm22\t\n"
2304 "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n"
2305 "vfmadd231ps zmm14,zmm23,zmm22\t\n"
2306 "vfmadd231ps zmm15,zmm24,zmm22\t\n"
2307 "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n"
2308 "vfmadd231ps zmm16,zmm23,zmm22\t\n"
2309 "vfmadd231ps zmm17,zmm24,zmm22\t\n"
2310 "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n"
2311 "vfmadd231ps zmm18,zmm23,zmm22\t\n"
2312 "vfmadd231ps zmm19,zmm24,zmm22\t\n"
2313 "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n"
2314 "vfmadd231ps zmm20,zmm23,zmm22\t\n"
2315 "vfmadd231ps zmm21,zmm24,zmm22\t\n"
2316
2317 "next_inner%=:\t\n"
2318 "add r9,44\t\n"
2319 "add r10,64\t\n"
2320 "dec r14\t\n"
2321 "jnz loop_inner%=\t\n"
2322
2323 "vmovaps zmm23,zmm31\t\n"
2324 "vcvtph2ps zmm24,YMMWORD PTR [r10 + 32]\t\n"
2325 "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n"
2326 "vfmadd231ps zmm0,zmm23,zmm22\t\n"
2327 "vfmadd231ps zmm1,zmm24,zmm22\t\n"
2328 "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n"
2329 "vfmadd231ps zmm2,zmm23,zmm22\t\n"
2330 "vfmadd231ps zmm3,zmm24,zmm22\t\n"
2331 "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n"
2332 "vfmadd231ps zmm4,zmm23,zmm22\t\n"
2333 "vfmadd231ps zmm5,zmm24,zmm22\t\n"
2334 "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n"
2335 "vfmadd231ps zmm6,zmm23,zmm22\t\n"
2336 "vfmadd231ps zmm7,zmm24,zmm22\t\n"
2337 "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n"
2338 "vfmadd231ps zmm8,zmm23,zmm22\t\n"
2339 "vfmadd231ps zmm9,zmm24,zmm22\t\n"
2340 "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n"
2341 "vfmadd231ps zmm10,zmm23,zmm22\t\n"
2342 "vfmadd231ps zmm11,zmm24,zmm22\t\n"
2343 "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n"
2344 "vfmadd231ps zmm12,zmm23,zmm22\t\n"
2345 "vfmadd231ps zmm13,zmm24,zmm22\t\n"
2346 "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n"
2347 "vfmadd231ps zmm14,zmm23,zmm22\t\n"
2348 "vfmadd231ps zmm15,zmm24,zmm22\t\n"
2349 "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n"
2350 "vfmadd231ps zmm16,zmm23,zmm22\t\n"
2351 "vfmadd231ps zmm17,zmm24,zmm22\t\n"
2352 "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n"
2353 "vfmadd231ps zmm18,zmm23,zmm22\t\n"
2354 "vfmadd231ps zmm19,zmm24,zmm22\t\n"
2355 "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n"
2356 "vfmadd231ps zmm20,zmm23,zmm22\t\n"
2357 "vfmadd231ps zmm21,zmm24,zmm22\t\n"
2358 "add r9,44\t\n"
2359 "add r10,64\t\n"
2360 // Dump C
2361 "dump_C%=:\t\n"
2362 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
2363 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
2364 "add r12, r13\t\n"
2365 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
2366 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
2367 "add r12, r13\t\n"
2368 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
2369 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
2370 "add r12, r13\t\n"
2371 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
2372 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
2373 "add r12, r13\t\n"
2374 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
2375 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
2376 "add r12, r13\t\n"
2377 "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
2378 "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
2379 "add r12, r13\t\n"
2380 "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
2381 "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
2382 "add r12, r13\t\n"
2383 "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
2384 "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
2385 "add r12, r13\t\n"
2386 "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
2387 "vmovups zmmword PTR [r12 + 64], zmm17\t\n"
2388 "add r12, r13\t\n"
2389 "vmovups zmmword PTR [r12 + 0], zmm18\t\n"
2390 "vmovups zmmword PTR [r12 + 64], zmm19\t\n"
2391 "add r12, r13\t\n"
2392 "vmovups zmmword PTR [r12 + 0], zmm20\t\n"
2393 "vmovups zmmword PTR [r12 + 64], zmm21\t\n"
2394
2395 // next outer iteration
2396 "add rcx, 128\t\n"
2397 "mov r12, rcx\t\n"
2398 "mov r9, rax\t\n"
2399 "inc rbx\t\n"
2400 "cmp rbx, rdi\t\n"
2401 "jl loop_outter%=\t\n"
2402 :
2403 : [gp] "rm"(gp)
2404 : "r8",
2405 "r9",
2406 "r10",
2407 "r11",
2408 "r13",
2409 "r14",
2410 "rax",
2411 "rcx",
2412 "rsi",
2413 "rdi",
2414 "rbx",
2415 "r12",
2416 "r15",
2417 "memory");
2418}
2419void NOINLINE gemmkernel_12x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
2420 asm volatile(
2421#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
2422 "mov %[gp], %%r14\t\n"
2423 ".intel_syntax noprefix\t\n"
2424#else
2425 "mov r14, %[gp]\t\n"
2426#endif
2427
2428 // Copy parameters
2429 // k
2430 "mov r8, [r14 + 0]\t\n"
2431 "dec r8\t\n"
2432 // A
2433 "mov r9, [r14 + 8]\t\n"
2434 // B
2435 "mov r10, [r14 + 16]\t\n"
2436 // beta
2437 "lea r15, [r14 + 24]\t\n"
2438 // C
2439 "mov r12, [r14 + 32]\t\n"
2440 // ldc
2441 "mov r13, [r14 + 40]\t\n"
2442 // b_block_cols
2443 "mov rdi, [r14 + 48]\t\n"
2444 // b_block_size
2445 "mov rsi, [r14 + 56]\t\n"
2446
2447 // Make copies of A and C
2448 "mov rax, r9\t\n"
2449 "mov rcx, r12\t\n"
2450
2451 "xor ebx, ebx\t\n"
2452 "loop_outter%=:\t\n"
2453 "mov r14, r8\t\n"
2454 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
2455 "vcvtph2ps zmm25,YMMWORD PTR [r10 + 0]\t\n"
2456 "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n"
2457 "vxorps xmm0, xmm0, xmm0\t\n"
2458 "vcomiss xmm31, xmm0\t\n"
2459 "jz zero_regs%=\t\n"
2460
2461 // Setup values with beta multiplication
2462 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
2463 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
2464 "add r12, r13\t\n"
2465 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
2466 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
2467 "add r12, r13\t\n"
2468 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
2469 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
2470 "add r12, r13\t\n"
2471 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
2472 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
2473 "add r12, r13\t\n"
2474 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
2475 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
2476 "add r12, r13\t\n"
2477 "vmulps zmm10, zmm31, [r12 + 0]\t\n"
2478 "vmulps zmm11, zmm31, [r12 + 64]\t\n"
2479 "add r12, r13\t\n"
2480 "vmulps zmm12, zmm31, [r12 + 0]\t\n"
2481 "vmulps zmm13, zmm31, [r12 + 64]\t\n"
2482 "add r12, r13\t\n"
2483 "vmulps zmm14, zmm31, [r12 + 0]\t\n"
2484 "vmulps zmm15, zmm31, [r12 + 64]\t\n"
2485 "add r12, r13\t\n"
2486 "vmulps zmm16, zmm31, [r12 + 0]\t\n"
2487 "vmulps zmm17, zmm31, [r12 + 64]\t\n"
2488 "add r12, r13\t\n"
2489 "vmulps zmm18, zmm31, [r12 + 0]\t\n"
2490 "vmulps zmm19, zmm31, [r12 + 64]\t\n"
2491 "add r12, r13\t\n"
2492 "vmulps zmm20, zmm31, [r12 + 0]\t\n"
2493 "vmulps zmm21, zmm31, [r12 + 64]\t\n"
2494 "add r12, r13\t\n"
2495 "vmulps zmm22, zmm31, [r12 + 0]\t\n"
2496 "vmulps zmm23, zmm31, [r12 + 64]\t\n"
2497 "test r14,r14\t\n"
2498 "jz skip_preload%=\t\n"
2499 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2500 "skip_preload%=:\t\n"
2501 "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
2502 "vfmadd231ps zmm0,zmm25,zmm24\t\n"
2503 "vfmadd231ps zmm1,zmm26,zmm24\t\n"
2504 "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
2505 "vfmadd231ps zmm2,zmm25,zmm24\t\n"
2506 "vfmadd231ps zmm3,zmm26,zmm24\t\n"
2507 "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
2508 "vfmadd231ps zmm4,zmm25,zmm24\t\n"
2509 "vfmadd231ps zmm5,zmm26,zmm24\t\n"
2510 "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
2511 "vfmadd231ps zmm6,zmm25,zmm24\t\n"
2512 "vfmadd231ps zmm7,zmm26,zmm24\t\n"
2513 "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
2514 "vfmadd231ps zmm8,zmm25,zmm24\t\n"
2515 "vfmadd231ps zmm9,zmm26,zmm24\t\n"
2516 "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
2517 "vfmadd231ps zmm10,zmm25,zmm24\t\n"
2518 "vfmadd231ps zmm11,zmm26,zmm24\t\n"
2519 "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
2520 "vfmadd231ps zmm12,zmm25,zmm24\t\n"
2521 "vfmadd231ps zmm13,zmm26,zmm24\t\n"
2522 "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
2523 "vfmadd231ps zmm14,zmm25,zmm24\t\n"
2524 "vfmadd231ps zmm15,zmm26,zmm24\t\n"
2525 "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
2526 "vfmadd231ps zmm16,zmm25,zmm24\t\n"
2527 "vfmadd231ps zmm17,zmm26,zmm24\t\n"
2528 "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
2529 "vfmadd231ps zmm18,zmm25,zmm24\t\n"
2530 "vfmadd231ps zmm19,zmm26,zmm24\t\n"
2531 "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
2532 "vfmadd231ps zmm20,zmm25,zmm24\t\n"
2533 "vfmadd231ps zmm21,zmm26,zmm24\t\n"
2534 "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
2535 "vfmadd231ps zmm22,zmm25,zmm24\t\n"
2536 "vfmadd231ps zmm23,zmm26,zmm24\t\n"
2537 "mov r12, rcx\t\n"
2538 "test r14,r14\t\n"
2539 "jnz next_inner%=\t\n"
2540 "add r10,64\t\n"
2541 "jmp dump_C%=\t\n"
2542
2543 "zero_regs%=:\t\n"
2544
2545 "test r14,r14\t\n"
2546 "jz skip_preload_b_zero%=\t\n"
2547 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2548 "skip_preload_b_zero%=:\t\n"
2549 "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
2550 "vmulps zmm0,zmm25,zmm24\t\n"
2551 "vmulps zmm1,zmm26,zmm24\t\n"
2552 "add r12, r13\t\n"
2553 "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
2554 "vmulps zmm2,zmm25,zmm24\t\n"
2555 "vmulps zmm3,zmm26,zmm24\t\n"
2556 "add r12, r13\t\n"
2557 "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
2558 "vmulps zmm4,zmm25,zmm24\t\n"
2559 "vmulps zmm5,zmm26,zmm24\t\n"
2560 "add r12, r13\t\n"
2561 "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
2562 "vmulps zmm6,zmm25,zmm24\t\n"
2563 "vmulps zmm7,zmm26,zmm24\t\n"
2564 "add r12, r13\t\n"
2565 "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
2566 "vmulps zmm8,zmm25,zmm24\t\n"
2567 "vmulps zmm9,zmm26,zmm24\t\n"
2568 "add r12, r13\t\n"
2569 "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
2570 "vmulps zmm10,zmm25,zmm24\t\n"
2571 "vmulps zmm11,zmm26,zmm24\t\n"
2572 "add r12, r13\t\n"
2573 "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
2574 "vmulps zmm12,zmm25,zmm24\t\n"
2575 "vmulps zmm13,zmm26,zmm24\t\n"
2576 "add r12, r13\t\n"
2577 "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
2578 "vmulps zmm14,zmm25,zmm24\t\n"
2579 "vmulps zmm15,zmm26,zmm24\t\n"
2580 "add r12, r13\t\n"
2581 "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
2582 "vmulps zmm16,zmm25,zmm24\t\n"
2583 "vmulps zmm17,zmm26,zmm24\t\n"
2584 "add r12, r13\t\n"
2585 "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
2586 "vmulps zmm18,zmm25,zmm24\t\n"
2587 "vmulps zmm19,zmm26,zmm24\t\n"
2588 "add r12, r13\t\n"
2589 "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
2590 "vmulps zmm20,zmm25,zmm24\t\n"
2591 "vmulps zmm21,zmm26,zmm24\t\n"
2592 "add r12, r13\t\n"
2593 "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
2594 "vmulps zmm22,zmm25,zmm24\t\n"
2595 "vmulps zmm23,zmm26,zmm24\t\n"
2596 "mov r12, rcx\t\n"
2597 "test r14,r14\t\n"
2598 "jnz next_inner%=\t\n"
2599 "add r10,64\t\n"
2600 "jmp dump_C%=\t\n"
2601
2602 "loop_inner%=:\t\n"
2603
2604 "vmovaps zmm25,zmm31\t\n"
2605 "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n"
2606 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2607 "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
2608 "vfmadd231ps zmm0,zmm25,zmm24\t\n"
2609 "vfmadd231ps zmm1,zmm26,zmm24\t\n"
2610 "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
2611 "vfmadd231ps zmm2,zmm25,zmm24\t\n"
2612 "vfmadd231ps zmm3,zmm26,zmm24\t\n"
2613 "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
2614 "vfmadd231ps zmm4,zmm25,zmm24\t\n"
2615 "vfmadd231ps zmm5,zmm26,zmm24\t\n"
2616 "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
2617 "vfmadd231ps zmm6,zmm25,zmm24\t\n"
2618 "vfmadd231ps zmm7,zmm26,zmm24\t\n"
2619 "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
2620 "vfmadd231ps zmm8,zmm25,zmm24\t\n"
2621 "vfmadd231ps zmm9,zmm26,zmm24\t\n"
2622 "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
2623 "vfmadd231ps zmm10,zmm25,zmm24\t\n"
2624 "vfmadd231ps zmm11,zmm26,zmm24\t\n"
2625 "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
2626 "vfmadd231ps zmm12,zmm25,zmm24\t\n"
2627 "vfmadd231ps zmm13,zmm26,zmm24\t\n"
2628 "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
2629 "vfmadd231ps zmm14,zmm25,zmm24\t\n"
2630 "vfmadd231ps zmm15,zmm26,zmm24\t\n"
2631 "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
2632 "vfmadd231ps zmm16,zmm25,zmm24\t\n"
2633 "vfmadd231ps zmm17,zmm26,zmm24\t\n"
2634 "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
2635 "vfmadd231ps zmm18,zmm25,zmm24\t\n"
2636 "vfmadd231ps zmm19,zmm26,zmm24\t\n"
2637 "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
2638 "vfmadd231ps zmm20,zmm25,zmm24\t\n"
2639 "vfmadd231ps zmm21,zmm26,zmm24\t\n"
2640 "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
2641 "vfmadd231ps zmm22,zmm25,zmm24\t\n"
2642 "vfmadd231ps zmm23,zmm26,zmm24\t\n"
2643
2644 "next_inner%=:\t\n"
2645 "add r9,48\t\n"
2646 "add r10,64\t\n"
2647 "dec r14\t\n"
2648 "jnz loop_inner%=\t\n"
2649
2650 "vmovaps zmm25,zmm31\t\n"
2651 "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n"
2652 "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n"
2653 "vfmadd231ps zmm0,zmm25,zmm24\t\n"
2654 "vfmadd231ps zmm1,zmm26,zmm24\t\n"
2655 "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n"
2656 "vfmadd231ps zmm2,zmm25,zmm24\t\n"
2657 "vfmadd231ps zmm3,zmm26,zmm24\t\n"
2658 "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n"
2659 "vfmadd231ps zmm4,zmm25,zmm24\t\n"
2660 "vfmadd231ps zmm5,zmm26,zmm24\t\n"
2661 "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n"
2662 "vfmadd231ps zmm6,zmm25,zmm24\t\n"
2663 "vfmadd231ps zmm7,zmm26,zmm24\t\n"
2664 "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n"
2665 "vfmadd231ps zmm8,zmm25,zmm24\t\n"
2666 "vfmadd231ps zmm9,zmm26,zmm24\t\n"
2667 "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n"
2668 "vfmadd231ps zmm10,zmm25,zmm24\t\n"
2669 "vfmadd231ps zmm11,zmm26,zmm24\t\n"
2670 "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n"
2671 "vfmadd231ps zmm12,zmm25,zmm24\t\n"
2672 "vfmadd231ps zmm13,zmm26,zmm24\t\n"
2673 "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n"
2674 "vfmadd231ps zmm14,zmm25,zmm24\t\n"
2675 "vfmadd231ps zmm15,zmm26,zmm24\t\n"
2676 "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n"
2677 "vfmadd231ps zmm16,zmm25,zmm24\t\n"
2678 "vfmadd231ps zmm17,zmm26,zmm24\t\n"
2679 "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n"
2680 "vfmadd231ps zmm18,zmm25,zmm24\t\n"
2681 "vfmadd231ps zmm19,zmm26,zmm24\t\n"
2682 "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n"
2683 "vfmadd231ps zmm20,zmm25,zmm24\t\n"
2684 "vfmadd231ps zmm21,zmm26,zmm24\t\n"
2685 "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n"
2686 "vfmadd231ps zmm22,zmm25,zmm24\t\n"
2687 "vfmadd231ps zmm23,zmm26,zmm24\t\n"
2688 "add r9,48\t\n"
2689 "add r10,64\t\n"
2690 // Dump C
2691 "dump_C%=:\t\n"
2692 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
2693 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
2694 "add r12, r13\t\n"
2695 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
2696 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
2697 "add r12, r13\t\n"
2698 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
2699 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
2700 "add r12, r13\t\n"
2701 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
2702 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
2703 "add r12, r13\t\n"
2704 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
2705 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
2706 "add r12, r13\t\n"
2707 "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
2708 "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
2709 "add r12, r13\t\n"
2710 "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
2711 "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
2712 "add r12, r13\t\n"
2713 "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
2714 "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
2715 "add r12, r13\t\n"
2716 "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
2717 "vmovups zmmword PTR [r12 + 64], zmm17\t\n"
2718 "add r12, r13\t\n"
2719 "vmovups zmmword PTR [r12 + 0], zmm18\t\n"
2720 "vmovups zmmword PTR [r12 + 64], zmm19\t\n"
2721 "add r12, r13\t\n"
2722 "vmovups zmmword PTR [r12 + 0], zmm20\t\n"
2723 "vmovups zmmword PTR [r12 + 64], zmm21\t\n"
2724 "add r12, r13\t\n"
2725 "vmovups zmmword PTR [r12 + 0], zmm22\t\n"
2726 "vmovups zmmword PTR [r12 + 64], zmm23\t\n"
2727
2728 // next outer iteration
2729 "add rcx, 128\t\n"
2730 "mov r12, rcx\t\n"
2731 "mov r9, rax\t\n"
2732 "inc rbx\t\n"
2733 "cmp rbx, rdi\t\n"
2734 "jl loop_outter%=\t\n"
2735 :
2736 : [gp] "rm"(gp)
2737 : "r8",
2738 "r9",
2739 "r10",
2740 "r11",
2741 "r13",
2742 "r14",
2743 "rax",
2744 "rcx",
2745 "rsi",
2746 "rdi",
2747 "rbx",
2748 "r12",
2749 "r15",
2750 "memory");
2751}
2752void NOINLINE gemmkernel_13x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
2753 asm volatile(
2754#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
2755 "mov %[gp], %%r14\t\n"
2756 ".intel_syntax noprefix\t\n"
2757#else
2758 "mov r14, %[gp]\t\n"
2759#endif
2760
2761 // Copy parameters
2762 // k
2763 "mov r8, [r14 + 0]\t\n"
2764 "dec r8\t\n"
2765 // A
2766 "mov r9, [r14 + 8]\t\n"
2767 // B
2768 "mov r10, [r14 + 16]\t\n"
2769 // beta
2770 "lea r15, [r14 + 24]\t\n"
2771 // C
2772 "mov r12, [r14 + 32]\t\n"
2773 // ldc
2774 "mov r13, [r14 + 40]\t\n"
2775 // b_block_cols
2776 "mov rdi, [r14 + 48]\t\n"
2777 // b_block_size
2778 "mov rsi, [r14 + 56]\t\n"
2779
2780 // Make copies of A and C
2781 "mov rax, r9\t\n"
2782 "mov rcx, r12\t\n"
2783
2784 "xor ebx, ebx\t\n"
2785 "loop_outter%=:\t\n"
2786 "mov r14, r8\t\n"
2787 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
2788 "vcvtph2ps zmm27,YMMWORD PTR [r10 + 0]\t\n"
2789 "vcvtph2ps zmm28,YMMWORD PTR [r10 + 32]\t\n"
2790 "vxorps xmm0, xmm0, xmm0\t\n"
2791 "vcomiss xmm31, xmm0\t\n"
2792 "jz zero_regs%=\t\n"
2793
2794 // Setup values with beta multiplication
2795 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
2796 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
2797 "add r12, r13\t\n"
2798 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
2799 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
2800 "add r12, r13\t\n"
2801 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
2802 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
2803 "add r12, r13\t\n"
2804 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
2805 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
2806 "add r12, r13\t\n"
2807 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
2808 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
2809 "add r12, r13\t\n"
2810 "vmulps zmm10, zmm31, [r12 + 0]\t\n"
2811 "vmulps zmm11, zmm31, [r12 + 64]\t\n"
2812 "add r12, r13\t\n"
2813 "vmulps zmm12, zmm31, [r12 + 0]\t\n"
2814 "vmulps zmm13, zmm31, [r12 + 64]\t\n"
2815 "add r12, r13\t\n"
2816 "vmulps zmm14, zmm31, [r12 + 0]\t\n"
2817 "vmulps zmm15, zmm31, [r12 + 64]\t\n"
2818 "add r12, r13\t\n"
2819 "vmulps zmm16, zmm31, [r12 + 0]\t\n"
2820 "vmulps zmm17, zmm31, [r12 + 64]\t\n"
2821 "add r12, r13\t\n"
2822 "vmulps zmm18, zmm31, [r12 + 0]\t\n"
2823 "vmulps zmm19, zmm31, [r12 + 64]\t\n"
2824 "add r12, r13\t\n"
2825 "vmulps zmm20, zmm31, [r12 + 0]\t\n"
2826 "vmulps zmm21, zmm31, [r12 + 64]\t\n"
2827 "add r12, r13\t\n"
2828 "vmulps zmm22, zmm31, [r12 + 0]\t\n"
2829 "vmulps zmm23, zmm31, [r12 + 64]\t\n"
2830 "add r12, r13\t\n"
2831 "vmulps zmm24, zmm31, [r12 + 0]\t\n"
2832 "vmulps zmm25, zmm31, [r12 + 64]\t\n"
2833 "test r14,r14\t\n"
2834 "jz skip_preload%=\t\n"
2835 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2836 "skip_preload%=:\t\n"
2837 "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n"
2838 "vfmadd231ps zmm0,zmm27,zmm26\t\n"
2839 "vfmadd231ps zmm1,zmm28,zmm26\t\n"
2840 "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n"
2841 "vfmadd231ps zmm2,zmm27,zmm26\t\n"
2842 "vfmadd231ps zmm3,zmm28,zmm26\t\n"
2843 "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n"
2844 "vfmadd231ps zmm4,zmm27,zmm26\t\n"
2845 "vfmadd231ps zmm5,zmm28,zmm26\t\n"
2846 "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n"
2847 "vfmadd231ps zmm6,zmm27,zmm26\t\n"
2848 "vfmadd231ps zmm7,zmm28,zmm26\t\n"
2849 "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n"
2850 "vfmadd231ps zmm8,zmm27,zmm26\t\n"
2851 "vfmadd231ps zmm9,zmm28,zmm26\t\n"
2852 "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n"
2853 "vfmadd231ps zmm10,zmm27,zmm26\t\n"
2854 "vfmadd231ps zmm11,zmm28,zmm26\t\n"
2855 "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n"
2856 "vfmadd231ps zmm12,zmm27,zmm26\t\n"
2857 "vfmadd231ps zmm13,zmm28,zmm26\t\n"
2858 "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n"
2859 "vfmadd231ps zmm14,zmm27,zmm26\t\n"
2860 "vfmadd231ps zmm15,zmm28,zmm26\t\n"
2861 "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n"
2862 "vfmadd231ps zmm16,zmm27,zmm26\t\n"
2863 "vfmadd231ps zmm17,zmm28,zmm26\t\n"
2864 "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n"
2865 "vfmadd231ps zmm18,zmm27,zmm26\t\n"
2866 "vfmadd231ps zmm19,zmm28,zmm26\t\n"
2867 "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n"
2868 "vfmadd231ps zmm20,zmm27,zmm26\t\n"
2869 "vfmadd231ps zmm21,zmm28,zmm26\t\n"
2870 "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n"
2871 "vfmadd231ps zmm22,zmm27,zmm26\t\n"
2872 "vfmadd231ps zmm23,zmm28,zmm26\t\n"
2873 "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n"
2874 "vfmadd231ps zmm24,zmm27,zmm26\t\n"
2875 "vfmadd231ps zmm25,zmm28,zmm26\t\n"
2876 "mov r12, rcx\t\n"
2877 "test r14,r14\t\n"
2878 "jnz next_inner%=\t\n"
2879 "add r10,64\t\n"
2880 "jmp dump_C%=\t\n"
2881
2882 "zero_regs%=:\t\n"
2883
2884 "test r14,r14\t\n"
2885 "jz skip_preload_b_zero%=\t\n"
2886 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2887 "skip_preload_b_zero%=:\t\n"
2888 "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n"
2889 "vmulps zmm0,zmm27,zmm26\t\n"
2890 "vmulps zmm1,zmm28,zmm26\t\n"
2891 "add r12, r13\t\n"
2892 "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n"
2893 "vmulps zmm2,zmm27,zmm26\t\n"
2894 "vmulps zmm3,zmm28,zmm26\t\n"
2895 "add r12, r13\t\n"
2896 "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n"
2897 "vmulps zmm4,zmm27,zmm26\t\n"
2898 "vmulps zmm5,zmm28,zmm26\t\n"
2899 "add r12, r13\t\n"
2900 "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n"
2901 "vmulps zmm6,zmm27,zmm26\t\n"
2902 "vmulps zmm7,zmm28,zmm26\t\n"
2903 "add r12, r13\t\n"
2904 "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n"
2905 "vmulps zmm8,zmm27,zmm26\t\n"
2906 "vmulps zmm9,zmm28,zmm26\t\n"
2907 "add r12, r13\t\n"
2908 "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n"
2909 "vmulps zmm10,zmm27,zmm26\t\n"
2910 "vmulps zmm11,zmm28,zmm26\t\n"
2911 "add r12, r13\t\n"
2912 "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n"
2913 "vmulps zmm12,zmm27,zmm26\t\n"
2914 "vmulps zmm13,zmm28,zmm26\t\n"
2915 "add r12, r13\t\n"
2916 "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n"
2917 "vmulps zmm14,zmm27,zmm26\t\n"
2918 "vmulps zmm15,zmm28,zmm26\t\n"
2919 "add r12, r13\t\n"
2920 "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n"
2921 "vmulps zmm16,zmm27,zmm26\t\n"
2922 "vmulps zmm17,zmm28,zmm26\t\n"
2923 "add r12, r13\t\n"
2924 "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n"
2925 "vmulps zmm18,zmm27,zmm26\t\n"
2926 "vmulps zmm19,zmm28,zmm26\t\n"
2927 "add r12, r13\t\n"
2928 "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n"
2929 "vmulps zmm20,zmm27,zmm26\t\n"
2930 "vmulps zmm21,zmm28,zmm26\t\n"
2931 "add r12, r13\t\n"
2932 "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n"
2933 "vmulps zmm22,zmm27,zmm26\t\n"
2934 "vmulps zmm23,zmm28,zmm26\t\n"
2935 "add r12, r13\t\n"
2936 "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n"
2937 "vmulps zmm24,zmm27,zmm26\t\n"
2938 "vmulps zmm25,zmm28,zmm26\t\n"
2939 "mov r12, rcx\t\n"
2940 "test r14,r14\t\n"
2941 "jnz next_inner%=\t\n"
2942 "add r10,64\t\n"
2943 "jmp dump_C%=\t\n"
2944
2945 "loop_inner%=:\t\n"
2946
2947 "vmovaps zmm27,zmm31\t\n"
2948 "vcvtph2ps zmm28,YMMWORD PTR [r10 + 32]\t\n"
2949 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
2950 "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n"
2951 "vfmadd231ps zmm0,zmm27,zmm26\t\n"
2952 "vfmadd231ps zmm1,zmm28,zmm26\t\n"
2953 "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n"
2954 "vfmadd231ps zmm2,zmm27,zmm26\t\n"
2955 "vfmadd231ps zmm3,zmm28,zmm26\t\n"
2956 "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n"
2957 "vfmadd231ps zmm4,zmm27,zmm26\t\n"
2958 "vfmadd231ps zmm5,zmm28,zmm26\t\n"
2959 "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n"
2960 "vfmadd231ps zmm6,zmm27,zmm26\t\n"
2961 "vfmadd231ps zmm7,zmm28,zmm26\t\n"
2962 "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n"
2963 "vfmadd231ps zmm8,zmm27,zmm26\t\n"
2964 "vfmadd231ps zmm9,zmm28,zmm26\t\n"
2965 "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n"
2966 "vfmadd231ps zmm10,zmm27,zmm26\t\n"
2967 "vfmadd231ps zmm11,zmm28,zmm26\t\n"
2968 "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n"
2969 "vfmadd231ps zmm12,zmm27,zmm26\t\n"
2970 "vfmadd231ps zmm13,zmm28,zmm26\t\n"
2971 "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n"
2972 "vfmadd231ps zmm14,zmm27,zmm26\t\n"
2973 "vfmadd231ps zmm15,zmm28,zmm26\t\n"
2974 "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n"
2975 "vfmadd231ps zmm16,zmm27,zmm26\t\n"
2976 "vfmadd231ps zmm17,zmm28,zmm26\t\n"
2977 "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n"
2978 "vfmadd231ps zmm18,zmm27,zmm26\t\n"
2979 "vfmadd231ps zmm19,zmm28,zmm26\t\n"
2980 "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n"
2981 "vfmadd231ps zmm20,zmm27,zmm26\t\n"
2982 "vfmadd231ps zmm21,zmm28,zmm26\t\n"
2983 "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n"
2984 "vfmadd231ps zmm22,zmm27,zmm26\t\n"
2985 "vfmadd231ps zmm23,zmm28,zmm26\t\n"
2986 "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n"
2987 "vfmadd231ps zmm24,zmm27,zmm26\t\n"
2988 "vfmadd231ps zmm25,zmm28,zmm26\t\n"
2989
2990 "next_inner%=:\t\n"
2991 "add r9,52\t\n"
2992 "add r10,64\t\n"
2993 "dec r14\t\n"
2994 "jnz loop_inner%=\t\n"
2995
2996 "vmovaps zmm27,zmm31\t\n"
2997 "vcvtph2ps zmm28,YMMWORD PTR [r10 + 32]\t\n"
2998 "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n"
2999 "vfmadd231ps zmm0,zmm27,zmm26\t\n"
3000 "vfmadd231ps zmm1,zmm28,zmm26\t\n"
3001 "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n"
3002 "vfmadd231ps zmm2,zmm27,zmm26\t\n"
3003 "vfmadd231ps zmm3,zmm28,zmm26\t\n"
3004 "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n"
3005 "vfmadd231ps zmm4,zmm27,zmm26\t\n"
3006 "vfmadd231ps zmm5,zmm28,zmm26\t\n"
3007 "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n"
3008 "vfmadd231ps zmm6,zmm27,zmm26\t\n"
3009 "vfmadd231ps zmm7,zmm28,zmm26\t\n"
3010 "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n"
3011 "vfmadd231ps zmm8,zmm27,zmm26\t\n"
3012 "vfmadd231ps zmm9,zmm28,zmm26\t\n"
3013 "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n"
3014 "vfmadd231ps zmm10,zmm27,zmm26\t\n"
3015 "vfmadd231ps zmm11,zmm28,zmm26\t\n"
3016 "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n"
3017 "vfmadd231ps zmm12,zmm27,zmm26\t\n"
3018 "vfmadd231ps zmm13,zmm28,zmm26\t\n"
3019 "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n"
3020 "vfmadd231ps zmm14,zmm27,zmm26\t\n"
3021 "vfmadd231ps zmm15,zmm28,zmm26\t\n"
3022 "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n"
3023 "vfmadd231ps zmm16,zmm27,zmm26\t\n"
3024 "vfmadd231ps zmm17,zmm28,zmm26\t\n"
3025 "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n"
3026 "vfmadd231ps zmm18,zmm27,zmm26\t\n"
3027 "vfmadd231ps zmm19,zmm28,zmm26\t\n"
3028 "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n"
3029 "vfmadd231ps zmm20,zmm27,zmm26\t\n"
3030 "vfmadd231ps zmm21,zmm28,zmm26\t\n"
3031 "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n"
3032 "vfmadd231ps zmm22,zmm27,zmm26\t\n"
3033 "vfmadd231ps zmm23,zmm28,zmm26\t\n"
3034 "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n"
3035 "vfmadd231ps zmm24,zmm27,zmm26\t\n"
3036 "vfmadd231ps zmm25,zmm28,zmm26\t\n"
3037 "add r9,52\t\n"
3038 "add r10,64\t\n"
3039 // Dump C
3040 "dump_C%=:\t\n"
3041 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
3042 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
3043 "add r12, r13\t\n"
3044 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
3045 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
3046 "add r12, r13\t\n"
3047 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
3048 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
3049 "add r12, r13\t\n"
3050 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
3051 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
3052 "add r12, r13\t\n"
3053 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
3054 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
3055 "add r12, r13\t\n"
3056 "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
3057 "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
3058 "add r12, r13\t\n"
3059 "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
3060 "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
3061 "add r12, r13\t\n"
3062 "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
3063 "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
3064 "add r12, r13\t\n"
3065 "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
3066 "vmovups zmmword PTR [r12 + 64], zmm17\t\n"
3067 "add r12, r13\t\n"
3068 "vmovups zmmword PTR [r12 + 0], zmm18\t\n"
3069 "vmovups zmmword PTR [r12 + 64], zmm19\t\n"
3070 "add r12, r13\t\n"
3071 "vmovups zmmword PTR [r12 + 0], zmm20\t\n"
3072 "vmovups zmmword PTR [r12 + 64], zmm21\t\n"
3073 "add r12, r13\t\n"
3074 "vmovups zmmword PTR [r12 + 0], zmm22\t\n"
3075 "vmovups zmmword PTR [r12 + 64], zmm23\t\n"
3076 "add r12, r13\t\n"
3077 "vmovups zmmword PTR [r12 + 0], zmm24\t\n"
3078 "vmovups zmmword PTR [r12 + 64], zmm25\t\n"
3079
3080 // next outer iteration
3081 "add rcx, 128\t\n"
3082 "mov r12, rcx\t\n"
3083 "mov r9, rax\t\n"
3084 "inc rbx\t\n"
3085 "cmp rbx, rdi\t\n"
3086 "jl loop_outter%=\t\n"
3087 :
3088 : [gp] "rm"(gp)
3089 : "r8",
3090 "r9",
3091 "r10",
3092 "r11",
3093 "r13",
3094 "r14",
3095 "rax",
3096 "rcx",
3097 "rsi",
3098 "rdi",
3099 "rbx",
3100 "r12",
3101 "r15",
3102 "memory");
3103}
// AVX-512 micro-kernel: for each column block it computes a tile of
// 14 rows x 32 floats of C as C = beta * C + A * B. B is read as half
// precision and widened with vcvtph2ps; A, beta and C are read/written as
// single precision.
//
// Fields are read from *gp at fixed byte offsets: k (+0), A (+8), B (+16),
// beta (+24, read as one float), C (+32), ldc (+40), b_block_cols (+48) and
// b_block_size (+56, loaded into rsi but not used by the code below).
//
// Data layout implied by the addressing in the assembly:
//  - A: 14 consecutive floats per k step (r9 advances by 56 bytes per step);
//    the A pointer is rewound for every column block.
//  - B: 32 consecutive halfs (64 bytes) per k step; the B pointer is never
//    rewound, so each column block continues where the previous one stopped.
//  - C: rows are ldc apart, with ldc added directly to the byte address (so
//    it is used as a stride in bytes); column blocks are 128 bytes apart.
//
// Register roles: r8 = k - 1, r9 = A cursor, r10 = B cursor, r12 = C cursor,
// r13 = ldc, r14 = k-step down counter, r15 = &beta, rax/rcx = saved A / C
// block base, rbx = column block index, rdi = b_block_cols.
// zmm0-zmm27 = accumulators (row i lives in zmm(2i), zmm(2i+1)),
// zmm28 = broadcast A value, zmm29/zmm30 = current B row, zmm31 = beta and
// afterwards the preloaded first half of the next B row.
//
// Both loops are shaped as do-while: the first k step and the first column
// block always execute. k == 0 is not handled (dec wraps the counter), and
// one column block is processed even if b_block_cols is 0.
//
// NOTE(review): the clobber list names general-purpose registers and
// "memory" only; zmm0-zmm31 are written but not listed, and r11 is listed but
// not referenced. Presumably this relies on the function being NOINLINE with
// the asm as its whole body - confirm before changing.
// NOTE(review): the outer loop uses jl, a signed comparison, on
// b_block_cols - confirm the field's type against the struct definition.
3104void NOINLINE gemmkernel_14x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
3105 asm volatile(
3106#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
3107 "mov %[gp], %%r14\t\n"
3108 ".intel_syntax noprefix\t\n"
3109#else
3110 "mov r14, %[gp]\t\n"
3111#endif
3112
3113 // Copy parameters
3114 // k
     // r8 = k - 1: number of k steps left after the peeled first step.
3115 "mov r8, [r14 + 0]\t\n"
3116 "dec r8\t\n"
3117 // A
3118 "mov r9, [r14 + 8]\t\n"
3119 // B
3120 "mov r10, [r14 + 16]\t\n"
3121 // beta
     // Only the address is taken here; the value is broadcast per block.
3122 "lea r15, [r14 + 24]\t\n"
3123 // C
3124 "mov r12, [r14 + 32]\t\n"
3125 // ldc
3126 "mov r13, [r14 + 40]\t\n"
3127 // b_block_cols
3128 "mov rdi, [r14 + 48]\t\n"
3129 // b_block_size
3130 "mov rsi, [r14 + 56]\t\n"
3131
3132 // Make copies of A and C
     // rax rewinds A for each column block; rcx tracks the C block base.
3133 "mov rax, r9\t\n"
3134 "mov rcx, r12\t\n"
3135
     // rbx = index of the current column block.
3136 "xor ebx, ebx\t\n"
3137 "loop_outter%=:\t\n"
     // Reload the k-step counter, broadcast beta and convert the first B row
     // (2 x 16 halfs) to 32 floats in zmm29/zmm30.
3138 "mov r14, r8\t\n"
3139 "vbroadcastss zmm31,DWORD PTR [r15]\t\n"
3140 "vcvtph2ps zmm29,YMMWORD PTR [r10 + 0]\t\n"
3141 "vcvtph2ps zmm30,YMMWORD PTR [r10 + 32]\t\n"
     // Compare beta with 0.0 and take the path that does not read C when
     // ZF is set (vcomiss also sets ZF for an unordered, i.e. NaN, compare).
3142 "vxorps xmm0, xmm0, xmm0\t\n"
3143 "vcomiss xmm31, xmm0\t\n"
3144 "jz zero_regs%=\t\n"
3145
3146 // Setup values with beta multiplication
     // Accumulators = beta * C tile; r12 walks down the 14 rows.
3147 "vmulps zmm0, zmm31, [r12 + 0]\t\n"
3148 "vmulps zmm1, zmm31, [r12 + 64]\t\n"
3149 "add r12, r13\t\n"
3150 "vmulps zmm2, zmm31, [r12 + 0]\t\n"
3151 "vmulps zmm3, zmm31, [r12 + 64]\t\n"
3152 "add r12, r13\t\n"
3153 "vmulps zmm4, zmm31, [r12 + 0]\t\n"
3154 "vmulps zmm5, zmm31, [r12 + 64]\t\n"
3155 "add r12, r13\t\n"
3156 "vmulps zmm6, zmm31, [r12 + 0]\t\n"
3157 "vmulps zmm7, zmm31, [r12 + 64]\t\n"
3158 "add r12, r13\t\n"
3159 "vmulps zmm8, zmm31, [r12 + 0]\t\n"
3160 "vmulps zmm9, zmm31, [r12 + 64]\t\n"
3161 "add r12, r13\t\n"
3162 "vmulps zmm10, zmm31, [r12 + 0]\t\n"
3163 "vmulps zmm11, zmm31, [r12 + 64]\t\n"
3164 "add r12, r13\t\n"
3165 "vmulps zmm12, zmm31, [r12 + 0]\t\n"
3166 "vmulps zmm13, zmm31, [r12 + 64]\t\n"
3167 "add r12, r13\t\n"
3168 "vmulps zmm14, zmm31, [r12 + 0]\t\n"
3169 "vmulps zmm15, zmm31, [r12 + 64]\t\n"
3170 "add r12, r13\t\n"
3171 "vmulps zmm16, zmm31, [r12 + 0]\t\n"
3172 "vmulps zmm17, zmm31, [r12 + 64]\t\n"
3173 "add r12, r13\t\n"
3174 "vmulps zmm18, zmm31, [r12 + 0]\t\n"
3175 "vmulps zmm19, zmm31, [r12 + 64]\t\n"
3176 "add r12, r13\t\n"
3177 "vmulps zmm20, zmm31, [r12 + 0]\t\n"
3178 "vmulps zmm21, zmm31, [r12 + 64]\t\n"
3179 "add r12, r13\t\n"
3180 "vmulps zmm22, zmm31, [r12 + 0]\t\n"
3181 "vmulps zmm23, zmm31, [r12 + 64]\t\n"
3182 "add r12, r13\t\n"
3183 "vmulps zmm24, zmm31, [r12 + 0]\t\n"
3184 "vmulps zmm25, zmm31, [r12 + 64]\t\n"
3185 "add r12, r13\t\n"
3186 "vmulps zmm26, zmm31, [r12 + 0]\t\n"
3187 "vmulps zmm27, zmm31, [r12 + 64]\t\n"
     // beta is no longer needed: if more k steps follow, reuse zmm31 to
     // preload the first half of the next B row.
3188 "test r14,r14\t\n"
3189 "jz skip_preload%=\t\n"
3190 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
3191 "skip_preload%=:\t\n"
     // First k step: for each of the 14 rows, broadcast one A value and
     // accumulate it times the current B row.
3192 "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n"
3193 "vfmadd231ps zmm0,zmm29,zmm28\t\n"
3194 "vfmadd231ps zmm1,zmm30,zmm28\t\n"
3195 "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n"
3196 "vfmadd231ps zmm2,zmm29,zmm28\t\n"
3197 "vfmadd231ps zmm3,zmm30,zmm28\t\n"
3198 "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n"
3199 "vfmadd231ps zmm4,zmm29,zmm28\t\n"
3200 "vfmadd231ps zmm5,zmm30,zmm28\t\n"
3201 "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n"
3202 "vfmadd231ps zmm6,zmm29,zmm28\t\n"
3203 "vfmadd231ps zmm7,zmm30,zmm28\t\n"
3204 "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n"
3205 "vfmadd231ps zmm8,zmm29,zmm28\t\n"
3206 "vfmadd231ps zmm9,zmm30,zmm28\t\n"
3207 "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n"
3208 "vfmadd231ps zmm10,zmm29,zmm28\t\n"
3209 "vfmadd231ps zmm11,zmm30,zmm28\t\n"
3210 "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n"
3211 "vfmadd231ps zmm12,zmm29,zmm28\t\n"
3212 "vfmadd231ps zmm13,zmm30,zmm28\t\n"
3213 "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n"
3214 "vfmadd231ps zmm14,zmm29,zmm28\t\n"
3215 "vfmadd231ps zmm15,zmm30,zmm28\t\n"
3216 "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n"
3217 "vfmadd231ps zmm16,zmm29,zmm28\t\n"
3218 "vfmadd231ps zmm17,zmm30,zmm28\t\n"
3219 "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n"
3220 "vfmadd231ps zmm18,zmm29,zmm28\t\n"
3221 "vfmadd231ps zmm19,zmm30,zmm28\t\n"
3222 "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n"
3223 "vfmadd231ps zmm20,zmm29,zmm28\t\n"
3224 "vfmadd231ps zmm21,zmm30,zmm28\t\n"
3225 "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n"
3226 "vfmadd231ps zmm22,zmm29,zmm28\t\n"
3227 "vfmadd231ps zmm23,zmm30,zmm28\t\n"
3228 "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n"
3229 "vfmadd231ps zmm24,zmm29,zmm28\t\n"
3230 "vfmadd231ps zmm25,zmm30,zmm28\t\n"
3231 "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n"
3232 "vfmadd231ps zmm26,zmm29,zmm28\t\n"
3233 "vfmadd231ps zmm27,zmm30,zmm28\t\n"
     // Rewind r12 to the first row of the tile for the store phase.
3234 "mov r12, rcx\t\n"
     // More k steps left -> continue with the pointer advance at next_inner.
     // Otherwise (k == 1) step B past this row and store the tile.
3235 "test r14,r14\t\n"
3236 "jnz next_inner%=\t\n"
3237 "add r10,64\t\n"
3238 "jmp dump_C%=\t\n"
3239
     // beta == 0 path: C is not read; the accumulators are initialised with
     // the product of the first k step (vmulps instead of vfmadd231ps).
3240 "zero_regs%=:\t\n"
3241
3242 "test r14,r14\t\n"
3243 "jz skip_preload_b_zero%=\t\n"
3244 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
3245 "skip_preload_b_zero%=:\t\n"
     // r12 is still advanced per row in this path but never dereferenced;
     // it is reset from rcx once the 14 rows are done.
3246 "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n"
3247 "vmulps zmm0,zmm29,zmm28\t\n"
3248 "vmulps zmm1,zmm30,zmm28\t\n"
3249 "add r12, r13\t\n"
3250 "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n"
3251 "vmulps zmm2,zmm29,zmm28\t\n"
3252 "vmulps zmm3,zmm30,zmm28\t\n"
3253 "add r12, r13\t\n"
3254 "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n"
3255 "vmulps zmm4,zmm29,zmm28\t\n"
3256 "vmulps zmm5,zmm30,zmm28\t\n"
3257 "add r12, r13\t\n"
3258 "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n"
3259 "vmulps zmm6,zmm29,zmm28\t\n"
3260 "vmulps zmm7,zmm30,zmm28\t\n"
3261 "add r12, r13\t\n"
3262 "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n"
3263 "vmulps zmm8,zmm29,zmm28\t\n"
3264 "vmulps zmm9,zmm30,zmm28\t\n"
3265 "add r12, r13\t\n"
3266 "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n"
3267 "vmulps zmm10,zmm29,zmm28\t\n"
3268 "vmulps zmm11,zmm30,zmm28\t\n"
3269 "add r12, r13\t\n"
3270 "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n"
3271 "vmulps zmm12,zmm29,zmm28\t\n"
3272 "vmulps zmm13,zmm30,zmm28\t\n"
3273 "add r12, r13\t\n"
3274 "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n"
3275 "vmulps zmm14,zmm29,zmm28\t\n"
3276 "vmulps zmm15,zmm30,zmm28\t\n"
3277 "add r12, r13\t\n"
3278 "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n"
3279 "vmulps zmm16,zmm29,zmm28\t\n"
3280 "vmulps zmm17,zmm30,zmm28\t\n"
3281 "add r12, r13\t\n"
3282 "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n"
3283 "vmulps zmm18,zmm29,zmm28\t\n"
3284 "vmulps zmm19,zmm30,zmm28\t\n"
3285 "add r12, r13\t\n"
3286 "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n"
3287 "vmulps zmm20,zmm29,zmm28\t\n"
3288 "vmulps zmm21,zmm30,zmm28\t\n"
3289 "add r12, r13\t\n"
3290 "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n"
3291 "vmulps zmm22,zmm29,zmm28\t\n"
3292 "vmulps zmm23,zmm30,zmm28\t\n"
3293 "add r12, r13\t\n"
3294 "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n"
3295 "vmulps zmm24,zmm29,zmm28\t\n"
3296 "vmulps zmm25,zmm30,zmm28\t\n"
3297 "add r12, r13\t\n"
3298 "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n"
3299 "vmulps zmm26,zmm29,zmm28\t\n"
3300 "vmulps zmm27,zmm30,zmm28\t\n"
3301 "mov r12, rcx\t\n"
3302 "test r14,r14\t\n"
3303 "jnz next_inner%=\t\n"
3304 "add r10,64\t\n"
3305 "jmp dump_C%=\t\n"
3306
     // Steady-state k step (every step except the first and the last).
3307 "loop_inner%=:\t\n"
3308
     // zmm29 <- first half of this B row (preloaded during the previous
     // step), zmm30 <- second half of this row, zmm31 <- first half of the
     // next row.
3309 "vmovaps zmm29,zmm31\t\n"
3310 "vcvtph2ps zmm30,YMMWORD PTR [r10 + 32]\t\n"
3311 "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n"
3312 "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n"
3313 "vfmadd231ps zmm0,zmm29,zmm28\t\n"
3314 "vfmadd231ps zmm1,zmm30,zmm28\t\n"
3315 "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n"
3316 "vfmadd231ps zmm2,zmm29,zmm28\t\n"
3317 "vfmadd231ps zmm3,zmm30,zmm28\t\n"
3318 "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n"
3319 "vfmadd231ps zmm4,zmm29,zmm28\t\n"
3320 "vfmadd231ps zmm5,zmm30,zmm28\t\n"
3321 "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n"
3322 "vfmadd231ps zmm6,zmm29,zmm28\t\n"
3323 "vfmadd231ps zmm7,zmm30,zmm28\t\n"
3324 "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n"
3325 "vfmadd231ps zmm8,zmm29,zmm28\t\n"
3326 "vfmadd231ps zmm9,zmm30,zmm28\t\n"
3327 "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n"
3328 "vfmadd231ps zmm10,zmm29,zmm28\t\n"
3329 "vfmadd231ps zmm11,zmm30,zmm28\t\n"
3330 "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n"
3331 "vfmadd231ps zmm12,zmm29,zmm28\t\n"
3332 "vfmadd231ps zmm13,zmm30,zmm28\t\n"
3333 "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n"
3334 "vfmadd231ps zmm14,zmm29,zmm28\t\n"
3335 "vfmadd231ps zmm15,zmm30,zmm28\t\n"
3336 "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n"
3337 "vfmadd231ps zmm16,zmm29,zmm28\t\n"
3338 "vfmadd231ps zmm17,zmm30,zmm28\t\n"
3339 "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n"
3340 "vfmadd231ps zmm18,zmm29,zmm28\t\n"
3341 "vfmadd231ps zmm19,zmm30,zmm28\t\n"
3342 "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n"
3343 "vfmadd231ps zmm20,zmm29,zmm28\t\n"
3344 "vfmadd231ps zmm21,zmm30,zmm28\t\n"
3345 "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n"
3346 "vfmadd231ps zmm22,zmm29,zmm28\t\n"
3347 "vfmadd231ps zmm23,zmm30,zmm28\t\n"
3348 "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n"
3349 "vfmadd231ps zmm24,zmm29,zmm28\t\n"
3350 "vfmadd231ps zmm25,zmm30,zmm28\t\n"
3351 "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n"
3352 "vfmadd231ps zmm26,zmm29,zmm28\t\n"
3353 "vfmadd231ps zmm27,zmm30,zmm28\t\n"
3354
     // Advance A by 14 floats and B by 32 halfs. Keep looping while more
     // than one k step remains; the final step falls through to the tail.
3355 "next_inner%=:\t\n"
3356 "add r9,56\t\n"
3357 "add r10,64\t\n"
3358 "dec r14\t\n"
3359 "jnz loop_inner%=\t\n"
3360
     // Last k step: same arithmetic as the loop body, but no further B row
     // is preloaded into zmm31.
3361 "vmovaps zmm29,zmm31\t\n"
3362 "vcvtph2ps zmm30,YMMWORD PTR [r10 + 32]\t\n"
3363 "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n"
3364 "vfmadd231ps zmm0,zmm29,zmm28\t\n"
3365 "vfmadd231ps zmm1,zmm30,zmm28\t\n"
3366 "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n"
3367 "vfmadd231ps zmm2,zmm29,zmm28\t\n"
3368 "vfmadd231ps zmm3,zmm30,zmm28\t\n"
3369 "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n"
3370 "vfmadd231ps zmm4,zmm29,zmm28\t\n"
3371 "vfmadd231ps zmm5,zmm30,zmm28\t\n"
3372 "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n"
3373 "vfmadd231ps zmm6,zmm29,zmm28\t\n"
3374 "vfmadd231ps zmm7,zmm30,zmm28\t\n"
3375 "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n"
3376 "vfmadd231ps zmm8,zmm29,zmm28\t\n"
3377 "vfmadd231ps zmm9,zmm30,zmm28\t\n"
3378 "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n"
3379 "vfmadd231ps zmm10,zmm29,zmm28\t\n"
3380 "vfmadd231ps zmm11,zmm30,zmm28\t\n"
3381 "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n"
3382 "vfmadd231ps zmm12,zmm29,zmm28\t\n"
3383 "vfmadd231ps zmm13,zmm30,zmm28\t\n"
3384 "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n"
3385 "vfmadd231ps zmm14,zmm29,zmm28\t\n"
3386 "vfmadd231ps zmm15,zmm30,zmm28\t\n"
3387 "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n"
3388 "vfmadd231ps zmm16,zmm29,zmm28\t\n"
3389 "vfmadd231ps zmm17,zmm30,zmm28\t\n"
3390 "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n"
3391 "vfmadd231ps zmm18,zmm29,zmm28\t\n"
3392 "vfmadd231ps zmm19,zmm30,zmm28\t\n"
3393 "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n"
3394 "vfmadd231ps zmm20,zmm29,zmm28\t\n"
3395 "vfmadd231ps zmm21,zmm30,zmm28\t\n"
3396 "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n"
3397 "vfmadd231ps zmm22,zmm29,zmm28\t\n"
3398 "vfmadd231ps zmm23,zmm30,zmm28\t\n"
3399 "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n"
3400 "vfmadd231ps zmm24,zmm29,zmm28\t\n"
3401 "vfmadd231ps zmm25,zmm30,zmm28\t\n"
3402 "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n"
3403 "vfmadd231ps zmm26,zmm29,zmm28\t\n"
3404 "vfmadd231ps zmm27,zmm30,zmm28\t\n"
3405 "add r9,56\t\n"
3406 "add r10,64\t\n"
3407 // Dump C
     // Write the 14 x 32 accumulator tile back with unaligned stores; r12
     // starts at the tile's first row and steps by ldc per row.
3408 "dump_C%=:\t\n"
3409 "vmovups zmmword PTR [r12 + 0], zmm0\t\n"
3410 "vmovups zmmword PTR [r12 + 64], zmm1\t\n"
3411 "add r12, r13\t\n"
3412 "vmovups zmmword PTR [r12 + 0], zmm2\t\n"
3413 "vmovups zmmword PTR [r12 + 64], zmm3\t\n"
3414 "add r12, r13\t\n"
3415 "vmovups zmmword PTR [r12 + 0], zmm4\t\n"
3416 "vmovups zmmword PTR [r12 + 64], zmm5\t\n"
3417 "add r12, r13\t\n"
3418 "vmovups zmmword PTR [r12 + 0], zmm6\t\n"
3419 "vmovups zmmword PTR [r12 + 64], zmm7\t\n"
3420 "add r12, r13\t\n"
3421 "vmovups zmmword PTR [r12 + 0], zmm8\t\n"
3422 "vmovups zmmword PTR [r12 + 64], zmm9\t\n"
3423 "add r12, r13\t\n"
3424 "vmovups zmmword PTR [r12 + 0], zmm10\t\n"
3425 "vmovups zmmword PTR [r12 + 64], zmm11\t\n"
3426 "add r12, r13\t\n"
3427 "vmovups zmmword PTR [r12 + 0], zmm12\t\n"
3428 "vmovups zmmword PTR [r12 + 64], zmm13\t\n"
3429 "add r12, r13\t\n"
3430 "vmovups zmmword PTR [r12 + 0], zmm14\t\n"
3431 "vmovups zmmword PTR [r12 + 64], zmm15\t\n"
3432 "add r12, r13\t\n"
3433 "vmovups zmmword PTR [r12 + 0], zmm16\t\n"
3434 "vmovups zmmword PTR [r12 + 64], zmm17\t\n"
3435 "add r12, r13\t\n"
3436 "vmovups zmmword PTR [r12 + 0], zmm18\t\n"
3437 "vmovups zmmword PTR [r12 + 64], zmm19\t\n"
3438 "add r12, r13\t\n"
3439 "vmovups zmmword PTR [r12 + 0], zmm20\t\n"
3440 "vmovups zmmword PTR [r12 + 64], zmm21\t\n"
3441 "add r12, r13\t\n"
3442 "vmovups zmmword PTR [r12 + 0], zmm22\t\n"
3443 "vmovups zmmword PTR [r12 + 64], zmm23\t\n"
3444 "add r12, r13\t\n"
3445 "vmovups zmmword PTR [r12 + 0], zmm24\t\n"
3446 "vmovups zmmword PTR [r12 + 64], zmm25\t\n"
3447 "add r12, r13\t\n"
3448 "vmovups zmmword PTR [r12 + 0], zmm26\t\n"
3449 "vmovups zmmword PTR [r12 + 64], zmm27\t\n"
3450
3451 // next outer iteration
     // Step C to the next 32-float column block (128 bytes) and rewind A.
     // r10 is left as is, so B continues right after the rows just consumed.
3452 "add rcx, 128\t\n"
3453 "mov r12, rcx\t\n"
3454 "mov r9, rax\t\n"
     // Loop while rbx < b_block_cols (jl: signed comparison).
3455 "inc rbx\t\n"
3456 "cmp rbx, rdi\t\n"
3457 "jl loop_outter%=\t\n"
3458 :
3459 : [gp] "rm"(gp)
3460 : "r8",
3461 "r9",
3462 "r10",
3463 "r11",
3464 "r13",
3465 "r14",
3466 "rax",
3467 "rcx",
3468 "rsi",
3469 "rdi",
3470 "rbx",
3471 "r12",
3472 "r15",
3473 "memory");
3474}
3475
3476} // namespace fbgemm
3477