1/*
2 * Copyright (c) Meta Platforms, Inc. and affiliates.
3 * All rights reserved.
4 * This source code is licensed under the BSD-style license found in the
5 * LICENSE file in the root directory of this source tree.
6 */
7#include "./FbgemmFP16UKernelsAvx512_256.h"
8#include "./InlineAsmDefines.h"
9
10namespace fbgemm {
11
12void NOINLINE gemmkernel_7x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
13 asm volatile(
14#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
15 "mov %[gp], %%r14\t\n"
16 ".intel_syntax noprefix\t\n"
17#else
18 "mov r14, %[gp]\t\n"
19#endif
20
21 // Copy parameters
22 // k
23 "mov r8, [r14 + 0]\t\n"
24 "dec r8\t\n"
25 // A
26 "mov r9, [r14 + 8]\t\n"
27 // B
28 "mov r10, [r14 + 16]\t\n"
29 // beta
30 "lea r15, [r14 + 24]\t\n"
31 // C
32 "mov r12, [r14 + 32]\t\n"
33 // ldc
34 "mov r13, [r14 + 40]\t\n"
35 // b_block_cols
36 "mov rdi, [r14 + 48]\t\n"
37 // b_block_size
38 "mov rsi, [r14 + 56]\t\n"
39
40 // Make copies of A and C
41 "mov rax, r9\t\n"
42 "mov rcx, r12\t\n"
43
44 "xor ebx, ebx\t\n"
45 "loop_outter%=:\t\n"
46 "mov r14, r8\t\n"
47 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
48 "vcvtph2ps ymm15,XMMWORD PTR [r10 + 0]\t\n"
49 "vcvtph2ps ymm16,XMMWORD PTR [r10 + 16]\t\n"
50 "vxorps xmm0, xmm0, xmm0\t\n"
51 "vcomiss xmm31, xmm0\t\n"
52 "jz zero_regs%=\t\n"
53
54 // Setup values with beta multiplication
55 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
56 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
57 "add r12, r13\t\n"
58 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
59 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
60 "add r12, r13\t\n"
61 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
62 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
63 "add r12, r13\t\n"
64 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
65 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
66 "add r12, r13\t\n"
67 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
68 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
69 "add r12, r13\t\n"
70 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
71 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
72 "add r12, r13\t\n"
73 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
74 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
75 "test r14,r14\t\n"
76 "jz skip_preload%=\t\n"
77 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
78 "skip_preload%=:\t\n"
79 "vbroadcastss ymm14,DWORD PTR [r9+0]\t\n"
80 "vfmadd231ps ymm0,ymm15,ymm14\t\n"
81 "vfmadd231ps ymm1,ymm16,ymm14\t\n"
82 "vbroadcastss ymm14,DWORD PTR [r9+4]\t\n"
83 "vfmadd231ps ymm2,ymm15,ymm14\t\n"
84 "vfmadd231ps ymm3,ymm16,ymm14\t\n"
85 "vbroadcastss ymm14,DWORD PTR [r9+8]\t\n"
86 "vfmadd231ps ymm4,ymm15,ymm14\t\n"
87 "vfmadd231ps ymm5,ymm16,ymm14\t\n"
88 "vbroadcastss ymm14,DWORD PTR [r9+12]\t\n"
89 "vfmadd231ps ymm6,ymm15,ymm14\t\n"
90 "vfmadd231ps ymm7,ymm16,ymm14\t\n"
91 "vbroadcastss ymm14,DWORD PTR [r9+16]\t\n"
92 "vfmadd231ps ymm8,ymm15,ymm14\t\n"
93 "vfmadd231ps ymm9,ymm16,ymm14\t\n"
94 "vbroadcastss ymm14,DWORD PTR [r9+20]\t\n"
95 "vfmadd231ps ymm10,ymm15,ymm14\t\n"
96 "vfmadd231ps ymm11,ymm16,ymm14\t\n"
97 "vbroadcastss ymm14,DWORD PTR [r9+24]\t\n"
98 "vfmadd231ps ymm12,ymm15,ymm14\t\n"
99 "vfmadd231ps ymm13,ymm16,ymm14\t\n"
100 "mov r12, rcx\t\n"
101 "test r14,r14\t\n"
102 "jnz next_inner%=\t\n"
103 "add r10,32\t\n"
104 "jmp dump_C%=\t\n"
105
106 "zero_regs%=:\t\n"
107
108 "test r14,r14\t\n"
109 "jz skip_preload_b_zero%=\t\n"
110 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
111 "skip_preload_b_zero%=:\t\n"
112 "vbroadcastss ymm14,DWORD PTR [r9+0]\t\n"
113 "vmulps ymm0,ymm15,ymm14\t\n"
114 "vmulps ymm1,ymm16,ymm14\t\n"
115 "add r12, r13\t\n"
116 "vbroadcastss ymm14,DWORD PTR [r9+4]\t\n"
117 "vmulps ymm2,ymm15,ymm14\t\n"
118 "vmulps ymm3,ymm16,ymm14\t\n"
119 "add r12, r13\t\n"
120 "vbroadcastss ymm14,DWORD PTR [r9+8]\t\n"
121 "vmulps ymm4,ymm15,ymm14\t\n"
122 "vmulps ymm5,ymm16,ymm14\t\n"
123 "add r12, r13\t\n"
124 "vbroadcastss ymm14,DWORD PTR [r9+12]\t\n"
125 "vmulps ymm6,ymm15,ymm14\t\n"
126 "vmulps ymm7,ymm16,ymm14\t\n"
127 "add r12, r13\t\n"
128 "vbroadcastss ymm14,DWORD PTR [r9+16]\t\n"
129 "vmulps ymm8,ymm15,ymm14\t\n"
130 "vmulps ymm9,ymm16,ymm14\t\n"
131 "add r12, r13\t\n"
132 "vbroadcastss ymm14,DWORD PTR [r9+20]\t\n"
133 "vmulps ymm10,ymm15,ymm14\t\n"
134 "vmulps ymm11,ymm16,ymm14\t\n"
135 "add r12, r13\t\n"
136 "vbroadcastss ymm14,DWORD PTR [r9+24]\t\n"
137 "vmulps ymm12,ymm15,ymm14\t\n"
138 "vmulps ymm13,ymm16,ymm14\t\n"
139 "mov r12, rcx\t\n"
140 "test r14,r14\t\n"
141 "jnz next_inner%=\t\n"
142 "add r10,32\t\n"
143 "jmp dump_C%=\t\n"
144
145 "loop_inner%=:\t\n"
146
147 "vmovaps ymm15,ymm31\t\n"
148 "vcvtph2ps ymm16,XMMWORD PTR [r10 + 16]\t\n"
149 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
150 "vbroadcastss ymm14,DWORD PTR [r9+0]\t\n"
151 "vfmadd231ps ymm0,ymm15,ymm14\t\n"
152 "vfmadd231ps ymm1,ymm16,ymm14\t\n"
153 "vbroadcastss ymm14,DWORD PTR [r9+4]\t\n"
154 "vfmadd231ps ymm2,ymm15,ymm14\t\n"
155 "vfmadd231ps ymm3,ymm16,ymm14\t\n"
156 "vbroadcastss ymm14,DWORD PTR [r9+8]\t\n"
157 "vfmadd231ps ymm4,ymm15,ymm14\t\n"
158 "vfmadd231ps ymm5,ymm16,ymm14\t\n"
159 "vbroadcastss ymm14,DWORD PTR [r9+12]\t\n"
160 "vfmadd231ps ymm6,ymm15,ymm14\t\n"
161 "vfmadd231ps ymm7,ymm16,ymm14\t\n"
162 "vbroadcastss ymm14,DWORD PTR [r9+16]\t\n"
163 "vfmadd231ps ymm8,ymm15,ymm14\t\n"
164 "vfmadd231ps ymm9,ymm16,ymm14\t\n"
165 "vbroadcastss ymm14,DWORD PTR [r9+20]\t\n"
166 "vfmadd231ps ymm10,ymm15,ymm14\t\n"
167 "vfmadd231ps ymm11,ymm16,ymm14\t\n"
168 "vbroadcastss ymm14,DWORD PTR [r9+24]\t\n"
169 "vfmadd231ps ymm12,ymm15,ymm14\t\n"
170 "vfmadd231ps ymm13,ymm16,ymm14\t\n"
171
172 "next_inner%=:\t\n"
173 "add r9,28\t\n"
174 "add r10,32\t\n"
175 "dec r14\t\n"
176 "jnz loop_inner%=\t\n"
177
178 "vmovaps ymm15,ymm31\t\n"
179 "vcvtph2ps ymm16,XMMWORD PTR [r10 + 16]\t\n"
180 "vbroadcastss ymm14,DWORD PTR [r9+0]\t\n"
181 "vfmadd231ps ymm0,ymm15,ymm14\t\n"
182 "vfmadd231ps ymm1,ymm16,ymm14\t\n"
183 "vbroadcastss ymm14,DWORD PTR [r9+4]\t\n"
184 "vfmadd231ps ymm2,ymm15,ymm14\t\n"
185 "vfmadd231ps ymm3,ymm16,ymm14\t\n"
186 "vbroadcastss ymm14,DWORD PTR [r9+8]\t\n"
187 "vfmadd231ps ymm4,ymm15,ymm14\t\n"
188 "vfmadd231ps ymm5,ymm16,ymm14\t\n"
189 "vbroadcastss ymm14,DWORD PTR [r9+12]\t\n"
190 "vfmadd231ps ymm6,ymm15,ymm14\t\n"
191 "vfmadd231ps ymm7,ymm16,ymm14\t\n"
192 "vbroadcastss ymm14,DWORD PTR [r9+16]\t\n"
193 "vfmadd231ps ymm8,ymm15,ymm14\t\n"
194 "vfmadd231ps ymm9,ymm16,ymm14\t\n"
195 "vbroadcastss ymm14,DWORD PTR [r9+20]\t\n"
196 "vfmadd231ps ymm10,ymm15,ymm14\t\n"
197 "vfmadd231ps ymm11,ymm16,ymm14\t\n"
198 "vbroadcastss ymm14,DWORD PTR [r9+24]\t\n"
199 "vfmadd231ps ymm12,ymm15,ymm14\t\n"
200 "vfmadd231ps ymm13,ymm16,ymm14\t\n"
201 "add r9,28\t\n"
202 "add r10,32\t\n"
203 // Dump C
204 "dump_C%=:\t\n"
205 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
206 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
207 "add r12, r13\t\n"
208 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
209 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
210 "add r12, r13\t\n"
211 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
212 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
213 "add r12, r13\t\n"
214 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
215 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
216 "add r12, r13\t\n"
217 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
218 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
219 "add r12, r13\t\n"
220 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
221 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
222 "add r12, r13\t\n"
223 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
224 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
225
226 // next outer iteration
227 "add rcx, 64\t\n"
228 "mov r12, rcx\t\n"
229 "mov r9, rax\t\n"
230 "inc rbx\t\n"
231 "cmp rbx, rdi\t\n"
232 "jl loop_outter%=\t\n"
233 :
234 : [gp] "rm"(gp)
235 : "r8",
236 "r9",
237 "r10",
238 "r11",
239 "r13",
240 "r14",
241 "rax",
242 "rcx",
243 "rsi",
244 "rdi",
245 "rbx",
246 "r12",
247 "r15",
248 "memory");
249}
250void NOINLINE gemmkernel_8x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
251 asm volatile(
252#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
253 "mov %[gp], %%r14\t\n"
254 ".intel_syntax noprefix\t\n"
255#else
256 "mov r14, %[gp]\t\n"
257#endif
258
259 // Copy parameters
260 // k
261 "mov r8, [r14 + 0]\t\n"
262 "dec r8\t\n"
263 // A
264 "mov r9, [r14 + 8]\t\n"
265 // B
266 "mov r10, [r14 + 16]\t\n"
267 // beta
268 "lea r15, [r14 + 24]\t\n"
269 // C
270 "mov r12, [r14 + 32]\t\n"
271 // ldc
272 "mov r13, [r14 + 40]\t\n"
273 // b_block_cols
274 "mov rdi, [r14 + 48]\t\n"
275 // b_block_size
276 "mov rsi, [r14 + 56]\t\n"
277
278 // Make copies of A and C
279 "mov rax, r9\t\n"
280 "mov rcx, r12\t\n"
281
282 "xor ebx, ebx\t\n"
283 "loop_outter%=:\t\n"
284 "mov r14, r8\t\n"
285 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
286 "vcvtph2ps ymm17,XMMWORD PTR [r10 + 0]\t\n"
287 "vcvtph2ps ymm18,XMMWORD PTR [r10 + 16]\t\n"
288 "vxorps xmm0, xmm0, xmm0\t\n"
289 "vcomiss xmm31, xmm0\t\n"
290 "jz zero_regs%=\t\n"
291
292 // Setup values with beta multiplication
293 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
294 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
295 "add r12, r13\t\n"
296 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
297 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
298 "add r12, r13\t\n"
299 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
300 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
301 "add r12, r13\t\n"
302 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
303 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
304 "add r12, r13\t\n"
305 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
306 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
307 "add r12, r13\t\n"
308 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
309 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
310 "add r12, r13\t\n"
311 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
312 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
313 "add r12, r13\t\n"
314 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
315 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
316 "test r14,r14\t\n"
317 "jz skip_preload%=\t\n"
318 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
319 "skip_preload%=:\t\n"
320 "vbroadcastss ymm16,DWORD PTR [r9+0]\t\n"
321 "vfmadd231ps ymm0,ymm17,ymm16\t\n"
322 "vfmadd231ps ymm1,ymm18,ymm16\t\n"
323 "vbroadcastss ymm16,DWORD PTR [r9+4]\t\n"
324 "vfmadd231ps ymm2,ymm17,ymm16\t\n"
325 "vfmadd231ps ymm3,ymm18,ymm16\t\n"
326 "vbroadcastss ymm16,DWORD PTR [r9+8]\t\n"
327 "vfmadd231ps ymm4,ymm17,ymm16\t\n"
328 "vfmadd231ps ymm5,ymm18,ymm16\t\n"
329 "vbroadcastss ymm16,DWORD PTR [r9+12]\t\n"
330 "vfmadd231ps ymm6,ymm17,ymm16\t\n"
331 "vfmadd231ps ymm7,ymm18,ymm16\t\n"
332 "vbroadcastss ymm16,DWORD PTR [r9+16]\t\n"
333 "vfmadd231ps ymm8,ymm17,ymm16\t\n"
334 "vfmadd231ps ymm9,ymm18,ymm16\t\n"
335 "vbroadcastss ymm16,DWORD PTR [r9+20]\t\n"
336 "vfmadd231ps ymm10,ymm17,ymm16\t\n"
337 "vfmadd231ps ymm11,ymm18,ymm16\t\n"
338 "vbroadcastss ymm16,DWORD PTR [r9+24]\t\n"
339 "vfmadd231ps ymm12,ymm17,ymm16\t\n"
340 "vfmadd231ps ymm13,ymm18,ymm16\t\n"
341 "vbroadcastss ymm16,DWORD PTR [r9+28]\t\n"
342 "vfmadd231ps ymm14,ymm17,ymm16\t\n"
343 "vfmadd231ps ymm15,ymm18,ymm16\t\n"
344 "mov r12, rcx\t\n"
345 "test r14,r14\t\n"
346 "jnz next_inner%=\t\n"
347 "add r10,32\t\n"
348 "jmp dump_C%=\t\n"
349
350 "zero_regs%=:\t\n"
351
352 "test r14,r14\t\n"
353 "jz skip_preload_b_zero%=\t\n"
354 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
355 "skip_preload_b_zero%=:\t\n"
356 "vbroadcastss ymm16,DWORD PTR [r9+0]\t\n"
357 "vmulps ymm0,ymm17,ymm16\t\n"
358 "vmulps ymm1,ymm18,ymm16\t\n"
359 "add r12, r13\t\n"
360 "vbroadcastss ymm16,DWORD PTR [r9+4]\t\n"
361 "vmulps ymm2,ymm17,ymm16\t\n"
362 "vmulps ymm3,ymm18,ymm16\t\n"
363 "add r12, r13\t\n"
364 "vbroadcastss ymm16,DWORD PTR [r9+8]\t\n"
365 "vmulps ymm4,ymm17,ymm16\t\n"
366 "vmulps ymm5,ymm18,ymm16\t\n"
367 "add r12, r13\t\n"
368 "vbroadcastss ymm16,DWORD PTR [r9+12]\t\n"
369 "vmulps ymm6,ymm17,ymm16\t\n"
370 "vmulps ymm7,ymm18,ymm16\t\n"
371 "add r12, r13\t\n"
372 "vbroadcastss ymm16,DWORD PTR [r9+16]\t\n"
373 "vmulps ymm8,ymm17,ymm16\t\n"
374 "vmulps ymm9,ymm18,ymm16\t\n"
375 "add r12, r13\t\n"
376 "vbroadcastss ymm16,DWORD PTR [r9+20]\t\n"
377 "vmulps ymm10,ymm17,ymm16\t\n"
378 "vmulps ymm11,ymm18,ymm16\t\n"
379 "add r12, r13\t\n"
380 "vbroadcastss ymm16,DWORD PTR [r9+24]\t\n"
381 "vmulps ymm12,ymm17,ymm16\t\n"
382 "vmulps ymm13,ymm18,ymm16\t\n"
383 "add r12, r13\t\n"
384 "vbroadcastss ymm16,DWORD PTR [r9+28]\t\n"
385 "vmulps ymm14,ymm17,ymm16\t\n"
386 "vmulps ymm15,ymm18,ymm16\t\n"
387 "mov r12, rcx\t\n"
388 "test r14,r14\t\n"
389 "jnz next_inner%=\t\n"
390 "add r10,32\t\n"
391 "jmp dump_C%=\t\n"
392
393 "loop_inner%=:\t\n"
394
395 "vmovaps ymm17,ymm31\t\n"
396 "vcvtph2ps ymm18,XMMWORD PTR [r10 + 16]\t\n"
397 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
398 "vbroadcastss ymm16,DWORD PTR [r9+0]\t\n"
399 "vfmadd231ps ymm0,ymm17,ymm16\t\n"
400 "vfmadd231ps ymm1,ymm18,ymm16\t\n"
401 "vbroadcastss ymm16,DWORD PTR [r9+4]\t\n"
402 "vfmadd231ps ymm2,ymm17,ymm16\t\n"
403 "vfmadd231ps ymm3,ymm18,ymm16\t\n"
404 "vbroadcastss ymm16,DWORD PTR [r9+8]\t\n"
405 "vfmadd231ps ymm4,ymm17,ymm16\t\n"
406 "vfmadd231ps ymm5,ymm18,ymm16\t\n"
407 "vbroadcastss ymm16,DWORD PTR [r9+12]\t\n"
408 "vfmadd231ps ymm6,ymm17,ymm16\t\n"
409 "vfmadd231ps ymm7,ymm18,ymm16\t\n"
410 "vbroadcastss ymm16,DWORD PTR [r9+16]\t\n"
411 "vfmadd231ps ymm8,ymm17,ymm16\t\n"
412 "vfmadd231ps ymm9,ymm18,ymm16\t\n"
413 "vbroadcastss ymm16,DWORD PTR [r9+20]\t\n"
414 "vfmadd231ps ymm10,ymm17,ymm16\t\n"
415 "vfmadd231ps ymm11,ymm18,ymm16\t\n"
416 "vbroadcastss ymm16,DWORD PTR [r9+24]\t\n"
417 "vfmadd231ps ymm12,ymm17,ymm16\t\n"
418 "vfmadd231ps ymm13,ymm18,ymm16\t\n"
419 "vbroadcastss ymm16,DWORD PTR [r9+28]\t\n"
420 "vfmadd231ps ymm14,ymm17,ymm16\t\n"
421 "vfmadd231ps ymm15,ymm18,ymm16\t\n"
422
423 "next_inner%=:\t\n"
424 "add r9,32\t\n"
425 "add r10,32\t\n"
426 "dec r14\t\n"
427 "jnz loop_inner%=\t\n"
428
429 "vmovaps ymm17,ymm31\t\n"
430 "vcvtph2ps ymm18,XMMWORD PTR [r10 + 16]\t\n"
431 "vbroadcastss ymm16,DWORD PTR [r9+0]\t\n"
432 "vfmadd231ps ymm0,ymm17,ymm16\t\n"
433 "vfmadd231ps ymm1,ymm18,ymm16\t\n"
434 "vbroadcastss ymm16,DWORD PTR [r9+4]\t\n"
435 "vfmadd231ps ymm2,ymm17,ymm16\t\n"
436 "vfmadd231ps ymm3,ymm18,ymm16\t\n"
437 "vbroadcastss ymm16,DWORD PTR [r9+8]\t\n"
438 "vfmadd231ps ymm4,ymm17,ymm16\t\n"
439 "vfmadd231ps ymm5,ymm18,ymm16\t\n"
440 "vbroadcastss ymm16,DWORD PTR [r9+12]\t\n"
441 "vfmadd231ps ymm6,ymm17,ymm16\t\n"
442 "vfmadd231ps ymm7,ymm18,ymm16\t\n"
443 "vbroadcastss ymm16,DWORD PTR [r9+16]\t\n"
444 "vfmadd231ps ymm8,ymm17,ymm16\t\n"
445 "vfmadd231ps ymm9,ymm18,ymm16\t\n"
446 "vbroadcastss ymm16,DWORD PTR [r9+20]\t\n"
447 "vfmadd231ps ymm10,ymm17,ymm16\t\n"
448 "vfmadd231ps ymm11,ymm18,ymm16\t\n"
449 "vbroadcastss ymm16,DWORD PTR [r9+24]\t\n"
450 "vfmadd231ps ymm12,ymm17,ymm16\t\n"
451 "vfmadd231ps ymm13,ymm18,ymm16\t\n"
452 "vbroadcastss ymm16,DWORD PTR [r9+28]\t\n"
453 "vfmadd231ps ymm14,ymm17,ymm16\t\n"
454 "vfmadd231ps ymm15,ymm18,ymm16\t\n"
455 "add r9,32\t\n"
456 "add r10,32\t\n"
457 // Dump C
458 "dump_C%=:\t\n"
459 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
460 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
461 "add r12, r13\t\n"
462 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
463 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
464 "add r12, r13\t\n"
465 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
466 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
467 "add r12, r13\t\n"
468 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
469 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
470 "add r12, r13\t\n"
471 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
472 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
473 "add r12, r13\t\n"
474 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
475 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
476 "add r12, r13\t\n"
477 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
478 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
479 "add r12, r13\t\n"
480 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
481 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
482
483 // next outer iteration
484 "add rcx, 64\t\n"
485 "mov r12, rcx\t\n"
486 "mov r9, rax\t\n"
487 "inc rbx\t\n"
488 "cmp rbx, rdi\t\n"
489 "jl loop_outter%=\t\n"
490 :
491 : [gp] "rm"(gp)
492 : "r8",
493 "r9",
494 "r10",
495 "r11",
496 "r13",
497 "r14",
498 "rax",
499 "rcx",
500 "rsi",
501 "rdi",
502 "rbx",
503 "r12",
504 "r15",
505 "memory");
506}
507void NOINLINE gemmkernel_9x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
508 asm volatile(
509#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
510 "mov %[gp], %%r14\t\n"
511 ".intel_syntax noprefix\t\n"
512#else
513 "mov r14, %[gp]\t\n"
514#endif
515
516 // Copy parameters
517 // k
518 "mov r8, [r14 + 0]\t\n"
519 "dec r8\t\n"
520 // A
521 "mov r9, [r14 + 8]\t\n"
522 // B
523 "mov r10, [r14 + 16]\t\n"
524 // beta
525 "lea r15, [r14 + 24]\t\n"
526 // C
527 "mov r12, [r14 + 32]\t\n"
528 // ldc
529 "mov r13, [r14 + 40]\t\n"
530 // b_block_cols
531 "mov rdi, [r14 + 48]\t\n"
532 // b_block_size
533 "mov rsi, [r14 + 56]\t\n"
534
535 // Make copies of A and C
536 "mov rax, r9\t\n"
537 "mov rcx, r12\t\n"
538
539 "xor ebx, ebx\t\n"
540 "loop_outter%=:\t\n"
541 "mov r14, r8\t\n"
542 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
543 "vcvtph2ps ymm19,XMMWORD PTR [r10 + 0]\t\n"
544 "vcvtph2ps ymm20,XMMWORD PTR [r10 + 16]\t\n"
545 "vxorps xmm0, xmm0, xmm0\t\n"
546 "vcomiss xmm31, xmm0\t\n"
547 "jz zero_regs%=\t\n"
548
549 // Setup values with beta multiplication
550 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
551 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
552 "add r12, r13\t\n"
553 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
554 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
555 "add r12, r13\t\n"
556 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
557 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
558 "add r12, r13\t\n"
559 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
560 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
561 "add r12, r13\t\n"
562 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
563 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
564 "add r12, r13\t\n"
565 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
566 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
567 "add r12, r13\t\n"
568 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
569 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
570 "add r12, r13\t\n"
571 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
572 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
573 "add r12, r13\t\n"
574 "vmulps ymm16, ymm31, [r12 + 0]\t\n"
575 "vmulps ymm17, ymm31, [r12 + 32]\t\n"
576 "test r14,r14\t\n"
577 "jz skip_preload%=\t\n"
578 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
579 "skip_preload%=:\t\n"
580 "vbroadcastss ymm18,DWORD PTR [r9+0]\t\n"
581 "vfmadd231ps ymm0,ymm19,ymm18\t\n"
582 "vfmadd231ps ymm1,ymm20,ymm18\t\n"
583 "vbroadcastss ymm18,DWORD PTR [r9+4]\t\n"
584 "vfmadd231ps ymm2,ymm19,ymm18\t\n"
585 "vfmadd231ps ymm3,ymm20,ymm18\t\n"
586 "vbroadcastss ymm18,DWORD PTR [r9+8]\t\n"
587 "vfmadd231ps ymm4,ymm19,ymm18\t\n"
588 "vfmadd231ps ymm5,ymm20,ymm18\t\n"
589 "vbroadcastss ymm18,DWORD PTR [r9+12]\t\n"
590 "vfmadd231ps ymm6,ymm19,ymm18\t\n"
591 "vfmadd231ps ymm7,ymm20,ymm18\t\n"
592 "vbroadcastss ymm18,DWORD PTR [r9+16]\t\n"
593 "vfmadd231ps ymm8,ymm19,ymm18\t\n"
594 "vfmadd231ps ymm9,ymm20,ymm18\t\n"
595 "vbroadcastss ymm18,DWORD PTR [r9+20]\t\n"
596 "vfmadd231ps ymm10,ymm19,ymm18\t\n"
597 "vfmadd231ps ymm11,ymm20,ymm18\t\n"
598 "vbroadcastss ymm18,DWORD PTR [r9+24]\t\n"
599 "vfmadd231ps ymm12,ymm19,ymm18\t\n"
600 "vfmadd231ps ymm13,ymm20,ymm18\t\n"
601 "vbroadcastss ymm18,DWORD PTR [r9+28]\t\n"
602 "vfmadd231ps ymm14,ymm19,ymm18\t\n"
603 "vfmadd231ps ymm15,ymm20,ymm18\t\n"
604 "vbroadcastss ymm18,DWORD PTR [r9+32]\t\n"
605 "vfmadd231ps ymm16,ymm19,ymm18\t\n"
606 "vfmadd231ps ymm17,ymm20,ymm18\t\n"
607 "mov r12, rcx\t\n"
608 "test r14,r14\t\n"
609 "jnz next_inner%=\t\n"
610 "add r10,32\t\n"
611 "jmp dump_C%=\t\n"
612
613 "zero_regs%=:\t\n"
614
615 "test r14,r14\t\n"
616 "jz skip_preload_b_zero%=\t\n"
617 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
618 "skip_preload_b_zero%=:\t\n"
619 "vbroadcastss ymm18,DWORD PTR [r9+0]\t\n"
620 "vmulps ymm0,ymm19,ymm18\t\n"
621 "vmulps ymm1,ymm20,ymm18\t\n"
622 "add r12, r13\t\n"
623 "vbroadcastss ymm18,DWORD PTR [r9+4]\t\n"
624 "vmulps ymm2,ymm19,ymm18\t\n"
625 "vmulps ymm3,ymm20,ymm18\t\n"
626 "add r12, r13\t\n"
627 "vbroadcastss ymm18,DWORD PTR [r9+8]\t\n"
628 "vmulps ymm4,ymm19,ymm18\t\n"
629 "vmulps ymm5,ymm20,ymm18\t\n"
630 "add r12, r13\t\n"
631 "vbroadcastss ymm18,DWORD PTR [r9+12]\t\n"
632 "vmulps ymm6,ymm19,ymm18\t\n"
633 "vmulps ymm7,ymm20,ymm18\t\n"
634 "add r12, r13\t\n"
635 "vbroadcastss ymm18,DWORD PTR [r9+16]\t\n"
636 "vmulps ymm8,ymm19,ymm18\t\n"
637 "vmulps ymm9,ymm20,ymm18\t\n"
638 "add r12, r13\t\n"
639 "vbroadcastss ymm18,DWORD PTR [r9+20]\t\n"
640 "vmulps ymm10,ymm19,ymm18\t\n"
641 "vmulps ymm11,ymm20,ymm18\t\n"
642 "add r12, r13\t\n"
643 "vbroadcastss ymm18,DWORD PTR [r9+24]\t\n"
644 "vmulps ymm12,ymm19,ymm18\t\n"
645 "vmulps ymm13,ymm20,ymm18\t\n"
646 "add r12, r13\t\n"
647 "vbroadcastss ymm18,DWORD PTR [r9+28]\t\n"
648 "vmulps ymm14,ymm19,ymm18\t\n"
649 "vmulps ymm15,ymm20,ymm18\t\n"
650 "add r12, r13\t\n"
651 "vbroadcastss ymm18,DWORD PTR [r9+32]\t\n"
652 "vmulps ymm16,ymm19,ymm18\t\n"
653 "vmulps ymm17,ymm20,ymm18\t\n"
654 "mov r12, rcx\t\n"
655 "test r14,r14\t\n"
656 "jnz next_inner%=\t\n"
657 "add r10,32\t\n"
658 "jmp dump_C%=\t\n"
659
660 "loop_inner%=:\t\n"
661
662 "vmovaps ymm19,ymm31\t\n"
663 "vcvtph2ps ymm20,XMMWORD PTR [r10 + 16]\t\n"
664 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
665 "vbroadcastss ymm18,DWORD PTR [r9+0]\t\n"
666 "vfmadd231ps ymm0,ymm19,ymm18\t\n"
667 "vfmadd231ps ymm1,ymm20,ymm18\t\n"
668 "vbroadcastss ymm18,DWORD PTR [r9+4]\t\n"
669 "vfmadd231ps ymm2,ymm19,ymm18\t\n"
670 "vfmadd231ps ymm3,ymm20,ymm18\t\n"
671 "vbroadcastss ymm18,DWORD PTR [r9+8]\t\n"
672 "vfmadd231ps ymm4,ymm19,ymm18\t\n"
673 "vfmadd231ps ymm5,ymm20,ymm18\t\n"
674 "vbroadcastss ymm18,DWORD PTR [r9+12]\t\n"
675 "vfmadd231ps ymm6,ymm19,ymm18\t\n"
676 "vfmadd231ps ymm7,ymm20,ymm18\t\n"
677 "vbroadcastss ymm18,DWORD PTR [r9+16]\t\n"
678 "vfmadd231ps ymm8,ymm19,ymm18\t\n"
679 "vfmadd231ps ymm9,ymm20,ymm18\t\n"
680 "vbroadcastss ymm18,DWORD PTR [r9+20]\t\n"
681 "vfmadd231ps ymm10,ymm19,ymm18\t\n"
682 "vfmadd231ps ymm11,ymm20,ymm18\t\n"
683 "vbroadcastss ymm18,DWORD PTR [r9+24]\t\n"
684 "vfmadd231ps ymm12,ymm19,ymm18\t\n"
685 "vfmadd231ps ymm13,ymm20,ymm18\t\n"
686 "vbroadcastss ymm18,DWORD PTR [r9+28]\t\n"
687 "vfmadd231ps ymm14,ymm19,ymm18\t\n"
688 "vfmadd231ps ymm15,ymm20,ymm18\t\n"
689 "vbroadcastss ymm18,DWORD PTR [r9+32]\t\n"
690 "vfmadd231ps ymm16,ymm19,ymm18\t\n"
691 "vfmadd231ps ymm17,ymm20,ymm18\t\n"
692
693 "next_inner%=:\t\n"
694 "add r9,36\t\n"
695 "add r10,32\t\n"
696 "dec r14\t\n"
697 "jnz loop_inner%=\t\n"
698
699 "vmovaps ymm19,ymm31\t\n"
700 "vcvtph2ps ymm20,XMMWORD PTR [r10 + 16]\t\n"
701 "vbroadcastss ymm18,DWORD PTR [r9+0]\t\n"
702 "vfmadd231ps ymm0,ymm19,ymm18\t\n"
703 "vfmadd231ps ymm1,ymm20,ymm18\t\n"
704 "vbroadcastss ymm18,DWORD PTR [r9+4]\t\n"
705 "vfmadd231ps ymm2,ymm19,ymm18\t\n"
706 "vfmadd231ps ymm3,ymm20,ymm18\t\n"
707 "vbroadcastss ymm18,DWORD PTR [r9+8]\t\n"
708 "vfmadd231ps ymm4,ymm19,ymm18\t\n"
709 "vfmadd231ps ymm5,ymm20,ymm18\t\n"
710 "vbroadcastss ymm18,DWORD PTR [r9+12]\t\n"
711 "vfmadd231ps ymm6,ymm19,ymm18\t\n"
712 "vfmadd231ps ymm7,ymm20,ymm18\t\n"
713 "vbroadcastss ymm18,DWORD PTR [r9+16]\t\n"
714 "vfmadd231ps ymm8,ymm19,ymm18\t\n"
715 "vfmadd231ps ymm9,ymm20,ymm18\t\n"
716 "vbroadcastss ymm18,DWORD PTR [r9+20]\t\n"
717 "vfmadd231ps ymm10,ymm19,ymm18\t\n"
718 "vfmadd231ps ymm11,ymm20,ymm18\t\n"
719 "vbroadcastss ymm18,DWORD PTR [r9+24]\t\n"
720 "vfmadd231ps ymm12,ymm19,ymm18\t\n"
721 "vfmadd231ps ymm13,ymm20,ymm18\t\n"
722 "vbroadcastss ymm18,DWORD PTR [r9+28]\t\n"
723 "vfmadd231ps ymm14,ymm19,ymm18\t\n"
724 "vfmadd231ps ymm15,ymm20,ymm18\t\n"
725 "vbroadcastss ymm18,DWORD PTR [r9+32]\t\n"
726 "vfmadd231ps ymm16,ymm19,ymm18\t\n"
727 "vfmadd231ps ymm17,ymm20,ymm18\t\n"
728 "add r9,36\t\n"
729 "add r10,32\t\n"
730 // Dump C
731 "dump_C%=:\t\n"
732 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
733 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
734 "add r12, r13\t\n"
735 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
736 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
737 "add r12, r13\t\n"
738 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
739 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
740 "add r12, r13\t\n"
741 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
742 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
743 "add r12, r13\t\n"
744 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
745 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
746 "add r12, r13\t\n"
747 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
748 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
749 "add r12, r13\t\n"
750 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
751 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
752 "add r12, r13\t\n"
753 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
754 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
755 "add r12, r13\t\n"
756 "vmovups ymmword PTR [r12 + 0], ymm16\t\n"
757 "vmovups ymmword PTR [r12 + 32], ymm17\t\n"
758
759 // next outer iteration
760 "add rcx, 64\t\n"
761 "mov r12, rcx\t\n"
762 "mov r9, rax\t\n"
763 "inc rbx\t\n"
764 "cmp rbx, rdi\t\n"
765 "jl loop_outter%=\t\n"
766 :
767 : [gp] "rm"(gp)
768 : "r8",
769 "r9",
770 "r10",
771 "r11",
772 "r13",
773 "r14",
774 "rax",
775 "rcx",
776 "rsi",
777 "rdi",
778 "rbx",
779 "r12",
780 "r15",
781 "memory");
782}
783void NOINLINE gemmkernel_10x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
784 asm volatile(
785#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
786 "mov %[gp], %%r14\t\n"
787 ".intel_syntax noprefix\t\n"
788#else
789 "mov r14, %[gp]\t\n"
790#endif
791
792 // Copy parameters
793 // k
794 "mov r8, [r14 + 0]\t\n"
795 "dec r8\t\n"
796 // A
797 "mov r9, [r14 + 8]\t\n"
798 // B
799 "mov r10, [r14 + 16]\t\n"
800 // beta
801 "lea r15, [r14 + 24]\t\n"
802 // C
803 "mov r12, [r14 + 32]\t\n"
804 // ldc
805 "mov r13, [r14 + 40]\t\n"
806 // b_block_cols
807 "mov rdi, [r14 + 48]\t\n"
808 // b_block_size
809 "mov rsi, [r14 + 56]\t\n"
810
811 // Make copies of A and C
812 "mov rax, r9\t\n"
813 "mov rcx, r12\t\n"
814
815 "xor ebx, ebx\t\n"
816 "loop_outter%=:\t\n"
817 "mov r14, r8\t\n"
818 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
819 "vcvtph2ps ymm21,XMMWORD PTR [r10 + 0]\t\n"
820 "vcvtph2ps ymm22,XMMWORD PTR [r10 + 16]\t\n"
821 "vxorps xmm0, xmm0, xmm0\t\n"
822 "vcomiss xmm31, xmm0\t\n"
823 "jz zero_regs%=\t\n"
824
825 // Setup values with beta multiplication
826 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
827 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
828 "add r12, r13\t\n"
829 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
830 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
831 "add r12, r13\t\n"
832 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
833 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
834 "add r12, r13\t\n"
835 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
836 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
837 "add r12, r13\t\n"
838 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
839 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
840 "add r12, r13\t\n"
841 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
842 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
843 "add r12, r13\t\n"
844 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
845 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
846 "add r12, r13\t\n"
847 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
848 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
849 "add r12, r13\t\n"
850 "vmulps ymm16, ymm31, [r12 + 0]\t\n"
851 "vmulps ymm17, ymm31, [r12 + 32]\t\n"
852 "add r12, r13\t\n"
853 "vmulps ymm18, ymm31, [r12 + 0]\t\n"
854 "vmulps ymm19, ymm31, [r12 + 32]\t\n"
855 "test r14,r14\t\n"
856 "jz skip_preload%=\t\n"
857 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
858 "skip_preload%=:\t\n"
859 "vbroadcastss ymm20,DWORD PTR [r9+0]\t\n"
860 "vfmadd231ps ymm0,ymm21,ymm20\t\n"
861 "vfmadd231ps ymm1,ymm22,ymm20\t\n"
862 "vbroadcastss ymm20,DWORD PTR [r9+4]\t\n"
863 "vfmadd231ps ymm2,ymm21,ymm20\t\n"
864 "vfmadd231ps ymm3,ymm22,ymm20\t\n"
865 "vbroadcastss ymm20,DWORD PTR [r9+8]\t\n"
866 "vfmadd231ps ymm4,ymm21,ymm20\t\n"
867 "vfmadd231ps ymm5,ymm22,ymm20\t\n"
868 "vbroadcastss ymm20,DWORD PTR [r9+12]\t\n"
869 "vfmadd231ps ymm6,ymm21,ymm20\t\n"
870 "vfmadd231ps ymm7,ymm22,ymm20\t\n"
871 "vbroadcastss ymm20,DWORD PTR [r9+16]\t\n"
872 "vfmadd231ps ymm8,ymm21,ymm20\t\n"
873 "vfmadd231ps ymm9,ymm22,ymm20\t\n"
874 "vbroadcastss ymm20,DWORD PTR [r9+20]\t\n"
875 "vfmadd231ps ymm10,ymm21,ymm20\t\n"
876 "vfmadd231ps ymm11,ymm22,ymm20\t\n"
877 "vbroadcastss ymm20,DWORD PTR [r9+24]\t\n"
878 "vfmadd231ps ymm12,ymm21,ymm20\t\n"
879 "vfmadd231ps ymm13,ymm22,ymm20\t\n"
880 "vbroadcastss ymm20,DWORD PTR [r9+28]\t\n"
881 "vfmadd231ps ymm14,ymm21,ymm20\t\n"
882 "vfmadd231ps ymm15,ymm22,ymm20\t\n"
883 "vbroadcastss ymm20,DWORD PTR [r9+32]\t\n"
884 "vfmadd231ps ymm16,ymm21,ymm20\t\n"
885 "vfmadd231ps ymm17,ymm22,ymm20\t\n"
886 "vbroadcastss ymm20,DWORD PTR [r9+36]\t\n"
887 "vfmadd231ps ymm18,ymm21,ymm20\t\n"
888 "vfmadd231ps ymm19,ymm22,ymm20\t\n"
889 "mov r12, rcx\t\n"
890 "test r14,r14\t\n"
891 "jnz next_inner%=\t\n"
892 "add r10,32\t\n"
893 "jmp dump_C%=\t\n"
894
895 "zero_regs%=:\t\n"
896
897 "test r14,r14\t\n"
898 "jz skip_preload_b_zero%=\t\n"
899 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
900 "skip_preload_b_zero%=:\t\n"
901 "vbroadcastss ymm20,DWORD PTR [r9+0]\t\n"
902 "vmulps ymm0,ymm21,ymm20\t\n"
903 "vmulps ymm1,ymm22,ymm20\t\n"
904 "add r12, r13\t\n"
905 "vbroadcastss ymm20,DWORD PTR [r9+4]\t\n"
906 "vmulps ymm2,ymm21,ymm20\t\n"
907 "vmulps ymm3,ymm22,ymm20\t\n"
908 "add r12, r13\t\n"
909 "vbroadcastss ymm20,DWORD PTR [r9+8]\t\n"
910 "vmulps ymm4,ymm21,ymm20\t\n"
911 "vmulps ymm5,ymm22,ymm20\t\n"
912 "add r12, r13\t\n"
913 "vbroadcastss ymm20,DWORD PTR [r9+12]\t\n"
914 "vmulps ymm6,ymm21,ymm20\t\n"
915 "vmulps ymm7,ymm22,ymm20\t\n"
916 "add r12, r13\t\n"
917 "vbroadcastss ymm20,DWORD PTR [r9+16]\t\n"
918 "vmulps ymm8,ymm21,ymm20\t\n"
919 "vmulps ymm9,ymm22,ymm20\t\n"
920 "add r12, r13\t\n"
921 "vbroadcastss ymm20,DWORD PTR [r9+20]\t\n"
922 "vmulps ymm10,ymm21,ymm20\t\n"
923 "vmulps ymm11,ymm22,ymm20\t\n"
924 "add r12, r13\t\n"
925 "vbroadcastss ymm20,DWORD PTR [r9+24]\t\n"
926 "vmulps ymm12,ymm21,ymm20\t\n"
927 "vmulps ymm13,ymm22,ymm20\t\n"
928 "add r12, r13\t\n"
929 "vbroadcastss ymm20,DWORD PTR [r9+28]\t\n"
930 "vmulps ymm14,ymm21,ymm20\t\n"
931 "vmulps ymm15,ymm22,ymm20\t\n"
932 "add r12, r13\t\n"
933 "vbroadcastss ymm20,DWORD PTR [r9+32]\t\n"
934 "vmulps ymm16,ymm21,ymm20\t\n"
935 "vmulps ymm17,ymm22,ymm20\t\n"
936 "add r12, r13\t\n"
937 "vbroadcastss ymm20,DWORD PTR [r9+36]\t\n"
938 "vmulps ymm18,ymm21,ymm20\t\n"
939 "vmulps ymm19,ymm22,ymm20\t\n"
940 "mov r12, rcx\t\n"
941 "test r14,r14\t\n"
942 "jnz next_inner%=\t\n"
943 "add r10,32\t\n"
944 "jmp dump_C%=\t\n"
945
946 "loop_inner%=:\t\n"
947
948 "vmovaps ymm21,ymm31\t\n"
949 "vcvtph2ps ymm22,XMMWORD PTR [r10 + 16]\t\n"
950 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
951 "vbroadcastss ymm20,DWORD PTR [r9+0]\t\n"
952 "vfmadd231ps ymm0,ymm21,ymm20\t\n"
953 "vfmadd231ps ymm1,ymm22,ymm20\t\n"
954 "vbroadcastss ymm20,DWORD PTR [r9+4]\t\n"
955 "vfmadd231ps ymm2,ymm21,ymm20\t\n"
956 "vfmadd231ps ymm3,ymm22,ymm20\t\n"
957 "vbroadcastss ymm20,DWORD PTR [r9+8]\t\n"
958 "vfmadd231ps ymm4,ymm21,ymm20\t\n"
959 "vfmadd231ps ymm5,ymm22,ymm20\t\n"
960 "vbroadcastss ymm20,DWORD PTR [r9+12]\t\n"
961 "vfmadd231ps ymm6,ymm21,ymm20\t\n"
962 "vfmadd231ps ymm7,ymm22,ymm20\t\n"
963 "vbroadcastss ymm20,DWORD PTR [r9+16]\t\n"
964 "vfmadd231ps ymm8,ymm21,ymm20\t\n"
965 "vfmadd231ps ymm9,ymm22,ymm20\t\n"
966 "vbroadcastss ymm20,DWORD PTR [r9+20]\t\n"
967 "vfmadd231ps ymm10,ymm21,ymm20\t\n"
968 "vfmadd231ps ymm11,ymm22,ymm20\t\n"
969 "vbroadcastss ymm20,DWORD PTR [r9+24]\t\n"
970 "vfmadd231ps ymm12,ymm21,ymm20\t\n"
971 "vfmadd231ps ymm13,ymm22,ymm20\t\n"
972 "vbroadcastss ymm20,DWORD PTR [r9+28]\t\n"
973 "vfmadd231ps ymm14,ymm21,ymm20\t\n"
974 "vfmadd231ps ymm15,ymm22,ymm20\t\n"
975 "vbroadcastss ymm20,DWORD PTR [r9+32]\t\n"
976 "vfmadd231ps ymm16,ymm21,ymm20\t\n"
977 "vfmadd231ps ymm17,ymm22,ymm20\t\n"
978 "vbroadcastss ymm20,DWORD PTR [r9+36]\t\n"
979 "vfmadd231ps ymm18,ymm21,ymm20\t\n"
980 "vfmadd231ps ymm19,ymm22,ymm20\t\n"
981
982 "next_inner%=:\t\n"
983 "add r9,40\t\n"
984 "add r10,32\t\n"
985 "dec r14\t\n"
986 "jnz loop_inner%=\t\n"
987
988 "vmovaps ymm21,ymm31\t\n"
989 "vcvtph2ps ymm22,XMMWORD PTR [r10 + 16]\t\n"
990 "vbroadcastss ymm20,DWORD PTR [r9+0]\t\n"
991 "vfmadd231ps ymm0,ymm21,ymm20\t\n"
992 "vfmadd231ps ymm1,ymm22,ymm20\t\n"
993 "vbroadcastss ymm20,DWORD PTR [r9+4]\t\n"
994 "vfmadd231ps ymm2,ymm21,ymm20\t\n"
995 "vfmadd231ps ymm3,ymm22,ymm20\t\n"
996 "vbroadcastss ymm20,DWORD PTR [r9+8]\t\n"
997 "vfmadd231ps ymm4,ymm21,ymm20\t\n"
998 "vfmadd231ps ymm5,ymm22,ymm20\t\n"
999 "vbroadcastss ymm20,DWORD PTR [r9+12]\t\n"
1000 "vfmadd231ps ymm6,ymm21,ymm20\t\n"
1001 "vfmadd231ps ymm7,ymm22,ymm20\t\n"
1002 "vbroadcastss ymm20,DWORD PTR [r9+16]\t\n"
1003 "vfmadd231ps ymm8,ymm21,ymm20\t\n"
1004 "vfmadd231ps ymm9,ymm22,ymm20\t\n"
1005 "vbroadcastss ymm20,DWORD PTR [r9+20]\t\n"
1006 "vfmadd231ps ymm10,ymm21,ymm20\t\n"
1007 "vfmadd231ps ymm11,ymm22,ymm20\t\n"
1008 "vbroadcastss ymm20,DWORD PTR [r9+24]\t\n"
1009 "vfmadd231ps ymm12,ymm21,ymm20\t\n"
1010 "vfmadd231ps ymm13,ymm22,ymm20\t\n"
1011 "vbroadcastss ymm20,DWORD PTR [r9+28]\t\n"
1012 "vfmadd231ps ymm14,ymm21,ymm20\t\n"
1013 "vfmadd231ps ymm15,ymm22,ymm20\t\n"
1014 "vbroadcastss ymm20,DWORD PTR [r9+32]\t\n"
1015 "vfmadd231ps ymm16,ymm21,ymm20\t\n"
1016 "vfmadd231ps ymm17,ymm22,ymm20\t\n"
1017 "vbroadcastss ymm20,DWORD PTR [r9+36]\t\n"
1018 "vfmadd231ps ymm18,ymm21,ymm20\t\n"
1019 "vfmadd231ps ymm19,ymm22,ymm20\t\n"
1020 "add r9,40\t\n"
1021 "add r10,32\t\n"
1022 // Dump C
1023 "dump_C%=:\t\n"
1024 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
1025 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
1026 "add r12, r13\t\n"
1027 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
1028 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
1029 "add r12, r13\t\n"
1030 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
1031 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
1032 "add r12, r13\t\n"
1033 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
1034 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
1035 "add r12, r13\t\n"
1036 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
1037 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
1038 "add r12, r13\t\n"
1039 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
1040 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
1041 "add r12, r13\t\n"
1042 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
1043 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
1044 "add r12, r13\t\n"
1045 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
1046 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
1047 "add r12, r13\t\n"
1048 "vmovups ymmword PTR [r12 + 0], ymm16\t\n"
1049 "vmovups ymmword PTR [r12 + 32], ymm17\t\n"
1050 "add r12, r13\t\n"
1051 "vmovups ymmword PTR [r12 + 0], ymm18\t\n"
1052 "vmovups ymmword PTR [r12 + 32], ymm19\t\n"
1053
1054 // next outer iteration
1055 "add rcx, 64\t\n"
1056 "mov r12, rcx\t\n"
1057 "mov r9, rax\t\n"
1058 "inc rbx\t\n"
1059 "cmp rbx, rdi\t\n"
1060 "jl loop_outter%=\t\n"
1061 :
1062 : [gp] "rm"(gp)
1063 : "r8",
1064 "r9",
1065 "r10",
1066 "r11",
1067 "r13",
1068 "r14",
1069 "rax",
1070 "rcx",
1071 "rsi",
1072 "rdi",
1073 "rbx",
1074 "r12",
1075 "r15",
1076 "memory");
1077}
1078void NOINLINE gemmkernel_11x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
1079 asm volatile(
1080#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
1081 "mov %[gp], %%r14\t\n"
1082 ".intel_syntax noprefix\t\n"
1083#else
1084 "mov r14, %[gp]\t\n"
1085#endif
1086
1087 // Copy parameters
1088 // k
1089 "mov r8, [r14 + 0]\t\n"
1090 "dec r8\t\n"
1091 // A
1092 "mov r9, [r14 + 8]\t\n"
1093 // B
1094 "mov r10, [r14 + 16]\t\n"
1095 // beta
1096 "lea r15, [r14 + 24]\t\n"
1097 // C
1098 "mov r12, [r14 + 32]\t\n"
1099 // ldc
1100 "mov r13, [r14 + 40]\t\n"
1101 // b_block_cols
1102 "mov rdi, [r14 + 48]\t\n"
1103 // b_block_size
1104 "mov rsi, [r14 + 56]\t\n"
1105
1106 // Make copies of A and C
1107 "mov rax, r9\t\n"
1108 "mov rcx, r12\t\n"
1109
1110 "xor ebx, ebx\t\n"
1111 "loop_outter%=:\t\n"
1112 "mov r14, r8\t\n"
1113 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
1114 "vcvtph2ps ymm23,XMMWORD PTR [r10 + 0]\t\n"
1115 "vcvtph2ps ymm24,XMMWORD PTR [r10 + 16]\t\n"
1116 "vxorps xmm0, xmm0, xmm0\t\n"
1117 "vcomiss xmm31, xmm0\t\n"
1118 "jz zero_regs%=\t\n"
1119
1120 // Setup values with beta multiplication
1121 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
1122 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
1123 "add r12, r13\t\n"
1124 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
1125 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
1126 "add r12, r13\t\n"
1127 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
1128 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
1129 "add r12, r13\t\n"
1130 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
1131 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
1132 "add r12, r13\t\n"
1133 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
1134 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
1135 "add r12, r13\t\n"
1136 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
1137 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
1138 "add r12, r13\t\n"
1139 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
1140 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
1141 "add r12, r13\t\n"
1142 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
1143 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
1144 "add r12, r13\t\n"
1145 "vmulps ymm16, ymm31, [r12 + 0]\t\n"
1146 "vmulps ymm17, ymm31, [r12 + 32]\t\n"
1147 "add r12, r13\t\n"
1148 "vmulps ymm18, ymm31, [r12 + 0]\t\n"
1149 "vmulps ymm19, ymm31, [r12 + 32]\t\n"
1150 "add r12, r13\t\n"
1151 "vmulps ymm20, ymm31, [r12 + 0]\t\n"
1152 "vmulps ymm21, ymm31, [r12 + 32]\t\n"
1153 "test r14,r14\t\n"
1154 "jz skip_preload%=\t\n"
1155 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1156 "skip_preload%=:\t\n"
1157 "vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
1158 "vfmadd231ps ymm0,ymm23,ymm22\t\n"
1159 "vfmadd231ps ymm1,ymm24,ymm22\t\n"
1160 "vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
1161 "vfmadd231ps ymm2,ymm23,ymm22\t\n"
1162 "vfmadd231ps ymm3,ymm24,ymm22\t\n"
1163 "vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
1164 "vfmadd231ps ymm4,ymm23,ymm22\t\n"
1165 "vfmadd231ps ymm5,ymm24,ymm22\t\n"
1166 "vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
1167 "vfmadd231ps ymm6,ymm23,ymm22\t\n"
1168 "vfmadd231ps ymm7,ymm24,ymm22\t\n"
1169 "vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
1170 "vfmadd231ps ymm8,ymm23,ymm22\t\n"
1171 "vfmadd231ps ymm9,ymm24,ymm22\t\n"
1172 "vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
1173 "vfmadd231ps ymm10,ymm23,ymm22\t\n"
1174 "vfmadd231ps ymm11,ymm24,ymm22\t\n"
1175 "vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
1176 "vfmadd231ps ymm12,ymm23,ymm22\t\n"
1177 "vfmadd231ps ymm13,ymm24,ymm22\t\n"
1178 "vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
1179 "vfmadd231ps ymm14,ymm23,ymm22\t\n"
1180 "vfmadd231ps ymm15,ymm24,ymm22\t\n"
1181 "vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
1182 "vfmadd231ps ymm16,ymm23,ymm22\t\n"
1183 "vfmadd231ps ymm17,ymm24,ymm22\t\n"
1184 "vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
1185 "vfmadd231ps ymm18,ymm23,ymm22\t\n"
1186 "vfmadd231ps ymm19,ymm24,ymm22\t\n"
1187 "vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
1188 "vfmadd231ps ymm20,ymm23,ymm22\t\n"
1189 "vfmadd231ps ymm21,ymm24,ymm22\t\n"
1190 "mov r12, rcx\t\n"
1191 "test r14,r14\t\n"
1192 "jnz next_inner%=\t\n"
1193 "add r10,32\t\n"
1194 "jmp dump_C%=\t\n"
1195
1196 "zero_regs%=:\t\n"
1197
1198 "test r14,r14\t\n"
1199 "jz skip_preload_b_zero%=\t\n"
1200 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1201 "skip_preload_b_zero%=:\t\n"
1202 "vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
1203 "vmulps ymm0,ymm23,ymm22\t\n"
1204 "vmulps ymm1,ymm24,ymm22\t\n"
1205 "add r12, r13\t\n"
1206 "vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
1207 "vmulps ymm2,ymm23,ymm22\t\n"
1208 "vmulps ymm3,ymm24,ymm22\t\n"
1209 "add r12, r13\t\n"
1210 "vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
1211 "vmulps ymm4,ymm23,ymm22\t\n"
1212 "vmulps ymm5,ymm24,ymm22\t\n"
1213 "add r12, r13\t\n"
1214 "vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
1215 "vmulps ymm6,ymm23,ymm22\t\n"
1216 "vmulps ymm7,ymm24,ymm22\t\n"
1217 "add r12, r13\t\n"
1218 "vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
1219 "vmulps ymm8,ymm23,ymm22\t\n"
1220 "vmulps ymm9,ymm24,ymm22\t\n"
1221 "add r12, r13\t\n"
1222 "vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
1223 "vmulps ymm10,ymm23,ymm22\t\n"
1224 "vmulps ymm11,ymm24,ymm22\t\n"
1225 "add r12, r13\t\n"
1226 "vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
1227 "vmulps ymm12,ymm23,ymm22\t\n"
1228 "vmulps ymm13,ymm24,ymm22\t\n"
1229 "add r12, r13\t\n"
1230 "vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
1231 "vmulps ymm14,ymm23,ymm22\t\n"
1232 "vmulps ymm15,ymm24,ymm22\t\n"
1233 "add r12, r13\t\n"
1234 "vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
1235 "vmulps ymm16,ymm23,ymm22\t\n"
1236 "vmulps ymm17,ymm24,ymm22\t\n"
1237 "add r12, r13\t\n"
1238 "vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
1239 "vmulps ymm18,ymm23,ymm22\t\n"
1240 "vmulps ymm19,ymm24,ymm22\t\n"
1241 "add r12, r13\t\n"
1242 "vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
1243 "vmulps ymm20,ymm23,ymm22\t\n"
1244 "vmulps ymm21,ymm24,ymm22\t\n"
1245 "mov r12, rcx\t\n"
1246 "test r14,r14\t\n"
1247 "jnz next_inner%=\t\n"
1248 "add r10,32\t\n"
1249 "jmp dump_C%=\t\n"
1250
1251 "loop_inner%=:\t\n"
1252
1253 "vmovaps ymm23,ymm31\t\n"
1254 "vcvtph2ps ymm24,XMMWORD PTR [r10 + 16]\t\n"
1255 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1256 "vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
1257 "vfmadd231ps ymm0,ymm23,ymm22\t\n"
1258 "vfmadd231ps ymm1,ymm24,ymm22\t\n"
1259 "vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
1260 "vfmadd231ps ymm2,ymm23,ymm22\t\n"
1261 "vfmadd231ps ymm3,ymm24,ymm22\t\n"
1262 "vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
1263 "vfmadd231ps ymm4,ymm23,ymm22\t\n"
1264 "vfmadd231ps ymm5,ymm24,ymm22\t\n"
1265 "vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
1266 "vfmadd231ps ymm6,ymm23,ymm22\t\n"
1267 "vfmadd231ps ymm7,ymm24,ymm22\t\n"
1268 "vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
1269 "vfmadd231ps ymm8,ymm23,ymm22\t\n"
1270 "vfmadd231ps ymm9,ymm24,ymm22\t\n"
1271 "vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
1272 "vfmadd231ps ymm10,ymm23,ymm22\t\n"
1273 "vfmadd231ps ymm11,ymm24,ymm22\t\n"
1274 "vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
1275 "vfmadd231ps ymm12,ymm23,ymm22\t\n"
1276 "vfmadd231ps ymm13,ymm24,ymm22\t\n"
1277 "vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
1278 "vfmadd231ps ymm14,ymm23,ymm22\t\n"
1279 "vfmadd231ps ymm15,ymm24,ymm22\t\n"
1280 "vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
1281 "vfmadd231ps ymm16,ymm23,ymm22\t\n"
1282 "vfmadd231ps ymm17,ymm24,ymm22\t\n"
1283 "vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
1284 "vfmadd231ps ymm18,ymm23,ymm22\t\n"
1285 "vfmadd231ps ymm19,ymm24,ymm22\t\n"
1286 "vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
1287 "vfmadd231ps ymm20,ymm23,ymm22\t\n"
1288 "vfmadd231ps ymm21,ymm24,ymm22\t\n"
1289
1290 "next_inner%=:\t\n"
1291 "add r9,44\t\n"
1292 "add r10,32\t\n"
1293 "dec r14\t\n"
1294 "jnz loop_inner%=\t\n"
1295
1296 "vmovaps ymm23,ymm31\t\n"
1297 "vcvtph2ps ymm24,XMMWORD PTR [r10 + 16]\t\n"
1298 "vbroadcastss ymm22,DWORD PTR [r9+0]\t\n"
1299 "vfmadd231ps ymm0,ymm23,ymm22\t\n"
1300 "vfmadd231ps ymm1,ymm24,ymm22\t\n"
1301 "vbroadcastss ymm22,DWORD PTR [r9+4]\t\n"
1302 "vfmadd231ps ymm2,ymm23,ymm22\t\n"
1303 "vfmadd231ps ymm3,ymm24,ymm22\t\n"
1304 "vbroadcastss ymm22,DWORD PTR [r9+8]\t\n"
1305 "vfmadd231ps ymm4,ymm23,ymm22\t\n"
1306 "vfmadd231ps ymm5,ymm24,ymm22\t\n"
1307 "vbroadcastss ymm22,DWORD PTR [r9+12]\t\n"
1308 "vfmadd231ps ymm6,ymm23,ymm22\t\n"
1309 "vfmadd231ps ymm7,ymm24,ymm22\t\n"
1310 "vbroadcastss ymm22,DWORD PTR [r9+16]\t\n"
1311 "vfmadd231ps ymm8,ymm23,ymm22\t\n"
1312 "vfmadd231ps ymm9,ymm24,ymm22\t\n"
1313 "vbroadcastss ymm22,DWORD PTR [r9+20]\t\n"
1314 "vfmadd231ps ymm10,ymm23,ymm22\t\n"
1315 "vfmadd231ps ymm11,ymm24,ymm22\t\n"
1316 "vbroadcastss ymm22,DWORD PTR [r9+24]\t\n"
1317 "vfmadd231ps ymm12,ymm23,ymm22\t\n"
1318 "vfmadd231ps ymm13,ymm24,ymm22\t\n"
1319 "vbroadcastss ymm22,DWORD PTR [r9+28]\t\n"
1320 "vfmadd231ps ymm14,ymm23,ymm22\t\n"
1321 "vfmadd231ps ymm15,ymm24,ymm22\t\n"
1322 "vbroadcastss ymm22,DWORD PTR [r9+32]\t\n"
1323 "vfmadd231ps ymm16,ymm23,ymm22\t\n"
1324 "vfmadd231ps ymm17,ymm24,ymm22\t\n"
1325 "vbroadcastss ymm22,DWORD PTR [r9+36]\t\n"
1326 "vfmadd231ps ymm18,ymm23,ymm22\t\n"
1327 "vfmadd231ps ymm19,ymm24,ymm22\t\n"
1328 "vbroadcastss ymm22,DWORD PTR [r9+40]\t\n"
1329 "vfmadd231ps ymm20,ymm23,ymm22\t\n"
1330 "vfmadd231ps ymm21,ymm24,ymm22\t\n"
1331 "add r9,44\t\n"
1332 "add r10,32\t\n"
1333 // Dump C
1334 "dump_C%=:\t\n"
1335 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
1336 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
1337 "add r12, r13\t\n"
1338 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
1339 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
1340 "add r12, r13\t\n"
1341 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
1342 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
1343 "add r12, r13\t\n"
1344 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
1345 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
1346 "add r12, r13\t\n"
1347 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
1348 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
1349 "add r12, r13\t\n"
1350 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
1351 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
1352 "add r12, r13\t\n"
1353 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
1354 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
1355 "add r12, r13\t\n"
1356 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
1357 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
1358 "add r12, r13\t\n"
1359 "vmovups ymmword PTR [r12 + 0], ymm16\t\n"
1360 "vmovups ymmword PTR [r12 + 32], ymm17\t\n"
1361 "add r12, r13\t\n"
1362 "vmovups ymmword PTR [r12 + 0], ymm18\t\n"
1363 "vmovups ymmword PTR [r12 + 32], ymm19\t\n"
1364 "add r12, r13\t\n"
1365 "vmovups ymmword PTR [r12 + 0], ymm20\t\n"
1366 "vmovups ymmword PTR [r12 + 32], ymm21\t\n"
1367
1368 // next outer iteration
1369 "add rcx, 64\t\n"
1370 "mov r12, rcx\t\n"
1371 "mov r9, rax\t\n"
1372 "inc rbx\t\n"
1373 "cmp rbx, rdi\t\n"
1374 "jl loop_outter%=\t\n"
1375 :
1376 : [gp] "rm"(gp)
1377 : "r8",
1378 "r9",
1379 "r10",
1380 "r11",
1381 "r13",
1382 "r14",
1383 "rax",
1384 "rcx",
1385 "rsi",
1386 "rdi",
1387 "rbx",
1388 "r12",
1389 "r15",
1390 "memory");
1391}
1392void NOINLINE gemmkernel_12x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
1393 asm volatile(
1394#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
1395 "mov %[gp], %%r14\t\n"
1396 ".intel_syntax noprefix\t\n"
1397#else
1398 "mov r14, %[gp]\t\n"
1399#endif
1400
1401 // Copy parameters
1402 // k
1403 "mov r8, [r14 + 0]\t\n"
1404 "dec r8\t\n"
1405 // A
1406 "mov r9, [r14 + 8]\t\n"
1407 // B
1408 "mov r10, [r14 + 16]\t\n"
1409 // beta
1410 "lea r15, [r14 + 24]\t\n"
1411 // C
1412 "mov r12, [r14 + 32]\t\n"
1413 // ldc
1414 "mov r13, [r14 + 40]\t\n"
1415 // b_block_cols
1416 "mov rdi, [r14 + 48]\t\n"
1417 // b_block_size
1418 "mov rsi, [r14 + 56]\t\n"
1419
1420 // Make copies of A and C
1421 "mov rax, r9\t\n"
1422 "mov rcx, r12\t\n"
1423
1424 "xor ebx, ebx\t\n"
1425 "loop_outter%=:\t\n"
1426 "mov r14, r8\t\n"
1427 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
1428 "vcvtph2ps ymm25,XMMWORD PTR [r10 + 0]\t\n"
1429 "vcvtph2ps ymm26,XMMWORD PTR [r10 + 16]\t\n"
1430 "vxorps xmm0, xmm0, xmm0\t\n"
1431 "vcomiss xmm31, xmm0\t\n"
1432 "jz zero_regs%=\t\n"
1433
1434 // Setup values with beta multiplication
1435 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
1436 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
1437 "add r12, r13\t\n"
1438 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
1439 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
1440 "add r12, r13\t\n"
1441 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
1442 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
1443 "add r12, r13\t\n"
1444 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
1445 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
1446 "add r12, r13\t\n"
1447 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
1448 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
1449 "add r12, r13\t\n"
1450 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
1451 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
1452 "add r12, r13\t\n"
1453 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
1454 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
1455 "add r12, r13\t\n"
1456 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
1457 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
1458 "add r12, r13\t\n"
1459 "vmulps ymm16, ymm31, [r12 + 0]\t\n"
1460 "vmulps ymm17, ymm31, [r12 + 32]\t\n"
1461 "add r12, r13\t\n"
1462 "vmulps ymm18, ymm31, [r12 + 0]\t\n"
1463 "vmulps ymm19, ymm31, [r12 + 32]\t\n"
1464 "add r12, r13\t\n"
1465 "vmulps ymm20, ymm31, [r12 + 0]\t\n"
1466 "vmulps ymm21, ymm31, [r12 + 32]\t\n"
1467 "add r12, r13\t\n"
1468 "vmulps ymm22, ymm31, [r12 + 0]\t\n"
1469 "vmulps ymm23, ymm31, [r12 + 32]\t\n"
1470 "test r14,r14\t\n"
1471 "jz skip_preload%=\t\n"
1472 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1473 "skip_preload%=:\t\n"
1474 "vbroadcastss ymm24,DWORD PTR [r9+0]\t\n"
1475 "vfmadd231ps ymm0,ymm25,ymm24\t\n"
1476 "vfmadd231ps ymm1,ymm26,ymm24\t\n"
1477 "vbroadcastss ymm24,DWORD PTR [r9+4]\t\n"
1478 "vfmadd231ps ymm2,ymm25,ymm24\t\n"
1479 "vfmadd231ps ymm3,ymm26,ymm24\t\n"
1480 "vbroadcastss ymm24,DWORD PTR [r9+8]\t\n"
1481 "vfmadd231ps ymm4,ymm25,ymm24\t\n"
1482 "vfmadd231ps ymm5,ymm26,ymm24\t\n"
1483 "vbroadcastss ymm24,DWORD PTR [r9+12]\t\n"
1484 "vfmadd231ps ymm6,ymm25,ymm24\t\n"
1485 "vfmadd231ps ymm7,ymm26,ymm24\t\n"
1486 "vbroadcastss ymm24,DWORD PTR [r9+16]\t\n"
1487 "vfmadd231ps ymm8,ymm25,ymm24\t\n"
1488 "vfmadd231ps ymm9,ymm26,ymm24\t\n"
1489 "vbroadcastss ymm24,DWORD PTR [r9+20]\t\n"
1490 "vfmadd231ps ymm10,ymm25,ymm24\t\n"
1491 "vfmadd231ps ymm11,ymm26,ymm24\t\n"
1492 "vbroadcastss ymm24,DWORD PTR [r9+24]\t\n"
1493 "vfmadd231ps ymm12,ymm25,ymm24\t\n"
1494 "vfmadd231ps ymm13,ymm26,ymm24\t\n"
1495 "vbroadcastss ymm24,DWORD PTR [r9+28]\t\n"
1496 "vfmadd231ps ymm14,ymm25,ymm24\t\n"
1497 "vfmadd231ps ymm15,ymm26,ymm24\t\n"
1498 "vbroadcastss ymm24,DWORD PTR [r9+32]\t\n"
1499 "vfmadd231ps ymm16,ymm25,ymm24\t\n"
1500 "vfmadd231ps ymm17,ymm26,ymm24\t\n"
1501 "vbroadcastss ymm24,DWORD PTR [r9+36]\t\n"
1502 "vfmadd231ps ymm18,ymm25,ymm24\t\n"
1503 "vfmadd231ps ymm19,ymm26,ymm24\t\n"
1504 "vbroadcastss ymm24,DWORD PTR [r9+40]\t\n"
1505 "vfmadd231ps ymm20,ymm25,ymm24\t\n"
1506 "vfmadd231ps ymm21,ymm26,ymm24\t\n"
1507 "vbroadcastss ymm24,DWORD PTR [r9+44]\t\n"
1508 "vfmadd231ps ymm22,ymm25,ymm24\t\n"
1509 "vfmadd231ps ymm23,ymm26,ymm24\t\n"
1510 "mov r12, rcx\t\n"
1511 "test r14,r14\t\n"
1512 "jnz next_inner%=\t\n"
1513 "add r10,32\t\n"
1514 "jmp dump_C%=\t\n"
1515
1516 "zero_regs%=:\t\n"
1517
1518 "test r14,r14\t\n"
1519 "jz skip_preload_b_zero%=\t\n"
1520 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1521 "skip_preload_b_zero%=:\t\n"
1522 "vbroadcastss ymm24,DWORD PTR [r9+0]\t\n"
1523 "vmulps ymm0,ymm25,ymm24\t\n"
1524 "vmulps ymm1,ymm26,ymm24\t\n"
1525 "add r12, r13\t\n"
1526 "vbroadcastss ymm24,DWORD PTR [r9+4]\t\n"
1527 "vmulps ymm2,ymm25,ymm24\t\n"
1528 "vmulps ymm3,ymm26,ymm24\t\n"
1529 "add r12, r13\t\n"
1530 "vbroadcastss ymm24,DWORD PTR [r9+8]\t\n"
1531 "vmulps ymm4,ymm25,ymm24\t\n"
1532 "vmulps ymm5,ymm26,ymm24\t\n"
1533 "add r12, r13\t\n"
1534 "vbroadcastss ymm24,DWORD PTR [r9+12]\t\n"
1535 "vmulps ymm6,ymm25,ymm24\t\n"
1536 "vmulps ymm7,ymm26,ymm24\t\n"
1537 "add r12, r13\t\n"
1538 "vbroadcastss ymm24,DWORD PTR [r9+16]\t\n"
1539 "vmulps ymm8,ymm25,ymm24\t\n"
1540 "vmulps ymm9,ymm26,ymm24\t\n"
1541 "add r12, r13\t\n"
1542 "vbroadcastss ymm24,DWORD PTR [r9+20]\t\n"
1543 "vmulps ymm10,ymm25,ymm24\t\n"
1544 "vmulps ymm11,ymm26,ymm24\t\n"
1545 "add r12, r13\t\n"
1546 "vbroadcastss ymm24,DWORD PTR [r9+24]\t\n"
1547 "vmulps ymm12,ymm25,ymm24\t\n"
1548 "vmulps ymm13,ymm26,ymm24\t\n"
1549 "add r12, r13\t\n"
1550 "vbroadcastss ymm24,DWORD PTR [r9+28]\t\n"
1551 "vmulps ymm14,ymm25,ymm24\t\n"
1552 "vmulps ymm15,ymm26,ymm24\t\n"
1553 "add r12, r13\t\n"
1554 "vbroadcastss ymm24,DWORD PTR [r9+32]\t\n"
1555 "vmulps ymm16,ymm25,ymm24\t\n"
1556 "vmulps ymm17,ymm26,ymm24\t\n"
1557 "add r12, r13\t\n"
1558 "vbroadcastss ymm24,DWORD PTR [r9+36]\t\n"
1559 "vmulps ymm18,ymm25,ymm24\t\n"
1560 "vmulps ymm19,ymm26,ymm24\t\n"
1561 "add r12, r13\t\n"
1562 "vbroadcastss ymm24,DWORD PTR [r9+40]\t\n"
1563 "vmulps ymm20,ymm25,ymm24\t\n"
1564 "vmulps ymm21,ymm26,ymm24\t\n"
1565 "add r12, r13\t\n"
1566 "vbroadcastss ymm24,DWORD PTR [r9+44]\t\n"
1567 "vmulps ymm22,ymm25,ymm24\t\n"
1568 "vmulps ymm23,ymm26,ymm24\t\n"
1569 "mov r12, rcx\t\n"
1570 "test r14,r14\t\n"
1571 "jnz next_inner%=\t\n"
1572 "add r10,32\t\n"
1573 "jmp dump_C%=\t\n"
1574
1575 "loop_inner%=:\t\n"
1576
1577 "vmovaps ymm25,ymm31\t\n"
1578 "vcvtph2ps ymm26,XMMWORD PTR [r10 + 16]\t\n"
1579 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1580 "vbroadcastss ymm24,DWORD PTR [r9+0]\t\n"
1581 "vfmadd231ps ymm0,ymm25,ymm24\t\n"
1582 "vfmadd231ps ymm1,ymm26,ymm24\t\n"
1583 "vbroadcastss ymm24,DWORD PTR [r9+4]\t\n"
1584 "vfmadd231ps ymm2,ymm25,ymm24\t\n"
1585 "vfmadd231ps ymm3,ymm26,ymm24\t\n"
1586 "vbroadcastss ymm24,DWORD PTR [r9+8]\t\n"
1587 "vfmadd231ps ymm4,ymm25,ymm24\t\n"
1588 "vfmadd231ps ymm5,ymm26,ymm24\t\n"
1589 "vbroadcastss ymm24,DWORD PTR [r9+12]\t\n"
1590 "vfmadd231ps ymm6,ymm25,ymm24\t\n"
1591 "vfmadd231ps ymm7,ymm26,ymm24\t\n"
1592 "vbroadcastss ymm24,DWORD PTR [r9+16]\t\n"
1593 "vfmadd231ps ymm8,ymm25,ymm24\t\n"
1594 "vfmadd231ps ymm9,ymm26,ymm24\t\n"
1595 "vbroadcastss ymm24,DWORD PTR [r9+20]\t\n"
1596 "vfmadd231ps ymm10,ymm25,ymm24\t\n"
1597 "vfmadd231ps ymm11,ymm26,ymm24\t\n"
1598 "vbroadcastss ymm24,DWORD PTR [r9+24]\t\n"
1599 "vfmadd231ps ymm12,ymm25,ymm24\t\n"
1600 "vfmadd231ps ymm13,ymm26,ymm24\t\n"
1601 "vbroadcastss ymm24,DWORD PTR [r9+28]\t\n"
1602 "vfmadd231ps ymm14,ymm25,ymm24\t\n"
1603 "vfmadd231ps ymm15,ymm26,ymm24\t\n"
1604 "vbroadcastss ymm24,DWORD PTR [r9+32]\t\n"
1605 "vfmadd231ps ymm16,ymm25,ymm24\t\n"
1606 "vfmadd231ps ymm17,ymm26,ymm24\t\n"
1607 "vbroadcastss ymm24,DWORD PTR [r9+36]\t\n"
1608 "vfmadd231ps ymm18,ymm25,ymm24\t\n"
1609 "vfmadd231ps ymm19,ymm26,ymm24\t\n"
1610 "vbroadcastss ymm24,DWORD PTR [r9+40]\t\n"
1611 "vfmadd231ps ymm20,ymm25,ymm24\t\n"
1612 "vfmadd231ps ymm21,ymm26,ymm24\t\n"
1613 "vbroadcastss ymm24,DWORD PTR [r9+44]\t\n"
1614 "vfmadd231ps ymm22,ymm25,ymm24\t\n"
1615 "vfmadd231ps ymm23,ymm26,ymm24\t\n"
1616
1617 "next_inner%=:\t\n"
1618 "add r9,48\t\n"
1619 "add r10,32\t\n"
1620 "dec r14\t\n"
1621 "jnz loop_inner%=\t\n"
1622
1623 "vmovaps ymm25,ymm31\t\n"
1624 "vcvtph2ps ymm26,XMMWORD PTR [r10 + 16]\t\n"
1625 "vbroadcastss ymm24,DWORD PTR [r9+0]\t\n"
1626 "vfmadd231ps ymm0,ymm25,ymm24\t\n"
1627 "vfmadd231ps ymm1,ymm26,ymm24\t\n"
1628 "vbroadcastss ymm24,DWORD PTR [r9+4]\t\n"
1629 "vfmadd231ps ymm2,ymm25,ymm24\t\n"
1630 "vfmadd231ps ymm3,ymm26,ymm24\t\n"
1631 "vbroadcastss ymm24,DWORD PTR [r9+8]\t\n"
1632 "vfmadd231ps ymm4,ymm25,ymm24\t\n"
1633 "vfmadd231ps ymm5,ymm26,ymm24\t\n"
1634 "vbroadcastss ymm24,DWORD PTR [r9+12]\t\n"
1635 "vfmadd231ps ymm6,ymm25,ymm24\t\n"
1636 "vfmadd231ps ymm7,ymm26,ymm24\t\n"
1637 "vbroadcastss ymm24,DWORD PTR [r9+16]\t\n"
1638 "vfmadd231ps ymm8,ymm25,ymm24\t\n"
1639 "vfmadd231ps ymm9,ymm26,ymm24\t\n"
1640 "vbroadcastss ymm24,DWORD PTR [r9+20]\t\n"
1641 "vfmadd231ps ymm10,ymm25,ymm24\t\n"
1642 "vfmadd231ps ymm11,ymm26,ymm24\t\n"
1643 "vbroadcastss ymm24,DWORD PTR [r9+24]\t\n"
1644 "vfmadd231ps ymm12,ymm25,ymm24\t\n"
1645 "vfmadd231ps ymm13,ymm26,ymm24\t\n"
1646 "vbroadcastss ymm24,DWORD PTR [r9+28]\t\n"
1647 "vfmadd231ps ymm14,ymm25,ymm24\t\n"
1648 "vfmadd231ps ymm15,ymm26,ymm24\t\n"
1649 "vbroadcastss ymm24,DWORD PTR [r9+32]\t\n"
1650 "vfmadd231ps ymm16,ymm25,ymm24\t\n"
1651 "vfmadd231ps ymm17,ymm26,ymm24\t\n"
1652 "vbroadcastss ymm24,DWORD PTR [r9+36]\t\n"
1653 "vfmadd231ps ymm18,ymm25,ymm24\t\n"
1654 "vfmadd231ps ymm19,ymm26,ymm24\t\n"
1655 "vbroadcastss ymm24,DWORD PTR [r9+40]\t\n"
1656 "vfmadd231ps ymm20,ymm25,ymm24\t\n"
1657 "vfmadd231ps ymm21,ymm26,ymm24\t\n"
1658 "vbroadcastss ymm24,DWORD PTR [r9+44]\t\n"
1659 "vfmadd231ps ymm22,ymm25,ymm24\t\n"
1660 "vfmadd231ps ymm23,ymm26,ymm24\t\n"
1661 "add r9,48\t\n"
1662 "add r10,32\t\n"
1663 // Dump C
1664 "dump_C%=:\t\n"
1665 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
1666 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
1667 "add r12, r13\t\n"
1668 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
1669 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
1670 "add r12, r13\t\n"
1671 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
1672 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
1673 "add r12, r13\t\n"
1674 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
1675 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
1676 "add r12, r13\t\n"
1677 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
1678 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
1679 "add r12, r13\t\n"
1680 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
1681 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
1682 "add r12, r13\t\n"
1683 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
1684 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
1685 "add r12, r13\t\n"
1686 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
1687 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
1688 "add r12, r13\t\n"
1689 "vmovups ymmword PTR [r12 + 0], ymm16\t\n"
1690 "vmovups ymmword PTR [r12 + 32], ymm17\t\n"
1691 "add r12, r13\t\n"
1692 "vmovups ymmword PTR [r12 + 0], ymm18\t\n"
1693 "vmovups ymmword PTR [r12 + 32], ymm19\t\n"
1694 "add r12, r13\t\n"
1695 "vmovups ymmword PTR [r12 + 0], ymm20\t\n"
1696 "vmovups ymmword PTR [r12 + 32], ymm21\t\n"
1697 "add r12, r13\t\n"
1698 "vmovups ymmword PTR [r12 + 0], ymm22\t\n"
1699 "vmovups ymmword PTR [r12 + 32], ymm23\t\n"
1700
1701 // next outer iteration
1702 "add rcx, 64\t\n"
1703 "mov r12, rcx\t\n"
1704 "mov r9, rax\t\n"
1705 "inc rbx\t\n"
1706 "cmp rbx, rdi\t\n"
1707 "jl loop_outter%=\t\n"
1708 :
1709 : [gp] "rm"(gp)
1710 : "r8",
1711 "r9",
1712 "r10",
1713 "r11",
1714 "r13",
1715 "r14",
1716 "rax",
1717 "rcx",
1718 "rsi",
1719 "rdi",
1720 "rbx",
1721 "r12",
1722 "r15",
1723 "memory");
1724}
1725void NOINLINE gemmkernel_13x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
1726 asm volatile(
1727#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
1728 "mov %[gp], %%r14\t\n"
1729 ".intel_syntax noprefix\t\n"
1730#else
1731 "mov r14, %[gp]\t\n"
1732#endif
1733
1734 // Copy parameters
1735 // k
1736 "mov r8, [r14 + 0]\t\n"
1737 "dec r8\t\n"
1738 // A
1739 "mov r9, [r14 + 8]\t\n"
1740 // B
1741 "mov r10, [r14 + 16]\t\n"
1742 // beta
1743 "lea r15, [r14 + 24]\t\n"
1744 // C
1745 "mov r12, [r14 + 32]\t\n"
1746 // ldc
1747 "mov r13, [r14 + 40]\t\n"
1748 // b_block_cols
1749 "mov rdi, [r14 + 48]\t\n"
1750 // b_block_size
1751 "mov rsi, [r14 + 56]\t\n"
1752
1753 // Make copies of A and C
1754 "mov rax, r9\t\n"
1755 "mov rcx, r12\t\n"
1756
1757 "xor ebx, ebx\t\n"
1758 "loop_outter%=:\t\n"
1759 "mov r14, r8\t\n"
1760 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
1761 "vcvtph2ps ymm27,XMMWORD PTR [r10 + 0]\t\n"
1762 "vcvtph2ps ymm28,XMMWORD PTR [r10 + 16]\t\n"
1763 "vxorps xmm0, xmm0, xmm0\t\n"
1764 "vcomiss xmm31, xmm0\t\n"
1765 "jz zero_regs%=\t\n"
1766
1767 // Setup values with beta multiplication
1768 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
1769 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
1770 "add r12, r13\t\n"
1771 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
1772 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
1773 "add r12, r13\t\n"
1774 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
1775 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
1776 "add r12, r13\t\n"
1777 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
1778 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
1779 "add r12, r13\t\n"
1780 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
1781 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
1782 "add r12, r13\t\n"
1783 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
1784 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
1785 "add r12, r13\t\n"
1786 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
1787 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
1788 "add r12, r13\t\n"
1789 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
1790 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
1791 "add r12, r13\t\n"
1792 "vmulps ymm16, ymm31, [r12 + 0]\t\n"
1793 "vmulps ymm17, ymm31, [r12 + 32]\t\n"
1794 "add r12, r13\t\n"
1795 "vmulps ymm18, ymm31, [r12 + 0]\t\n"
1796 "vmulps ymm19, ymm31, [r12 + 32]\t\n"
1797 "add r12, r13\t\n"
1798 "vmulps ymm20, ymm31, [r12 + 0]\t\n"
1799 "vmulps ymm21, ymm31, [r12 + 32]\t\n"
1800 "add r12, r13\t\n"
1801 "vmulps ymm22, ymm31, [r12 + 0]\t\n"
1802 "vmulps ymm23, ymm31, [r12 + 32]\t\n"
1803 "add r12, r13\t\n"
1804 "vmulps ymm24, ymm31, [r12 + 0]\t\n"
1805 "vmulps ymm25, ymm31, [r12 + 32]\t\n"
1806 "test r14,r14\t\n"
1807 "jz skip_preload%=\t\n"
1808 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1809 "skip_preload%=:\t\n"
1810 "vbroadcastss ymm26,DWORD PTR [r9+0]\t\n"
1811 "vfmadd231ps ymm0,ymm27,ymm26\t\n"
1812 "vfmadd231ps ymm1,ymm28,ymm26\t\n"
1813 "vbroadcastss ymm26,DWORD PTR [r9+4]\t\n"
1814 "vfmadd231ps ymm2,ymm27,ymm26\t\n"
1815 "vfmadd231ps ymm3,ymm28,ymm26\t\n"
1816 "vbroadcastss ymm26,DWORD PTR [r9+8]\t\n"
1817 "vfmadd231ps ymm4,ymm27,ymm26\t\n"
1818 "vfmadd231ps ymm5,ymm28,ymm26\t\n"
1819 "vbroadcastss ymm26,DWORD PTR [r9+12]\t\n"
1820 "vfmadd231ps ymm6,ymm27,ymm26\t\n"
1821 "vfmadd231ps ymm7,ymm28,ymm26\t\n"
1822 "vbroadcastss ymm26,DWORD PTR [r9+16]\t\n"
1823 "vfmadd231ps ymm8,ymm27,ymm26\t\n"
1824 "vfmadd231ps ymm9,ymm28,ymm26\t\n"
1825 "vbroadcastss ymm26,DWORD PTR [r9+20]\t\n"
1826 "vfmadd231ps ymm10,ymm27,ymm26\t\n"
1827 "vfmadd231ps ymm11,ymm28,ymm26\t\n"
1828 "vbroadcastss ymm26,DWORD PTR [r9+24]\t\n"
1829 "vfmadd231ps ymm12,ymm27,ymm26\t\n"
1830 "vfmadd231ps ymm13,ymm28,ymm26\t\n"
1831 "vbroadcastss ymm26,DWORD PTR [r9+28]\t\n"
1832 "vfmadd231ps ymm14,ymm27,ymm26\t\n"
1833 "vfmadd231ps ymm15,ymm28,ymm26\t\n"
1834 "vbroadcastss ymm26,DWORD PTR [r9+32]\t\n"
1835 "vfmadd231ps ymm16,ymm27,ymm26\t\n"
1836 "vfmadd231ps ymm17,ymm28,ymm26\t\n"
1837 "vbroadcastss ymm26,DWORD PTR [r9+36]\t\n"
1838 "vfmadd231ps ymm18,ymm27,ymm26\t\n"
1839 "vfmadd231ps ymm19,ymm28,ymm26\t\n"
1840 "vbroadcastss ymm26,DWORD PTR [r9+40]\t\n"
1841 "vfmadd231ps ymm20,ymm27,ymm26\t\n"
1842 "vfmadd231ps ymm21,ymm28,ymm26\t\n"
1843 "vbroadcastss ymm26,DWORD PTR [r9+44]\t\n"
1844 "vfmadd231ps ymm22,ymm27,ymm26\t\n"
1845 "vfmadd231ps ymm23,ymm28,ymm26\t\n"
1846 "vbroadcastss ymm26,DWORD PTR [r9+48]\t\n"
1847 "vfmadd231ps ymm24,ymm27,ymm26\t\n"
1848 "vfmadd231ps ymm25,ymm28,ymm26\t\n"
1849 "mov r12, rcx\t\n"
1850 "test r14,r14\t\n"
1851 "jnz next_inner%=\t\n"
1852 "add r10,32\t\n"
1853 "jmp dump_C%=\t\n"
1854
1855 "zero_regs%=:\t\n"
1856
1857 "test r14,r14\t\n"
1858 "jz skip_preload_b_zero%=\t\n"
1859 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1860 "skip_preload_b_zero%=:\t\n"
1861 "vbroadcastss ymm26,DWORD PTR [r9+0]\t\n"
1862 "vmulps ymm0,ymm27,ymm26\t\n"
1863 "vmulps ymm1,ymm28,ymm26\t\n"
1864 "add r12, r13\t\n"
1865 "vbroadcastss ymm26,DWORD PTR [r9+4]\t\n"
1866 "vmulps ymm2,ymm27,ymm26\t\n"
1867 "vmulps ymm3,ymm28,ymm26\t\n"
1868 "add r12, r13\t\n"
1869 "vbroadcastss ymm26,DWORD PTR [r9+8]\t\n"
1870 "vmulps ymm4,ymm27,ymm26\t\n"
1871 "vmulps ymm5,ymm28,ymm26\t\n"
1872 "add r12, r13\t\n"
1873 "vbroadcastss ymm26,DWORD PTR [r9+12]\t\n"
1874 "vmulps ymm6,ymm27,ymm26\t\n"
1875 "vmulps ymm7,ymm28,ymm26\t\n"
1876 "add r12, r13\t\n"
1877 "vbroadcastss ymm26,DWORD PTR [r9+16]\t\n"
1878 "vmulps ymm8,ymm27,ymm26\t\n"
1879 "vmulps ymm9,ymm28,ymm26\t\n"
1880 "add r12, r13\t\n"
1881 "vbroadcastss ymm26,DWORD PTR [r9+20]\t\n"
1882 "vmulps ymm10,ymm27,ymm26\t\n"
1883 "vmulps ymm11,ymm28,ymm26\t\n"
1884 "add r12, r13\t\n"
1885 "vbroadcastss ymm26,DWORD PTR [r9+24]\t\n"
1886 "vmulps ymm12,ymm27,ymm26\t\n"
1887 "vmulps ymm13,ymm28,ymm26\t\n"
1888 "add r12, r13\t\n"
1889 "vbroadcastss ymm26,DWORD PTR [r9+28]\t\n"
1890 "vmulps ymm14,ymm27,ymm26\t\n"
1891 "vmulps ymm15,ymm28,ymm26\t\n"
1892 "add r12, r13\t\n"
1893 "vbroadcastss ymm26,DWORD PTR [r9+32]\t\n"
1894 "vmulps ymm16,ymm27,ymm26\t\n"
1895 "vmulps ymm17,ymm28,ymm26\t\n"
1896 "add r12, r13\t\n"
1897 "vbroadcastss ymm26,DWORD PTR [r9+36]\t\n"
1898 "vmulps ymm18,ymm27,ymm26\t\n"
1899 "vmulps ymm19,ymm28,ymm26\t\n"
1900 "add r12, r13\t\n"
1901 "vbroadcastss ymm26,DWORD PTR [r9+40]\t\n"
1902 "vmulps ymm20,ymm27,ymm26\t\n"
1903 "vmulps ymm21,ymm28,ymm26\t\n"
1904 "add r12, r13\t\n"
1905 "vbroadcastss ymm26,DWORD PTR [r9+44]\t\n"
1906 "vmulps ymm22,ymm27,ymm26\t\n"
1907 "vmulps ymm23,ymm28,ymm26\t\n"
1908 "add r12, r13\t\n"
1909 "vbroadcastss ymm26,DWORD PTR [r9+48]\t\n"
1910 "vmulps ymm24,ymm27,ymm26\t\n"
1911 "vmulps ymm25,ymm28,ymm26\t\n"
1912 "mov r12, rcx\t\n"
1913 "test r14,r14\t\n"
1914 "jnz next_inner%=\t\n"
1915 "add r10,32\t\n"
1916 "jmp dump_C%=\t\n"
1917
1918 "loop_inner%=:\t\n"
1919
1920 "vmovaps ymm27,ymm31\t\n"
1921 "vcvtph2ps ymm28,XMMWORD PTR [r10 + 16]\t\n"
1922 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
1923 "vbroadcastss ymm26,DWORD PTR [r9+0]\t\n"
1924 "vfmadd231ps ymm0,ymm27,ymm26\t\n"
1925 "vfmadd231ps ymm1,ymm28,ymm26\t\n"
1926 "vbroadcastss ymm26,DWORD PTR [r9+4]\t\n"
1927 "vfmadd231ps ymm2,ymm27,ymm26\t\n"
1928 "vfmadd231ps ymm3,ymm28,ymm26\t\n"
1929 "vbroadcastss ymm26,DWORD PTR [r9+8]\t\n"
1930 "vfmadd231ps ymm4,ymm27,ymm26\t\n"
1931 "vfmadd231ps ymm5,ymm28,ymm26\t\n"
1932 "vbroadcastss ymm26,DWORD PTR [r9+12]\t\n"
1933 "vfmadd231ps ymm6,ymm27,ymm26\t\n"
1934 "vfmadd231ps ymm7,ymm28,ymm26\t\n"
1935 "vbroadcastss ymm26,DWORD PTR [r9+16]\t\n"
1936 "vfmadd231ps ymm8,ymm27,ymm26\t\n"
1937 "vfmadd231ps ymm9,ymm28,ymm26\t\n"
1938 "vbroadcastss ymm26,DWORD PTR [r9+20]\t\n"
1939 "vfmadd231ps ymm10,ymm27,ymm26\t\n"
1940 "vfmadd231ps ymm11,ymm28,ymm26\t\n"
1941 "vbroadcastss ymm26,DWORD PTR [r9+24]\t\n"
1942 "vfmadd231ps ymm12,ymm27,ymm26\t\n"
1943 "vfmadd231ps ymm13,ymm28,ymm26\t\n"
1944 "vbroadcastss ymm26,DWORD PTR [r9+28]\t\n"
1945 "vfmadd231ps ymm14,ymm27,ymm26\t\n"
1946 "vfmadd231ps ymm15,ymm28,ymm26\t\n"
1947 "vbroadcastss ymm26,DWORD PTR [r9+32]\t\n"
1948 "vfmadd231ps ymm16,ymm27,ymm26\t\n"
1949 "vfmadd231ps ymm17,ymm28,ymm26\t\n"
1950 "vbroadcastss ymm26,DWORD PTR [r9+36]\t\n"
1951 "vfmadd231ps ymm18,ymm27,ymm26\t\n"
1952 "vfmadd231ps ymm19,ymm28,ymm26\t\n"
1953 "vbroadcastss ymm26,DWORD PTR [r9+40]\t\n"
1954 "vfmadd231ps ymm20,ymm27,ymm26\t\n"
1955 "vfmadd231ps ymm21,ymm28,ymm26\t\n"
1956 "vbroadcastss ymm26,DWORD PTR [r9+44]\t\n"
1957 "vfmadd231ps ymm22,ymm27,ymm26\t\n"
1958 "vfmadd231ps ymm23,ymm28,ymm26\t\n"
1959 "vbroadcastss ymm26,DWORD PTR [r9+48]\t\n"
1960 "vfmadd231ps ymm24,ymm27,ymm26\t\n"
1961 "vfmadd231ps ymm25,ymm28,ymm26\t\n"
1962
1963 "next_inner%=:\t\n"
1964 "add r9,52\t\n"
1965 "add r10,32\t\n"
1966 "dec r14\t\n"
1967 "jnz loop_inner%=\t\n"
1968
1969 "vmovaps ymm27,ymm31\t\n"
1970 "vcvtph2ps ymm28,XMMWORD PTR [r10 + 16]\t\n"
1971 "vbroadcastss ymm26,DWORD PTR [r9+0]\t\n"
1972 "vfmadd231ps ymm0,ymm27,ymm26\t\n"
1973 "vfmadd231ps ymm1,ymm28,ymm26\t\n"
1974 "vbroadcastss ymm26,DWORD PTR [r9+4]\t\n"
1975 "vfmadd231ps ymm2,ymm27,ymm26\t\n"
1976 "vfmadd231ps ymm3,ymm28,ymm26\t\n"
1977 "vbroadcastss ymm26,DWORD PTR [r9+8]\t\n"
1978 "vfmadd231ps ymm4,ymm27,ymm26\t\n"
1979 "vfmadd231ps ymm5,ymm28,ymm26\t\n"
1980 "vbroadcastss ymm26,DWORD PTR [r9+12]\t\n"
1981 "vfmadd231ps ymm6,ymm27,ymm26\t\n"
1982 "vfmadd231ps ymm7,ymm28,ymm26\t\n"
1983 "vbroadcastss ymm26,DWORD PTR [r9+16]\t\n"
1984 "vfmadd231ps ymm8,ymm27,ymm26\t\n"
1985 "vfmadd231ps ymm9,ymm28,ymm26\t\n"
1986 "vbroadcastss ymm26,DWORD PTR [r9+20]\t\n"
1987 "vfmadd231ps ymm10,ymm27,ymm26\t\n"
1988 "vfmadd231ps ymm11,ymm28,ymm26\t\n"
1989 "vbroadcastss ymm26,DWORD PTR [r9+24]\t\n"
1990 "vfmadd231ps ymm12,ymm27,ymm26\t\n"
1991 "vfmadd231ps ymm13,ymm28,ymm26\t\n"
1992 "vbroadcastss ymm26,DWORD PTR [r9+28]\t\n"
1993 "vfmadd231ps ymm14,ymm27,ymm26\t\n"
1994 "vfmadd231ps ymm15,ymm28,ymm26\t\n"
1995 "vbroadcastss ymm26,DWORD PTR [r9+32]\t\n"
1996 "vfmadd231ps ymm16,ymm27,ymm26\t\n"
1997 "vfmadd231ps ymm17,ymm28,ymm26\t\n"
1998 "vbroadcastss ymm26,DWORD PTR [r9+36]\t\n"
1999 "vfmadd231ps ymm18,ymm27,ymm26\t\n"
2000 "vfmadd231ps ymm19,ymm28,ymm26\t\n"
2001 "vbroadcastss ymm26,DWORD PTR [r9+40]\t\n"
2002 "vfmadd231ps ymm20,ymm27,ymm26\t\n"
2003 "vfmadd231ps ymm21,ymm28,ymm26\t\n"
2004 "vbroadcastss ymm26,DWORD PTR [r9+44]\t\n"
2005 "vfmadd231ps ymm22,ymm27,ymm26\t\n"
2006 "vfmadd231ps ymm23,ymm28,ymm26\t\n"
2007 "vbroadcastss ymm26,DWORD PTR [r9+48]\t\n"
2008 "vfmadd231ps ymm24,ymm27,ymm26\t\n"
2009 "vfmadd231ps ymm25,ymm28,ymm26\t\n"
2010 "add r9,52\t\n"
2011 "add r10,32\t\n"
2012 // Dump C
2013 "dump_C%=:\t\n"
2014 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
2015 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
2016 "add r12, r13\t\n"
2017 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
2018 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
2019 "add r12, r13\t\n"
2020 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
2021 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
2022 "add r12, r13\t\n"
2023 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
2024 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
2025 "add r12, r13\t\n"
2026 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
2027 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
2028 "add r12, r13\t\n"
2029 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
2030 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
2031 "add r12, r13\t\n"
2032 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
2033 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
2034 "add r12, r13\t\n"
2035 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
2036 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
2037 "add r12, r13\t\n"
2038 "vmovups ymmword PTR [r12 + 0], ymm16\t\n"
2039 "vmovups ymmword PTR [r12 + 32], ymm17\t\n"
2040 "add r12, r13\t\n"
2041 "vmovups ymmword PTR [r12 + 0], ymm18\t\n"
2042 "vmovups ymmword PTR [r12 + 32], ymm19\t\n"
2043 "add r12, r13\t\n"
2044 "vmovups ymmword PTR [r12 + 0], ymm20\t\n"
2045 "vmovups ymmword PTR [r12 + 32], ymm21\t\n"
2046 "add r12, r13\t\n"
2047 "vmovups ymmword PTR [r12 + 0], ymm22\t\n"
2048 "vmovups ymmword PTR [r12 + 32], ymm23\t\n"
2049 "add r12, r13\t\n"
2050 "vmovups ymmword PTR [r12 + 0], ymm24\t\n"
2051 "vmovups ymmword PTR [r12 + 32], ymm25\t\n"
2052
2053 // next outer iteration
2054 "add rcx, 64\t\n"
2055 "mov r12, rcx\t\n"
2056 "mov r9, rax\t\n"
2057 "inc rbx\t\n"
2058 "cmp rbx, rdi\t\n"
2059 "jl loop_outter%=\t\n"
2060 :
2061 : [gp] "rm"(gp)
2062 : "r8",
2063 "r9",
2064 "r10",
2065 "r11",
2066 "r13",
2067 "r14",
2068 "rax",
2069 "rcx",
2070 "rsi",
2071 "rdi",
2072 "rbx",
2073 "r12",
2074 "r15",
2075 "memory");
2076}
2077void NOINLINE gemmkernel_14x2_Avx512_256_fp16_fA0fB0fC0(GemmParamsFP16* gp) {
2078 asm volatile(
2079#if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK
2080 "mov %[gp], %%r14\t\n"
2081 ".intel_syntax noprefix\t\n"
2082#else
2083 "mov r14, %[gp]\t\n"
2084#endif
2085
2086 // Copy parameters
2087 // k
2088 "mov r8, [r14 + 0]\t\n"
2089 "dec r8\t\n"
2090 // A
2091 "mov r9, [r14 + 8]\t\n"
2092 // B
2093 "mov r10, [r14 + 16]\t\n"
2094 // beta
2095 "lea r15, [r14 + 24]\t\n"
2096 // C
2097 "mov r12, [r14 + 32]\t\n"
2098 // ldc
2099 "mov r13, [r14 + 40]\t\n"
2100 // b_block_cols
2101 "mov rdi, [r14 + 48]\t\n"
2102 // b_block_size
2103 "mov rsi, [r14 + 56]\t\n"
2104
2105 // Make copies of A and C
2106 "mov rax, r9\t\n"
2107 "mov rcx, r12\t\n"
2108
2109 "xor ebx, ebx\t\n"
2110 "loop_outter%=:\t\n"
2111 "mov r14, r8\t\n"
2112 "vbroadcastss ymm31,DWORD PTR [r15]\t\n"
2113 "vcvtph2ps ymm29,XMMWORD PTR [r10 + 0]\t\n"
2114 "vcvtph2ps ymm30,XMMWORD PTR [r10 + 16]\t\n"
2115 "vxorps xmm0, xmm0, xmm0\t\n"
2116 "vcomiss xmm31, xmm0\t\n"
2117 "jz zero_regs%=\t\n"
2118
2119 // Setup values with beta multiplication
2120 "vmulps ymm0, ymm31, [r12 + 0]\t\n"
2121 "vmulps ymm1, ymm31, [r12 + 32]\t\n"
2122 "add r12, r13\t\n"
2123 "vmulps ymm2, ymm31, [r12 + 0]\t\n"
2124 "vmulps ymm3, ymm31, [r12 + 32]\t\n"
2125 "add r12, r13\t\n"
2126 "vmulps ymm4, ymm31, [r12 + 0]\t\n"
2127 "vmulps ymm5, ymm31, [r12 + 32]\t\n"
2128 "add r12, r13\t\n"
2129 "vmulps ymm6, ymm31, [r12 + 0]\t\n"
2130 "vmulps ymm7, ymm31, [r12 + 32]\t\n"
2131 "add r12, r13\t\n"
2132 "vmulps ymm8, ymm31, [r12 + 0]\t\n"
2133 "vmulps ymm9, ymm31, [r12 + 32]\t\n"
2134 "add r12, r13\t\n"
2135 "vmulps ymm10, ymm31, [r12 + 0]\t\n"
2136 "vmulps ymm11, ymm31, [r12 + 32]\t\n"
2137 "add r12, r13\t\n"
2138 "vmulps ymm12, ymm31, [r12 + 0]\t\n"
2139 "vmulps ymm13, ymm31, [r12 + 32]\t\n"
2140 "add r12, r13\t\n"
2141 "vmulps ymm14, ymm31, [r12 + 0]\t\n"
2142 "vmulps ymm15, ymm31, [r12 + 32]\t\n"
2143 "add r12, r13\t\n"
2144 "vmulps ymm16, ymm31, [r12 + 0]\t\n"
2145 "vmulps ymm17, ymm31, [r12 + 32]\t\n"
2146 "add r12, r13\t\n"
2147 "vmulps ymm18, ymm31, [r12 + 0]\t\n"
2148 "vmulps ymm19, ymm31, [r12 + 32]\t\n"
2149 "add r12, r13\t\n"
2150 "vmulps ymm20, ymm31, [r12 + 0]\t\n"
2151 "vmulps ymm21, ymm31, [r12 + 32]\t\n"
2152 "add r12, r13\t\n"
2153 "vmulps ymm22, ymm31, [r12 + 0]\t\n"
2154 "vmulps ymm23, ymm31, [r12 + 32]\t\n"
2155 "add r12, r13\t\n"
2156 "vmulps ymm24, ymm31, [r12 + 0]\t\n"
2157 "vmulps ymm25, ymm31, [r12 + 32]\t\n"
2158 "add r12, r13\t\n"
2159 "vmulps ymm26, ymm31, [r12 + 0]\t\n"
2160 "vmulps ymm27, ymm31, [r12 + 32]\t\n"
2161 "test r14,r14\t\n"
2162 "jz skip_preload%=\t\n"
2163 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
2164 "skip_preload%=:\t\n"
2165 "vbroadcastss ymm28,DWORD PTR [r9+0]\t\n"
2166 "vfmadd231ps ymm0,ymm29,ymm28\t\n"
2167 "vfmadd231ps ymm1,ymm30,ymm28\t\n"
2168 "vbroadcastss ymm28,DWORD PTR [r9+4]\t\n"
2169 "vfmadd231ps ymm2,ymm29,ymm28\t\n"
2170 "vfmadd231ps ymm3,ymm30,ymm28\t\n"
2171 "vbroadcastss ymm28,DWORD PTR [r9+8]\t\n"
2172 "vfmadd231ps ymm4,ymm29,ymm28\t\n"
2173 "vfmadd231ps ymm5,ymm30,ymm28\t\n"
2174 "vbroadcastss ymm28,DWORD PTR [r9+12]\t\n"
2175 "vfmadd231ps ymm6,ymm29,ymm28\t\n"
2176 "vfmadd231ps ymm7,ymm30,ymm28\t\n"
2177 "vbroadcastss ymm28,DWORD PTR [r9+16]\t\n"
2178 "vfmadd231ps ymm8,ymm29,ymm28\t\n"
2179 "vfmadd231ps ymm9,ymm30,ymm28\t\n"
2180 "vbroadcastss ymm28,DWORD PTR [r9+20]\t\n"
2181 "vfmadd231ps ymm10,ymm29,ymm28\t\n"
2182 "vfmadd231ps ymm11,ymm30,ymm28\t\n"
2183 "vbroadcastss ymm28,DWORD PTR [r9+24]\t\n"
2184 "vfmadd231ps ymm12,ymm29,ymm28\t\n"
2185 "vfmadd231ps ymm13,ymm30,ymm28\t\n"
2186 "vbroadcastss ymm28,DWORD PTR [r9+28]\t\n"
2187 "vfmadd231ps ymm14,ymm29,ymm28\t\n"
2188 "vfmadd231ps ymm15,ymm30,ymm28\t\n"
2189 "vbroadcastss ymm28,DWORD PTR [r9+32]\t\n"
2190 "vfmadd231ps ymm16,ymm29,ymm28\t\n"
2191 "vfmadd231ps ymm17,ymm30,ymm28\t\n"
2192 "vbroadcastss ymm28,DWORD PTR [r9+36]\t\n"
2193 "vfmadd231ps ymm18,ymm29,ymm28\t\n"
2194 "vfmadd231ps ymm19,ymm30,ymm28\t\n"
2195 "vbroadcastss ymm28,DWORD PTR [r9+40]\t\n"
2196 "vfmadd231ps ymm20,ymm29,ymm28\t\n"
2197 "vfmadd231ps ymm21,ymm30,ymm28\t\n"
2198 "vbroadcastss ymm28,DWORD PTR [r9+44]\t\n"
2199 "vfmadd231ps ymm22,ymm29,ymm28\t\n"
2200 "vfmadd231ps ymm23,ymm30,ymm28\t\n"
2201 "vbroadcastss ymm28,DWORD PTR [r9+48]\t\n"
2202 "vfmadd231ps ymm24,ymm29,ymm28\t\n"
2203 "vfmadd231ps ymm25,ymm30,ymm28\t\n"
2204 "vbroadcastss ymm28,DWORD PTR [r9+52]\t\n"
2205 "vfmadd231ps ymm26,ymm29,ymm28\t\n"
2206 "vfmadd231ps ymm27,ymm30,ymm28\t\n"
2207 "mov r12, rcx\t\n"
2208 "test r14,r14\t\n"
2209 "jnz next_inner%=\t\n"
2210 "add r10,32\t\n"
2211 "jmp dump_C%=\t\n"
2212
2213 "zero_regs%=:\t\n"
2214
2215 "test r14,r14\t\n"
2216 "jz skip_preload_b_zero%=\t\n"
2217 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
2218 "skip_preload_b_zero%=:\t\n"
2219 "vbroadcastss ymm28,DWORD PTR [r9+0]\t\n"
2220 "vmulps ymm0,ymm29,ymm28\t\n"
2221 "vmulps ymm1,ymm30,ymm28\t\n"
2222 "add r12, r13\t\n"
2223 "vbroadcastss ymm28,DWORD PTR [r9+4]\t\n"
2224 "vmulps ymm2,ymm29,ymm28\t\n"
2225 "vmulps ymm3,ymm30,ymm28\t\n"
2226 "add r12, r13\t\n"
2227 "vbroadcastss ymm28,DWORD PTR [r9+8]\t\n"
2228 "vmulps ymm4,ymm29,ymm28\t\n"
2229 "vmulps ymm5,ymm30,ymm28\t\n"
2230 "add r12, r13\t\n"
2231 "vbroadcastss ymm28,DWORD PTR [r9+12]\t\n"
2232 "vmulps ymm6,ymm29,ymm28\t\n"
2233 "vmulps ymm7,ymm30,ymm28\t\n"
2234 "add r12, r13\t\n"
2235 "vbroadcastss ymm28,DWORD PTR [r9+16]\t\n"
2236 "vmulps ymm8,ymm29,ymm28\t\n"
2237 "vmulps ymm9,ymm30,ymm28\t\n"
2238 "add r12, r13\t\n"
2239 "vbroadcastss ymm28,DWORD PTR [r9+20]\t\n"
2240 "vmulps ymm10,ymm29,ymm28\t\n"
2241 "vmulps ymm11,ymm30,ymm28\t\n"
2242 "add r12, r13\t\n"
2243 "vbroadcastss ymm28,DWORD PTR [r9+24]\t\n"
2244 "vmulps ymm12,ymm29,ymm28\t\n"
2245 "vmulps ymm13,ymm30,ymm28\t\n"
2246 "add r12, r13\t\n"
2247 "vbroadcastss ymm28,DWORD PTR [r9+28]\t\n"
2248 "vmulps ymm14,ymm29,ymm28\t\n"
2249 "vmulps ymm15,ymm30,ymm28\t\n"
2250 "add r12, r13\t\n"
2251 "vbroadcastss ymm28,DWORD PTR [r9+32]\t\n"
2252 "vmulps ymm16,ymm29,ymm28\t\n"
2253 "vmulps ymm17,ymm30,ymm28\t\n"
2254 "add r12, r13\t\n"
2255 "vbroadcastss ymm28,DWORD PTR [r9+36]\t\n"
2256 "vmulps ymm18,ymm29,ymm28\t\n"
2257 "vmulps ymm19,ymm30,ymm28\t\n"
2258 "add r12, r13\t\n"
2259 "vbroadcastss ymm28,DWORD PTR [r9+40]\t\n"
2260 "vmulps ymm20,ymm29,ymm28\t\n"
2261 "vmulps ymm21,ymm30,ymm28\t\n"
2262 "add r12, r13\t\n"
2263 "vbroadcastss ymm28,DWORD PTR [r9+44]\t\n"
2264 "vmulps ymm22,ymm29,ymm28\t\n"
2265 "vmulps ymm23,ymm30,ymm28\t\n"
2266 "add r12, r13\t\n"
2267 "vbroadcastss ymm28,DWORD PTR [r9+48]\t\n"
2268 "vmulps ymm24,ymm29,ymm28\t\n"
2269 "vmulps ymm25,ymm30,ymm28\t\n"
2270 "add r12, r13\t\n"
2271 "vbroadcastss ymm28,DWORD PTR [r9+52]\t\n"
2272 "vmulps ymm26,ymm29,ymm28\t\n"
2273 "vmulps ymm27,ymm30,ymm28\t\n"
2274 "mov r12, rcx\t\n"
2275 "test r14,r14\t\n"
2276 "jnz next_inner%=\t\n"
2277 "add r10,32\t\n"
2278 "jmp dump_C%=\t\n"
2279
2280 "loop_inner%=:\t\n"
2281
2282 "vmovaps ymm29,ymm31\t\n"
2283 "vcvtph2ps ymm30,XMMWORD PTR [r10 + 16]\t\n"
2284 "vcvtph2ps ymm31,XMMWORD PTR [r10 + 32]\t\n"
2285 "vbroadcastss ymm28,DWORD PTR [r9+0]\t\n"
2286 "vfmadd231ps ymm0,ymm29,ymm28\t\n"
2287 "vfmadd231ps ymm1,ymm30,ymm28\t\n"
2288 "vbroadcastss ymm28,DWORD PTR [r9+4]\t\n"
2289 "vfmadd231ps ymm2,ymm29,ymm28\t\n"
2290 "vfmadd231ps ymm3,ymm30,ymm28\t\n"
2291 "vbroadcastss ymm28,DWORD PTR [r9+8]\t\n"
2292 "vfmadd231ps ymm4,ymm29,ymm28\t\n"
2293 "vfmadd231ps ymm5,ymm30,ymm28\t\n"
2294 "vbroadcastss ymm28,DWORD PTR [r9+12]\t\n"
2295 "vfmadd231ps ymm6,ymm29,ymm28\t\n"
2296 "vfmadd231ps ymm7,ymm30,ymm28\t\n"
2297 "vbroadcastss ymm28,DWORD PTR [r9+16]\t\n"
2298 "vfmadd231ps ymm8,ymm29,ymm28\t\n"
2299 "vfmadd231ps ymm9,ymm30,ymm28\t\n"
2300 "vbroadcastss ymm28,DWORD PTR [r9+20]\t\n"
2301 "vfmadd231ps ymm10,ymm29,ymm28\t\n"
2302 "vfmadd231ps ymm11,ymm30,ymm28\t\n"
2303 "vbroadcastss ymm28,DWORD PTR [r9+24]\t\n"
2304 "vfmadd231ps ymm12,ymm29,ymm28\t\n"
2305 "vfmadd231ps ymm13,ymm30,ymm28\t\n"
2306 "vbroadcastss ymm28,DWORD PTR [r9+28]\t\n"
2307 "vfmadd231ps ymm14,ymm29,ymm28\t\n"
2308 "vfmadd231ps ymm15,ymm30,ymm28\t\n"
2309 "vbroadcastss ymm28,DWORD PTR [r9+32]\t\n"
2310 "vfmadd231ps ymm16,ymm29,ymm28\t\n"
2311 "vfmadd231ps ymm17,ymm30,ymm28\t\n"
2312 "vbroadcastss ymm28,DWORD PTR [r9+36]\t\n"
2313 "vfmadd231ps ymm18,ymm29,ymm28\t\n"
2314 "vfmadd231ps ymm19,ymm30,ymm28\t\n"
2315 "vbroadcastss ymm28,DWORD PTR [r9+40]\t\n"
2316 "vfmadd231ps ymm20,ymm29,ymm28\t\n"
2317 "vfmadd231ps ymm21,ymm30,ymm28\t\n"
2318 "vbroadcastss ymm28,DWORD PTR [r9+44]\t\n"
2319 "vfmadd231ps ymm22,ymm29,ymm28\t\n"
2320 "vfmadd231ps ymm23,ymm30,ymm28\t\n"
2321 "vbroadcastss ymm28,DWORD PTR [r9+48]\t\n"
2322 "vfmadd231ps ymm24,ymm29,ymm28\t\n"
2323 "vfmadd231ps ymm25,ymm30,ymm28\t\n"
2324 "vbroadcastss ymm28,DWORD PTR [r9+52]\t\n"
2325 "vfmadd231ps ymm26,ymm29,ymm28\t\n"
2326 "vfmadd231ps ymm27,ymm30,ymm28\t\n"
2327
2328 "next_inner%=:\t\n"
2329 "add r9,56\t\n"
2330 "add r10,32\t\n"
2331 "dec r14\t\n"
2332 "jnz loop_inner%=\t\n"
2333
2334 "vmovaps ymm29,ymm31\t\n"
2335 "vcvtph2ps ymm30,XMMWORD PTR [r10 + 16]\t\n"
2336 "vbroadcastss ymm28,DWORD PTR [r9+0]\t\n"
2337 "vfmadd231ps ymm0,ymm29,ymm28\t\n"
2338 "vfmadd231ps ymm1,ymm30,ymm28\t\n"
2339 "vbroadcastss ymm28,DWORD PTR [r9+4]\t\n"
2340 "vfmadd231ps ymm2,ymm29,ymm28\t\n"
2341 "vfmadd231ps ymm3,ymm30,ymm28\t\n"
2342 "vbroadcastss ymm28,DWORD PTR [r9+8]\t\n"
2343 "vfmadd231ps ymm4,ymm29,ymm28\t\n"
2344 "vfmadd231ps ymm5,ymm30,ymm28\t\n"
2345 "vbroadcastss ymm28,DWORD PTR [r9+12]\t\n"
2346 "vfmadd231ps ymm6,ymm29,ymm28\t\n"
2347 "vfmadd231ps ymm7,ymm30,ymm28\t\n"
2348 "vbroadcastss ymm28,DWORD PTR [r9+16]\t\n"
2349 "vfmadd231ps ymm8,ymm29,ymm28\t\n"
2350 "vfmadd231ps ymm9,ymm30,ymm28\t\n"
2351 "vbroadcastss ymm28,DWORD PTR [r9+20]\t\n"
2352 "vfmadd231ps ymm10,ymm29,ymm28\t\n"
2353 "vfmadd231ps ymm11,ymm30,ymm28\t\n"
2354 "vbroadcastss ymm28,DWORD PTR [r9+24]\t\n"
2355 "vfmadd231ps ymm12,ymm29,ymm28\t\n"
2356 "vfmadd231ps ymm13,ymm30,ymm28\t\n"
2357 "vbroadcastss ymm28,DWORD PTR [r9+28]\t\n"
2358 "vfmadd231ps ymm14,ymm29,ymm28\t\n"
2359 "vfmadd231ps ymm15,ymm30,ymm28\t\n"
2360 "vbroadcastss ymm28,DWORD PTR [r9+32]\t\n"
2361 "vfmadd231ps ymm16,ymm29,ymm28\t\n"
2362 "vfmadd231ps ymm17,ymm30,ymm28\t\n"
2363 "vbroadcastss ymm28,DWORD PTR [r9+36]\t\n"
2364 "vfmadd231ps ymm18,ymm29,ymm28\t\n"
2365 "vfmadd231ps ymm19,ymm30,ymm28\t\n"
2366 "vbroadcastss ymm28,DWORD PTR [r9+40]\t\n"
2367 "vfmadd231ps ymm20,ymm29,ymm28\t\n"
2368 "vfmadd231ps ymm21,ymm30,ymm28\t\n"
2369 "vbroadcastss ymm28,DWORD PTR [r9+44]\t\n"
2370 "vfmadd231ps ymm22,ymm29,ymm28\t\n"
2371 "vfmadd231ps ymm23,ymm30,ymm28\t\n"
2372 "vbroadcastss ymm28,DWORD PTR [r9+48]\t\n"
2373 "vfmadd231ps ymm24,ymm29,ymm28\t\n"
2374 "vfmadd231ps ymm25,ymm30,ymm28\t\n"
2375 "vbroadcastss ymm28,DWORD PTR [r9+52]\t\n"
2376 "vfmadd231ps ymm26,ymm29,ymm28\t\n"
2377 "vfmadd231ps ymm27,ymm30,ymm28\t\n"
2378 "add r9,56\t\n"
2379 "add r10,32\t\n"
2380 // Dump C
2381 "dump_C%=:\t\n"
2382 "vmovups ymmword PTR [r12 + 0], ymm0\t\n"
2383 "vmovups ymmword PTR [r12 + 32], ymm1\t\n"
2384 "add r12, r13\t\n"
2385 "vmovups ymmword PTR [r12 + 0], ymm2\t\n"
2386 "vmovups ymmword PTR [r12 + 32], ymm3\t\n"
2387 "add r12, r13\t\n"
2388 "vmovups ymmword PTR [r12 + 0], ymm4\t\n"
2389 "vmovups ymmword PTR [r12 + 32], ymm5\t\n"
2390 "add r12, r13\t\n"
2391 "vmovups ymmword PTR [r12 + 0], ymm6\t\n"
2392 "vmovups ymmword PTR [r12 + 32], ymm7\t\n"
2393 "add r12, r13\t\n"
2394 "vmovups ymmword PTR [r12 + 0], ymm8\t\n"
2395 "vmovups ymmword PTR [r12 + 32], ymm9\t\n"
2396 "add r12, r13\t\n"
2397 "vmovups ymmword PTR [r12 + 0], ymm10\t\n"
2398 "vmovups ymmword PTR [r12 + 32], ymm11\t\n"
2399 "add r12, r13\t\n"
2400 "vmovups ymmword PTR [r12 + 0], ymm12\t\n"
2401 "vmovups ymmword PTR [r12 + 32], ymm13\t\n"
2402 "add r12, r13\t\n"
2403 "vmovups ymmword PTR [r12 + 0], ymm14\t\n"
2404 "vmovups ymmword PTR [r12 + 32], ymm15\t\n"
2405 "add r12, r13\t\n"
2406 "vmovups ymmword PTR [r12 + 0], ymm16\t\n"
2407 "vmovups ymmword PTR [r12 + 32], ymm17\t\n"
2408 "add r12, r13\t\n"
2409 "vmovups ymmword PTR [r12 + 0], ymm18\t\n"
2410 "vmovups ymmword PTR [r12 + 32], ymm19\t\n"
2411 "add r12, r13\t\n"
2412 "vmovups ymmword PTR [r12 + 0], ymm20\t\n"
2413 "vmovups ymmword PTR [r12 + 32], ymm21\t\n"
2414 "add r12, r13\t\n"
2415 "vmovups ymmword PTR [r12 + 0], ymm22\t\n"
2416 "vmovups ymmword PTR [r12 + 32], ymm23\t\n"
2417 "add r12, r13\t\n"
2418 "vmovups ymmword PTR [r12 + 0], ymm24\t\n"
2419 "vmovups ymmword PTR [r12 + 32], ymm25\t\n"
2420 "add r12, r13\t\n"
2421 "vmovups ymmword PTR [r12 + 0], ymm26\t\n"
2422 "vmovups ymmword PTR [r12 + 32], ymm27\t\n"
2423
2424 // next outer iteration
2425 "add rcx, 64\t\n"
2426 "mov r12, rcx\t\n"
2427 "mov r9, rax\t\n"
2428 "inc rbx\t\n"
2429 "cmp rbx, rdi\t\n"
2430 "jl loop_outter%=\t\n"
2431 :
2432 : [gp] "rm"(gp)
2433 : "r8",
2434 "r9",
2435 "r10",
2436 "r11",
2437 "r13",
2438 "r14",
2439 "rax",
2440 "rcx",
2441 "rsi",
2442 "rdi",
2443 "rbx",
2444 "r12",
2445 "r15",
2446 "memory");
2447}
2448
2449} // namespace fbgemm
2450