1 | /* |
2 | * Copyright (c) Meta Platforms, Inc. and affiliates. |
3 | * All rights reserved. |
4 | * This source code is licensed under the BSD-style license found in the |
5 | * LICENSE file in the root directory of this source tree. |
6 | */ |
7 | #include "./FbgemmFP16UKernelsAvx512.h" |
8 | #include "./InlineAsmDefines.h" |
9 | |
10 | namespace fbgemm { |
11 | |
// FP16 GEMM micro-kernel implemented as one inline-asm block (Intel syntax).
// For each of b_block_cols column blocks it produces 1 row x 32 floats of C
// in the accumulators zmm0-zmm1: B is read as fp16 and widened to fp32 with
// vcvtph2ps, A is read as 32-bit floats and broadcast, and the accumulators
// are written to C with unaligned stores.
// gp is read at fixed byte offsets: k (+0), A (+8), B (+16), beta (+24, read
// through its address), C (+32), ldc (+40), b_block_cols (+48),
// b_block_size (+56).
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written below are not -- confirm that NOINLINE
// plus the calling convention makes this safe on every supported target.
12 | void NOINLINE gemmkernel_1x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
13 | asm volatile( |
// r14 holds the gp pointer while the parameters are unpacked; it is reused
// as the inner-loop counter afterwards.
14 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
15 | "mov %[gp], %%r14\t\n" |
16 | ".intel_syntax noprefix\t\n" |
17 | #else |
18 | "mov r14, %[gp]\t\n" |
19 | #endif |
20 | |
21 | // Copy parameters |
22 | // k |
23 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is performed outside the inner loop.
24 | "dec r8\t\n" |
25 | // A |
26 | "mov r9, [r14 + 8]\t\n" |
27 | // B |
28 | "mov r10, [r14 + 16]\t\n" |
29 | // beta |
// Only the address of beta is taken here; the value is loaded per outer iteration.
30 | "lea r15, [r14 + 24]\t\n" |
31 | // C |
32 | "mov r12, [r14 + 32]\t\n" |
33 | // ldc |
// ldc is loaded but not used by this single-row kernel.
34 | "mov r13, [r14 + 40]\t\n" |
35 | // b_block_cols |
36 | "mov rdi, [r14 + 48]\t\n" |
37 | // b_block_size |
// NOTE(review): rsi is loaded but not referenced again in this kernel.
38 | "mov rsi, [r14 + 56]\t\n" |
39 | |
40 | // Make copies of A and C |
41 | "mov rax, r9\t\n" |
42 | "mov rcx, r12\t\n" |
43 | |
// rbx = index of the current column block (compared against rdi at the bottom).
44 | "xor ebx, ebx\t\n" |
45 | "loop_outter%=:\t\n" |
// r14 = number of k steps remaining after the first one.
46 | "mov r14, r8\t\n" |
// zmm31 first holds beta broadcast to all lanes; later it is reused as the
// preload register for the next B values.
47 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
// Load the B values for the first k step (2 x 16 fp16 -> fp32).
48 | "vcvtph2ps zmm3,YMMWORD PTR [r10 + 0]\t\n" |
49 | "vcvtph2ps zmm4,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0 and take the zero_regs path on equality, so C is not read.
// NOTE(review): vcomiss also sets ZF for an unordered (NaN) compare, so a NaN
// beta would take the same branch -- confirm that is intended.
50 | "vxorps xmm0, xmm0, xmm0\t\n" |
51 | "vcomiss xmm31, xmm0\t\n" |
52 | "jz zero_regs%=\t\n" |
53 | |
54 | // Setup values with beta multiplication |
55 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
56 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
// If more k steps follow, preload the first 16 B values of the next step into
// zmm31 (beta is no longer needed at this point).
57 | "test r14,r14\t\n" |
58 | "jz skip_preload%=\t\n" |
59 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
60 | "skip_preload%=:\t\n" |
// First k step: broadcast one A element and accumulate onto beta * C.
61 | "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n" |
62 | "vfmadd231ps zmm0,zmm3,zmm2\t\n" |
63 | "vfmadd231ps zmm1,zmm4,zmm2\t\n" |
// When k == 1 (r14 == 0) skip the loops: advance B past this step and store.
64 | "test r14,r14\t\n" |
65 | "jnz next_inner%=\t\n" |
66 | "add r10,64\t\n" |
67 | "jmp dump_C%=\t\n" |
68 | |
// beta == 0 path: the accumulators are initialised with the first product
// instead of being loaded from C.
69 | "zero_regs%=:\t\n" |
70 | |
71 | "test r14,r14\t\n" |
72 | "jz skip_preload_b_zero%=\t\n" |
73 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
74 | "skip_preload_b_zero%=:\t\n" |
75 | "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n" |
76 | "vmulps zmm0,zmm3,zmm2\t\n" |
77 | "vmulps zmm1,zmm4,zmm2\t\n" |
78 | "test r14,r14\t\n" |
79 | "jnz next_inner%=\t\n" |
80 | "add r10,64\t\n" |
81 | "jmp dump_C%=\t\n" |
82 | |
// Steady-state k step: take the preloaded B half from zmm31, load the other
// half, preload the first half of the following step, then accumulate.
83 | "loop_inner%=:\t\n" |
84 | |
85 | "vmovaps zmm3,zmm31\t\n" |
86 | "vcvtph2ps zmm4,YMMWORD PTR [r10 + 32]\t\n" |
87 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
88 | "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n" |
89 | "vfmadd231ps zmm0,zmm3,zmm2\t\n" |
90 | "vfmadd231ps zmm1,zmm4,zmm2\t\n" |
91 | |
// Advance A by one float and B by 32 fp16 values (64 bytes) per k step.
92 | "next_inner%=:\t\n" |
93 | "add r9,4\t\n" |
94 | "add r10,64\t\n" |
95 | "dec r14\t\n" |
96 | "jnz loop_inner%=\t\n" |
97 | |
// Final k step: same as the loop body but without a further preload.
98 | "vmovaps zmm3,zmm31\t\n" |
99 | "vcvtph2ps zmm4,YMMWORD PTR [r10 + 32]\t\n" |
100 | "vbroadcastss zmm2,DWORD PTR [r9+0]\t\n" |
101 | "vfmadd231ps zmm0,zmm3,zmm2\t\n" |
102 | "vfmadd231ps zmm1,zmm4,zmm2\t\n" |
103 | "add r9,4\t\n" |
104 | "add r10,64\t\n" |
105 | // Dump C |
106 | "dump_C%=:\t\n" |
107 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
108 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
109 | |
110 | // next outer iteration |
// C moves on by 128 bytes (32 floats) and A is rewound to its start; the B
// pointer (r10) is not reset and keeps advancing.
111 | "add rcx, 128\t\n" |
112 | "mov r12, rcx\t\n" |
113 | "mov r9, rax\t\n" |
114 | "inc rbx\t\n" |
115 | "cmp rbx, rdi\t\n" |
116 | "jl loop_outter%=\t\n" |
117 | : |
118 | : [gp] "rm" (gp) |
// NOTE(review): r11 is listed as clobbered but is not referenced in the asm above.
119 | : "r8" , |
120 | "r9" , |
121 | "r10" , |
122 | "r11" , |
123 | "r13" , |
124 | "r14" , |
125 | "rax" , |
126 | "rcx" , |
127 | "rsi" , |
128 | "rdi" , |
129 | "rbx" , |
130 | "r12" , |
131 | "r15" , |
132 | "memory" ); |
133 | } |
// FP16 GEMM micro-kernel implemented as one inline-asm block (Intel syntax).
// For each of b_block_cols column blocks it produces 2 rows x 32 floats of C
// in the accumulators zmm0-zmm3: B is read as fp16 and widened to fp32 with
// vcvtph2ps, A is read as 32-bit floats (2 per k step) and broadcast, and the
// accumulators are written to C with unaligned stores, rows ldc bytes apart.
// gp is read at fixed byte offsets: k (+0), A (+8), B (+16), beta (+24, read
// through its address), C (+32), ldc (+40), b_block_cols (+48),
// b_block_size (+56).
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written below are not -- confirm that NOINLINE
// plus the calling convention makes this safe on every supported target.
134 | void NOINLINE gemmkernel_2x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
135 | asm volatile( |
// r14 holds the gp pointer while the parameters are unpacked; it is reused
// as the inner-loop counter afterwards.
136 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
137 | "mov %[gp], %%r14\t\n" |
138 | ".intel_syntax noprefix\t\n" |
139 | #else |
140 | "mov r14, %[gp]\t\n" |
141 | #endif |
142 | |
143 | // Copy parameters |
144 | // k |
145 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is performed outside the inner loop.
146 | "dec r8\t\n" |
147 | // A |
148 | "mov r9, [r14 + 8]\t\n" |
149 | // B |
150 | "mov r10, [r14 + 16]\t\n" |
151 | // beta |
// Only the address of beta is taken here; the value is loaded per outer iteration.
152 | "lea r15, [r14 + 24]\t\n" |
153 | // C |
154 | "mov r12, [r14 + 32]\t\n" |
155 | // ldc |
// r13 is added directly to the C byte address below, i.e. it is used as a byte stride.
156 | "mov r13, [r14 + 40]\t\n" |
157 | // b_block_cols |
158 | "mov rdi, [r14 + 48]\t\n" |
159 | // b_block_size |
// NOTE(review): rsi is loaded but not referenced again in this kernel.
160 | "mov rsi, [r14 + 56]\t\n" |
161 | |
162 | // Make copies of A and C |
163 | "mov rax, r9\t\n" |
164 | "mov rcx, r12\t\n" |
165 | |
// rbx = index of the current column block (compared against rdi at the bottom).
166 | "xor ebx, ebx\t\n" |
167 | "loop_outter%=:\t\n" |
// r14 = number of k steps remaining after the first one.
168 | "mov r14, r8\t\n" |
// zmm31 first holds beta broadcast to all lanes; later it is reused as the
// preload register for the next B values.
169 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
// Load the B values for the first k step (2 x 16 fp16 -> fp32).
170 | "vcvtph2ps zmm5,YMMWORD PTR [r10 + 0]\t\n" |
171 | "vcvtph2ps zmm6,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0 and take the zero_regs path on equality, so C is not read.
// NOTE(review): vcomiss also sets ZF for an unordered (NaN) compare, so a NaN
// beta would take the same branch -- confirm that is intended.
172 | "vxorps xmm0, xmm0, xmm0\t\n" |
173 | "vcomiss xmm31, xmm0\t\n" |
174 | "jz zero_regs%=\t\n" |
175 | |
176 | // Setup values with beta multiplication |
// r12 steps through the C rows (ldc apart) while beta * C is loaded.
177 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
178 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
179 | "add r12, r13\t\n" |
180 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
181 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
// If more k steps follow, preload the first 16 B values of the next step into
// zmm31 (beta is no longer needed at this point).
182 | "test r14,r14\t\n" |
183 | "jz skip_preload%=\t\n" |
184 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
185 | "skip_preload%=:\t\n" |
// First k step: one broadcast A element per row, accumulated onto beta * C.
186 | "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n" |
187 | "vfmadd231ps zmm0,zmm5,zmm4\t\n" |
188 | "vfmadd231ps zmm1,zmm6,zmm4\t\n" |
189 | "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n" |
190 | "vfmadd231ps zmm2,zmm5,zmm4\t\n" |
191 | "vfmadd231ps zmm3,zmm6,zmm4\t\n" |
// Rewind r12 to the first C row of this column block for the store at dump_C.
192 | "mov r12, rcx\t\n" |
// When k == 1 (r14 == 0) skip the loops: advance B past this step and store.
193 | "test r14,r14\t\n" |
194 | "jnz next_inner%=\t\n" |
195 | "add r10,64\t\n" |
196 | "jmp dump_C%=\t\n" |
197 | |
// beta == 0 path: the accumulators are initialised with the first product
// instead of being loaded from C. r12 is still advanced by ldc here although
// C is not read on this path; it is reset from rcx before it is used again.
198 | "zero_regs%=:\t\n" |
199 | |
200 | "test r14,r14\t\n" |
201 | "jz skip_preload_b_zero%=\t\n" |
202 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
203 | "skip_preload_b_zero%=:\t\n" |
204 | "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n" |
205 | "vmulps zmm0,zmm5,zmm4\t\n" |
206 | "vmulps zmm1,zmm6,zmm4\t\n" |
207 | "add r12, r13\t\n" |
208 | "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n" |
209 | "vmulps zmm2,zmm5,zmm4\t\n" |
210 | "vmulps zmm3,zmm6,zmm4\t\n" |
211 | "mov r12, rcx\t\n" |
212 | "test r14,r14\t\n" |
213 | "jnz next_inner%=\t\n" |
214 | "add r10,64\t\n" |
215 | "jmp dump_C%=\t\n" |
216 | |
// Steady-state k step: take the preloaded B half from zmm31, load the other
// half, preload the first half of the following step, then accumulate.
217 | "loop_inner%=:\t\n" |
218 | |
219 | "vmovaps zmm5,zmm31\t\n" |
220 | "vcvtph2ps zmm6,YMMWORD PTR [r10 + 32]\t\n" |
221 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
222 | "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n" |
223 | "vfmadd231ps zmm0,zmm5,zmm4\t\n" |
224 | "vfmadd231ps zmm1,zmm6,zmm4\t\n" |
225 | "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n" |
226 | "vfmadd231ps zmm2,zmm5,zmm4\t\n" |
227 | "vfmadd231ps zmm3,zmm6,zmm4\t\n" |
228 | |
// Advance A by 2 floats and B by 32 fp16 values (64 bytes) per k step.
229 | "next_inner%=:\t\n" |
230 | "add r9,8\t\n" |
231 | "add r10,64\t\n" |
232 | "dec r14\t\n" |
233 | "jnz loop_inner%=\t\n" |
234 | |
// Final k step: same as the loop body but without a further preload.
235 | "vmovaps zmm5,zmm31\t\n" |
236 | "vcvtph2ps zmm6,YMMWORD PTR [r10 + 32]\t\n" |
237 | "vbroadcastss zmm4,DWORD PTR [r9+0]\t\n" |
238 | "vfmadd231ps zmm0,zmm5,zmm4\t\n" |
239 | "vfmadd231ps zmm1,zmm6,zmm4\t\n" |
240 | "vbroadcastss zmm4,DWORD PTR [r9+4]\t\n" |
241 | "vfmadd231ps zmm2,zmm5,zmm4\t\n" |
242 | "vfmadd231ps zmm3,zmm6,zmm4\t\n" |
243 | "add r9,8\t\n" |
244 | "add r10,64\t\n" |
245 | // Dump C |
246 | "dump_C%=:\t\n" |
247 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
248 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
249 | "add r12, r13\t\n" |
250 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
251 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
252 | |
253 | // next outer iteration |
// C moves on by 128 bytes (32 floats) and A is rewound to its start; the B
// pointer (r10) is not reset and keeps advancing.
254 | "add rcx, 128\t\n" |
255 | "mov r12, rcx\t\n" |
256 | "mov r9, rax\t\n" |
257 | "inc rbx\t\n" |
258 | "cmp rbx, rdi\t\n" |
259 | "jl loop_outter%=\t\n" |
260 | : |
261 | : [gp] "rm" (gp) |
// NOTE(review): r11 is listed as clobbered but is not referenced in the asm above.
262 | : "r8" , |
263 | "r9" , |
264 | "r10" , |
265 | "r11" , |
266 | "r13" , |
267 | "r14" , |
268 | "rax" , |
269 | "rcx" , |
270 | "rsi" , |
271 | "rdi" , |
272 | "rbx" , |
273 | "r12" , |
274 | "r15" , |
275 | "memory" ); |
276 | } |
// FP16 GEMM micro-kernel implemented as one inline-asm block (Intel syntax).
// For each of b_block_cols column blocks it produces 3 rows x 32 floats of C
// in the accumulators zmm0-zmm5: B is read as fp16 and widened to fp32 with
// vcvtph2ps, A is read as 32-bit floats (3 per k step) and broadcast, and the
// accumulators are written to C with unaligned stores, rows ldc bytes apart.
// gp is read at fixed byte offsets: k (+0), A (+8), B (+16), beta (+24, read
// through its address), C (+32), ldc (+40), b_block_cols (+48),
// b_block_size (+56).
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written below are not -- confirm that NOINLINE
// plus the calling convention makes this safe on every supported target.
277 | void NOINLINE gemmkernel_3x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
278 | asm volatile( |
// r14 holds the gp pointer while the parameters are unpacked; it is reused
// as the inner-loop counter afterwards.
279 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
280 | "mov %[gp], %%r14\t\n" |
281 | ".intel_syntax noprefix\t\n" |
282 | #else |
283 | "mov r14, %[gp]\t\n" |
284 | #endif |
285 | |
286 | // Copy parameters |
287 | // k |
288 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is performed outside the inner loop.
289 | "dec r8\t\n" |
290 | // A |
291 | "mov r9, [r14 + 8]\t\n" |
292 | // B |
293 | "mov r10, [r14 + 16]\t\n" |
294 | // beta |
// Only the address of beta is taken here; the value is loaded per outer iteration.
295 | "lea r15, [r14 + 24]\t\n" |
296 | // C |
297 | "mov r12, [r14 + 32]\t\n" |
298 | // ldc |
// r13 is added directly to the C byte address below, i.e. it is used as a byte stride.
299 | "mov r13, [r14 + 40]\t\n" |
300 | // b_block_cols |
301 | "mov rdi, [r14 + 48]\t\n" |
302 | // b_block_size |
// NOTE(review): rsi is loaded but not referenced again in this kernel.
303 | "mov rsi, [r14 + 56]\t\n" |
304 | |
305 | // Make copies of A and C |
306 | "mov rax, r9\t\n" |
307 | "mov rcx, r12\t\n" |
308 | |
// rbx = index of the current column block (compared against rdi at the bottom).
309 | "xor ebx, ebx\t\n" |
310 | "loop_outter%=:\t\n" |
// r14 = number of k steps remaining after the first one.
311 | "mov r14, r8\t\n" |
// zmm31 first holds beta broadcast to all lanes; later it is reused as the
// preload register for the next B values.
312 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
// Load the B values for the first k step (2 x 16 fp16 -> fp32).
313 | "vcvtph2ps zmm7,YMMWORD PTR [r10 + 0]\t\n" |
314 | "vcvtph2ps zmm8,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0 and take the zero_regs path on equality, so C is not read.
// NOTE(review): vcomiss also sets ZF for an unordered (NaN) compare, so a NaN
// beta would take the same branch -- confirm that is intended.
315 | "vxorps xmm0, xmm0, xmm0\t\n" |
316 | "vcomiss xmm31, xmm0\t\n" |
317 | "jz zero_regs%=\t\n" |
318 | |
319 | // Setup values with beta multiplication |
// r12 steps through the C rows (ldc apart) while beta * C is loaded.
320 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
321 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
322 | "add r12, r13\t\n" |
323 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
324 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
325 | "add r12, r13\t\n" |
326 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
327 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
// If more k steps follow, preload the first 16 B values of the next step into
// zmm31 (beta is no longer needed at this point).
328 | "test r14,r14\t\n" |
329 | "jz skip_preload%=\t\n" |
330 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
331 | "skip_preload%=:\t\n" |
// First k step: one broadcast A element per row, accumulated onto beta * C.
332 | "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n" |
333 | "vfmadd231ps zmm0,zmm7,zmm6\t\n" |
334 | "vfmadd231ps zmm1,zmm8,zmm6\t\n" |
335 | "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n" |
336 | "vfmadd231ps zmm2,zmm7,zmm6\t\n" |
337 | "vfmadd231ps zmm3,zmm8,zmm6\t\n" |
338 | "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n" |
339 | "vfmadd231ps zmm4,zmm7,zmm6\t\n" |
340 | "vfmadd231ps zmm5,zmm8,zmm6\t\n" |
// Rewind r12 to the first C row of this column block for the store at dump_C.
341 | "mov r12, rcx\t\n" |
// When k == 1 (r14 == 0) skip the loops: advance B past this step and store.
342 | "test r14,r14\t\n" |
343 | "jnz next_inner%=\t\n" |
344 | "add r10,64\t\n" |
345 | "jmp dump_C%=\t\n" |
346 | |
// beta == 0 path: the accumulators are initialised with the first product
// instead of being loaded from C. r12 is still advanced by ldc here although
// C is not read on this path; it is reset from rcx before it is used again.
347 | "zero_regs%=:\t\n" |
348 | |
349 | "test r14,r14\t\n" |
350 | "jz skip_preload_b_zero%=\t\n" |
351 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
352 | "skip_preload_b_zero%=:\t\n" |
353 | "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n" |
354 | "vmulps zmm0,zmm7,zmm6\t\n" |
355 | "vmulps zmm1,zmm8,zmm6\t\n" |
356 | "add r12, r13\t\n" |
357 | "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n" |
358 | "vmulps zmm2,zmm7,zmm6\t\n" |
359 | "vmulps zmm3,zmm8,zmm6\t\n" |
360 | "add r12, r13\t\n" |
361 | "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n" |
362 | "vmulps zmm4,zmm7,zmm6\t\n" |
363 | "vmulps zmm5,zmm8,zmm6\t\n" |
364 | "mov r12, rcx\t\n" |
365 | "test r14,r14\t\n" |
366 | "jnz next_inner%=\t\n" |
367 | "add r10,64\t\n" |
368 | "jmp dump_C%=\t\n" |
369 | |
// Steady-state k step: take the preloaded B half from zmm31, load the other
// half, preload the first half of the following step, then accumulate.
370 | "loop_inner%=:\t\n" |
371 | |
372 | "vmovaps zmm7,zmm31\t\n" |
373 | "vcvtph2ps zmm8,YMMWORD PTR [r10 + 32]\t\n" |
374 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
375 | "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n" |
376 | "vfmadd231ps zmm0,zmm7,zmm6\t\n" |
377 | "vfmadd231ps zmm1,zmm8,zmm6\t\n" |
378 | "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n" |
379 | "vfmadd231ps zmm2,zmm7,zmm6\t\n" |
380 | "vfmadd231ps zmm3,zmm8,zmm6\t\n" |
381 | "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n" |
382 | "vfmadd231ps zmm4,zmm7,zmm6\t\n" |
383 | "vfmadd231ps zmm5,zmm8,zmm6\t\n" |
384 | |
// Advance A by 3 floats and B by 32 fp16 values (64 bytes) per k step.
385 | "next_inner%=:\t\n" |
386 | "add r9,12\t\n" |
387 | "add r10,64\t\n" |
388 | "dec r14\t\n" |
389 | "jnz loop_inner%=\t\n" |
390 | |
// Final k step: same as the loop body but without a further preload.
391 | "vmovaps zmm7,zmm31\t\n" |
392 | "vcvtph2ps zmm8,YMMWORD PTR [r10 + 32]\t\n" |
393 | "vbroadcastss zmm6,DWORD PTR [r9+0]\t\n" |
394 | "vfmadd231ps zmm0,zmm7,zmm6\t\n" |
395 | "vfmadd231ps zmm1,zmm8,zmm6\t\n" |
396 | "vbroadcastss zmm6,DWORD PTR [r9+4]\t\n" |
397 | "vfmadd231ps zmm2,zmm7,zmm6\t\n" |
398 | "vfmadd231ps zmm3,zmm8,zmm6\t\n" |
399 | "vbroadcastss zmm6,DWORD PTR [r9+8]\t\n" |
400 | "vfmadd231ps zmm4,zmm7,zmm6\t\n" |
401 | "vfmadd231ps zmm5,zmm8,zmm6\t\n" |
402 | "add r9,12\t\n" |
403 | "add r10,64\t\n" |
404 | // Dump C |
405 | "dump_C%=:\t\n" |
406 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
407 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
408 | "add r12, r13\t\n" |
409 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
410 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
411 | "add r12, r13\t\n" |
412 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
413 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
414 | |
415 | // next outer iteration |
// C moves on by 128 bytes (32 floats) and A is rewound to its start; the B
// pointer (r10) is not reset and keeps advancing.
416 | "add rcx, 128\t\n" |
417 | "mov r12, rcx\t\n" |
418 | "mov r9, rax\t\n" |
419 | "inc rbx\t\n" |
420 | "cmp rbx, rdi\t\n" |
421 | "jl loop_outter%=\t\n" |
422 | : |
423 | : [gp] "rm" (gp) |
// NOTE(review): r11 is listed as clobbered but is not referenced in the asm above.
424 | : "r8" , |
425 | "r9" , |
426 | "r10" , |
427 | "r11" , |
428 | "r13" , |
429 | "r14" , |
430 | "rax" , |
431 | "rcx" , |
432 | "rsi" , |
433 | "rdi" , |
434 | "rbx" , |
435 | "r12" , |
436 | "r15" , |
437 | "memory" ); |
438 | } |
// FP16 GEMM micro-kernel implemented as one inline-asm block (Intel syntax).
// For each of b_block_cols column blocks it produces 4 rows x 32 floats of C
// in the accumulators zmm0-zmm7: B is read as fp16 and widened to fp32 with
// vcvtph2ps, A is read as 32-bit floats (4 per k step) and broadcast, and the
// accumulators are written to C with unaligned stores, rows ldc bytes apart.
// gp is read at fixed byte offsets: k (+0), A (+8), B (+16), beta (+24, read
// through its address), C (+32), ldc (+40), b_block_cols (+48),
// b_block_size (+56).
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written below are not -- confirm that NOINLINE
// plus the calling convention makes this safe on every supported target.
439 | void NOINLINE gemmkernel_4x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
440 | asm volatile( |
// r14 holds the gp pointer while the parameters are unpacked; it is reused
// as the inner-loop counter afterwards.
441 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
442 | "mov %[gp], %%r14\t\n" |
443 | ".intel_syntax noprefix\t\n" |
444 | #else |
445 | "mov r14, %[gp]\t\n" |
446 | #endif |
447 | |
448 | // Copy parameters |
449 | // k |
450 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is performed outside the inner loop.
451 | "dec r8\t\n" |
452 | // A |
453 | "mov r9, [r14 + 8]\t\n" |
454 | // B |
455 | "mov r10, [r14 + 16]\t\n" |
456 | // beta |
// Only the address of beta is taken here; the value is loaded per outer iteration.
457 | "lea r15, [r14 + 24]\t\n" |
458 | // C |
459 | "mov r12, [r14 + 32]\t\n" |
460 | // ldc |
// r13 is added directly to the C byte address below, i.e. it is used as a byte stride.
461 | "mov r13, [r14 + 40]\t\n" |
462 | // b_block_cols |
463 | "mov rdi, [r14 + 48]\t\n" |
464 | // b_block_size |
// NOTE(review): rsi is loaded but not referenced again in this kernel.
465 | "mov rsi, [r14 + 56]\t\n" |
466 | |
467 | // Make copies of A and C |
468 | "mov rax, r9\t\n" |
469 | "mov rcx, r12\t\n" |
470 | |
// rbx = index of the current column block (compared against rdi at the bottom).
471 | "xor ebx, ebx\t\n" |
472 | "loop_outter%=:\t\n" |
// r14 = number of k steps remaining after the first one.
473 | "mov r14, r8\t\n" |
// zmm31 first holds beta broadcast to all lanes; later it is reused as the
// preload register for the next B values.
474 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
// Load the B values for the first k step (2 x 16 fp16 -> fp32).
475 | "vcvtph2ps zmm9,YMMWORD PTR [r10 + 0]\t\n" |
476 | "vcvtph2ps zmm10,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0 and take the zero_regs path on equality, so C is not read.
// NOTE(review): vcomiss also sets ZF for an unordered (NaN) compare, so a NaN
// beta would take the same branch -- confirm that is intended.
477 | "vxorps xmm0, xmm0, xmm0\t\n" |
478 | "vcomiss xmm31, xmm0\t\n" |
479 | "jz zero_regs%=\t\n" |
480 | |
481 | // Setup values with beta multiplication |
// r12 steps through the C rows (ldc apart) while beta * C is loaded.
482 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
483 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
484 | "add r12, r13\t\n" |
485 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
486 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
487 | "add r12, r13\t\n" |
488 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
489 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
490 | "add r12, r13\t\n" |
491 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
492 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
// If more k steps follow, preload the first 16 B values of the next step into
// zmm31 (beta is no longer needed at this point).
493 | "test r14,r14\t\n" |
494 | "jz skip_preload%=\t\n" |
495 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
496 | "skip_preload%=:\t\n" |
// First k step: one broadcast A element per row, accumulated onto beta * C.
497 | "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n" |
498 | "vfmadd231ps zmm0,zmm9,zmm8\t\n" |
499 | "vfmadd231ps zmm1,zmm10,zmm8\t\n" |
500 | "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n" |
501 | "vfmadd231ps zmm2,zmm9,zmm8\t\n" |
502 | "vfmadd231ps zmm3,zmm10,zmm8\t\n" |
503 | "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n" |
504 | "vfmadd231ps zmm4,zmm9,zmm8\t\n" |
505 | "vfmadd231ps zmm5,zmm10,zmm8\t\n" |
506 | "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n" |
507 | "vfmadd231ps zmm6,zmm9,zmm8\t\n" |
508 | "vfmadd231ps zmm7,zmm10,zmm8\t\n" |
// Rewind r12 to the first C row of this column block for the store at dump_C.
509 | "mov r12, rcx\t\n" |
// When k == 1 (r14 == 0) skip the loops: advance B past this step and store.
510 | "test r14,r14\t\n" |
511 | "jnz next_inner%=\t\n" |
512 | "add r10,64\t\n" |
513 | "jmp dump_C%=\t\n" |
514 | |
// beta == 0 path: the accumulators are initialised with the first product
// instead of being loaded from C. r12 is still advanced by ldc here although
// C is not read on this path; it is reset from rcx before it is used again.
515 | "zero_regs%=:\t\n" |
516 | |
517 | "test r14,r14\t\n" |
518 | "jz skip_preload_b_zero%=\t\n" |
519 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
520 | "skip_preload_b_zero%=:\t\n" |
521 | "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n" |
522 | "vmulps zmm0,zmm9,zmm8\t\n" |
523 | "vmulps zmm1,zmm10,zmm8\t\n" |
524 | "add r12, r13\t\n" |
525 | "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n" |
526 | "vmulps zmm2,zmm9,zmm8\t\n" |
527 | "vmulps zmm3,zmm10,zmm8\t\n" |
528 | "add r12, r13\t\n" |
529 | "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n" |
530 | "vmulps zmm4,zmm9,zmm8\t\n" |
531 | "vmulps zmm5,zmm10,zmm8\t\n" |
532 | "add r12, r13\t\n" |
533 | "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n" |
534 | "vmulps zmm6,zmm9,zmm8\t\n" |
535 | "vmulps zmm7,zmm10,zmm8\t\n" |
536 | "mov r12, rcx\t\n" |
537 | "test r14,r14\t\n" |
538 | "jnz next_inner%=\t\n" |
539 | "add r10,64\t\n" |
540 | "jmp dump_C%=\t\n" |
541 | |
// Steady-state k step: take the preloaded B half from zmm31, load the other
// half, preload the first half of the following step, then accumulate.
542 | "loop_inner%=:\t\n" |
543 | |
544 | "vmovaps zmm9,zmm31\t\n" |
545 | "vcvtph2ps zmm10,YMMWORD PTR [r10 + 32]\t\n" |
546 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
547 | "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n" |
548 | "vfmadd231ps zmm0,zmm9,zmm8\t\n" |
549 | "vfmadd231ps zmm1,zmm10,zmm8\t\n" |
550 | "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n" |
551 | "vfmadd231ps zmm2,zmm9,zmm8\t\n" |
552 | "vfmadd231ps zmm3,zmm10,zmm8\t\n" |
553 | "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n" |
554 | "vfmadd231ps zmm4,zmm9,zmm8\t\n" |
555 | "vfmadd231ps zmm5,zmm10,zmm8\t\n" |
556 | "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n" |
557 | "vfmadd231ps zmm6,zmm9,zmm8\t\n" |
558 | "vfmadd231ps zmm7,zmm10,zmm8\t\n" |
559 | |
// Advance A by 4 floats and B by 32 fp16 values (64 bytes) per k step.
560 | "next_inner%=:\t\n" |
561 | "add r9,16\t\n" |
562 | "add r10,64\t\n" |
563 | "dec r14\t\n" |
564 | "jnz loop_inner%=\t\n" |
565 | |
// Final k step: same as the loop body but without a further preload.
566 | "vmovaps zmm9,zmm31\t\n" |
567 | "vcvtph2ps zmm10,YMMWORD PTR [r10 + 32]\t\n" |
568 | "vbroadcastss zmm8,DWORD PTR [r9+0]\t\n" |
569 | "vfmadd231ps zmm0,zmm9,zmm8\t\n" |
570 | "vfmadd231ps zmm1,zmm10,zmm8\t\n" |
571 | "vbroadcastss zmm8,DWORD PTR [r9+4]\t\n" |
572 | "vfmadd231ps zmm2,zmm9,zmm8\t\n" |
573 | "vfmadd231ps zmm3,zmm10,zmm8\t\n" |
574 | "vbroadcastss zmm8,DWORD PTR [r9+8]\t\n" |
575 | "vfmadd231ps zmm4,zmm9,zmm8\t\n" |
576 | "vfmadd231ps zmm5,zmm10,zmm8\t\n" |
577 | "vbroadcastss zmm8,DWORD PTR [r9+12]\t\n" |
578 | "vfmadd231ps zmm6,zmm9,zmm8\t\n" |
579 | "vfmadd231ps zmm7,zmm10,zmm8\t\n" |
580 | "add r9,16\t\n" |
581 | "add r10,64\t\n" |
582 | // Dump C |
583 | "dump_C%=:\t\n" |
584 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
585 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
586 | "add r12, r13\t\n" |
587 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
588 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
589 | "add r12, r13\t\n" |
590 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
591 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
592 | "add r12, r13\t\n" |
593 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
594 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
595 | |
596 | // next outer iteration |
// C moves on by 128 bytes (32 floats) and A is rewound to its start; the B
// pointer (r10) is not reset and keeps advancing.
597 | "add rcx, 128\t\n" |
598 | "mov r12, rcx\t\n" |
599 | "mov r9, rax\t\n" |
600 | "inc rbx\t\n" |
601 | "cmp rbx, rdi\t\n" |
602 | "jl loop_outter%=\t\n" |
603 | : |
604 | : [gp] "rm" (gp) |
// NOTE(review): r11 is listed as clobbered but is not referenced in the asm above.
605 | : "r8" , |
606 | "r9" , |
607 | "r10" , |
608 | "r11" , |
609 | "r13" , |
610 | "r14" , |
611 | "rax" , |
612 | "rcx" , |
613 | "rsi" , |
614 | "rdi" , |
615 | "rbx" , |
616 | "r12" , |
617 | "r15" , |
618 | "memory" ); |
619 | } |
// FP16 GEMM micro-kernel implemented as one inline-asm block (Intel syntax).
// For each of b_block_cols column blocks it produces 5 rows x 32 floats of C
// in the accumulators zmm0-zmm9: B is read as fp16 and widened to fp32 with
// vcvtph2ps, A is read as 32-bit floats (5 per k step) and broadcast, and the
// accumulators are written to C with unaligned stores, rows ldc bytes apart.
// gp is read at fixed byte offsets: k (+0), A (+8), B (+16), beta (+24, read
// through its address), C (+32), ldc (+40), b_block_cols (+48),
// b_block_size (+56).
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written below are not -- confirm that NOINLINE
// plus the calling convention makes this safe on every supported target.
620 | void NOINLINE gemmkernel_5x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
621 | asm volatile( |
// r14 holds the gp pointer while the parameters are unpacked; it is reused
// as the inner-loop counter afterwards.
622 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
623 | "mov %[gp], %%r14\t\n" |
624 | ".intel_syntax noprefix\t\n" |
625 | #else |
626 | "mov r14, %[gp]\t\n" |
627 | #endif |
628 | |
629 | // Copy parameters |
630 | // k |
631 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is performed outside the inner loop.
632 | "dec r8\t\n" |
633 | // A |
634 | "mov r9, [r14 + 8]\t\n" |
635 | // B |
636 | "mov r10, [r14 + 16]\t\n" |
637 | // beta |
// Only the address of beta is taken here; the value is loaded per outer iteration.
638 | "lea r15, [r14 + 24]\t\n" |
639 | // C |
640 | "mov r12, [r14 + 32]\t\n" |
641 | // ldc |
// r13 is added directly to the C byte address below, i.e. it is used as a byte stride.
642 | "mov r13, [r14 + 40]\t\n" |
643 | // b_block_cols |
644 | "mov rdi, [r14 + 48]\t\n" |
645 | // b_block_size |
// NOTE(review): rsi is loaded but not referenced again in this kernel.
646 | "mov rsi, [r14 + 56]\t\n" |
647 | |
648 | // Make copies of A and C |
649 | "mov rax, r9\t\n" |
650 | "mov rcx, r12\t\n" |
651 | |
// rbx = index of the current column block (compared against rdi at the bottom).
652 | "xor ebx, ebx\t\n" |
653 | "loop_outter%=:\t\n" |
// r14 = number of k steps remaining after the first one.
654 | "mov r14, r8\t\n" |
// zmm31 first holds beta broadcast to all lanes; later it is reused as the
// preload register for the next B values.
655 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
// Load the B values for the first k step (2 x 16 fp16 -> fp32).
656 | "vcvtph2ps zmm11,YMMWORD PTR [r10 + 0]\t\n" |
657 | "vcvtph2ps zmm12,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0 and take the zero_regs path on equality, so C is not read.
// NOTE(review): vcomiss also sets ZF for an unordered (NaN) compare, so a NaN
// beta would take the same branch -- confirm that is intended.
658 | "vxorps xmm0, xmm0, xmm0\t\n" |
659 | "vcomiss xmm31, xmm0\t\n" |
660 | "jz zero_regs%=\t\n" |
661 | |
662 | // Setup values with beta multiplication |
// r12 steps through the C rows (ldc apart) while beta * C is loaded.
663 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
664 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
665 | "add r12, r13\t\n" |
666 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
667 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
668 | "add r12, r13\t\n" |
669 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
670 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
671 | "add r12, r13\t\n" |
672 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
673 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
674 | "add r12, r13\t\n" |
675 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
676 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
// If more k steps follow, preload the first 16 B values of the next step into
// zmm31 (beta is no longer needed at this point).
677 | "test r14,r14\t\n" |
678 | "jz skip_preload%=\t\n" |
679 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
680 | "skip_preload%=:\t\n" |
// First k step: one broadcast A element per row, accumulated onto beta * C.
681 | "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n" |
682 | "vfmadd231ps zmm0,zmm11,zmm10\t\n" |
683 | "vfmadd231ps zmm1,zmm12,zmm10\t\n" |
684 | "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n" |
685 | "vfmadd231ps zmm2,zmm11,zmm10\t\n" |
686 | "vfmadd231ps zmm3,zmm12,zmm10\t\n" |
687 | "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n" |
688 | "vfmadd231ps zmm4,zmm11,zmm10\t\n" |
689 | "vfmadd231ps zmm5,zmm12,zmm10\t\n" |
690 | "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n" |
691 | "vfmadd231ps zmm6,zmm11,zmm10\t\n" |
692 | "vfmadd231ps zmm7,zmm12,zmm10\t\n" |
693 | "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n" |
694 | "vfmadd231ps zmm8,zmm11,zmm10\t\n" |
695 | "vfmadd231ps zmm9,zmm12,zmm10\t\n" |
// Rewind r12 to the first C row of this column block for the store at dump_C.
696 | "mov r12, rcx\t\n" |
// When k == 1 (r14 == 0) skip the loops: advance B past this step and store.
697 | "test r14,r14\t\n" |
698 | "jnz next_inner%=\t\n" |
699 | "add r10,64\t\n" |
700 | "jmp dump_C%=\t\n" |
701 | |
// beta == 0 path: the accumulators are initialised with the first product
// instead of being loaded from C. r12 is still advanced by ldc here although
// C is not read on this path; it is reset from rcx before it is used again.
702 | "zero_regs%=:\t\n" |
703 | |
704 | "test r14,r14\t\n" |
705 | "jz skip_preload_b_zero%=\t\n" |
706 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
707 | "skip_preload_b_zero%=:\t\n" |
708 | "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n" |
709 | "vmulps zmm0,zmm11,zmm10\t\n" |
710 | "vmulps zmm1,zmm12,zmm10\t\n" |
711 | "add r12, r13\t\n" |
712 | "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n" |
713 | "vmulps zmm2,zmm11,zmm10\t\n" |
714 | "vmulps zmm3,zmm12,zmm10\t\n" |
715 | "add r12, r13\t\n" |
716 | "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n" |
717 | "vmulps zmm4,zmm11,zmm10\t\n" |
718 | "vmulps zmm5,zmm12,zmm10\t\n" |
719 | "add r12, r13\t\n" |
720 | "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n" |
721 | "vmulps zmm6,zmm11,zmm10\t\n" |
722 | "vmulps zmm7,zmm12,zmm10\t\n" |
723 | "add r12, r13\t\n" |
724 | "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n" |
725 | "vmulps zmm8,zmm11,zmm10\t\n" |
726 | "vmulps zmm9,zmm12,zmm10\t\n" |
727 | "mov r12, rcx\t\n" |
728 | "test r14,r14\t\n" |
729 | "jnz next_inner%=\t\n" |
730 | "add r10,64\t\n" |
731 | "jmp dump_C%=\t\n" |
732 | |
// Steady-state k step: take the preloaded B half from zmm31, load the other
// half, preload the first half of the following step, then accumulate.
733 | "loop_inner%=:\t\n" |
734 | |
735 | "vmovaps zmm11,zmm31\t\n" |
736 | "vcvtph2ps zmm12,YMMWORD PTR [r10 + 32]\t\n" |
737 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
738 | "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n" |
739 | "vfmadd231ps zmm0,zmm11,zmm10\t\n" |
740 | "vfmadd231ps zmm1,zmm12,zmm10\t\n" |
741 | "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n" |
742 | "vfmadd231ps zmm2,zmm11,zmm10\t\n" |
743 | "vfmadd231ps zmm3,zmm12,zmm10\t\n" |
744 | "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n" |
745 | "vfmadd231ps zmm4,zmm11,zmm10\t\n" |
746 | "vfmadd231ps zmm5,zmm12,zmm10\t\n" |
747 | "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n" |
748 | "vfmadd231ps zmm6,zmm11,zmm10\t\n" |
749 | "vfmadd231ps zmm7,zmm12,zmm10\t\n" |
750 | "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n" |
751 | "vfmadd231ps zmm8,zmm11,zmm10\t\n" |
752 | "vfmadd231ps zmm9,zmm12,zmm10\t\n" |
753 | |
// Advance A by 5 floats and B by 32 fp16 values (64 bytes) per k step.
754 | "next_inner%=:\t\n" |
755 | "add r9,20\t\n" |
756 | "add r10,64\t\n" |
757 | "dec r14\t\n" |
758 | "jnz loop_inner%=\t\n" |
759 | |
// Final k step: same as the loop body but without a further preload.
760 | "vmovaps zmm11,zmm31\t\n" |
761 | "vcvtph2ps zmm12,YMMWORD PTR [r10 + 32]\t\n" |
762 | "vbroadcastss zmm10,DWORD PTR [r9+0]\t\n" |
763 | "vfmadd231ps zmm0,zmm11,zmm10\t\n" |
764 | "vfmadd231ps zmm1,zmm12,zmm10\t\n" |
765 | "vbroadcastss zmm10,DWORD PTR [r9+4]\t\n" |
766 | "vfmadd231ps zmm2,zmm11,zmm10\t\n" |
767 | "vfmadd231ps zmm3,zmm12,zmm10\t\n" |
768 | "vbroadcastss zmm10,DWORD PTR [r9+8]\t\n" |
769 | "vfmadd231ps zmm4,zmm11,zmm10\t\n" |
770 | "vfmadd231ps zmm5,zmm12,zmm10\t\n" |
771 | "vbroadcastss zmm10,DWORD PTR [r9+12]\t\n" |
772 | "vfmadd231ps zmm6,zmm11,zmm10\t\n" |
773 | "vfmadd231ps zmm7,zmm12,zmm10\t\n" |
774 | "vbroadcastss zmm10,DWORD PTR [r9+16]\t\n" |
775 | "vfmadd231ps zmm8,zmm11,zmm10\t\n" |
776 | "vfmadd231ps zmm9,zmm12,zmm10\t\n" |
777 | "add r9,20\t\n" |
778 | "add r10,64\t\n" |
779 | // Dump C |
780 | "dump_C%=:\t\n" |
781 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
782 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
783 | "add r12, r13\t\n" |
784 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
785 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
786 | "add r12, r13\t\n" |
787 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
788 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
789 | "add r12, r13\t\n" |
790 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
791 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
792 | "add r12, r13\t\n" |
793 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
794 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
795 | |
796 | // next outer iteration |
// C moves on by 128 bytes (32 floats) and A is rewound to its start; the B
// pointer (r10) is not reset and keeps advancing.
797 | "add rcx, 128\t\n" |
798 | "mov r12, rcx\t\n" |
799 | "mov r9, rax\t\n" |
800 | "inc rbx\t\n" |
801 | "cmp rbx, rdi\t\n" |
802 | "jl loop_outter%=\t\n" |
803 | : |
804 | : [gp] "rm" (gp) |
// NOTE(review): r11 is listed as clobbered but is not referenced in the asm above.
805 | : "r8" , |
806 | "r9" , |
807 | "r10" , |
808 | "r11" , |
809 | "r13" , |
810 | "r14" , |
811 | "rax" , |
812 | "rcx" , |
813 | "rsi" , |
814 | "rdi" , |
815 | "rbx" , |
816 | "r12" , |
817 | "r15" , |
818 | "memory" ); |
819 | } |
// FP16 GEMM micro-kernel implemented as one inline-asm block (Intel syntax).
// For each of b_block_cols column blocks it produces 6 rows x 32 floats of C
// in the accumulators zmm0-zmm11: B is read as fp16 and widened to fp32 with
// vcvtph2ps, A is read as 32-bit floats (6 per k step) and broadcast, and the
// accumulators are written to C with unaligned stores, rows ldc bytes apart.
// gp is read at fixed byte offsets: k (+0), A (+8), B (+16), beta (+24, read
// through its address), C (+32), ldc (+40), b_block_cols (+48),
// b_block_size (+56).
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written below are not -- confirm that NOINLINE
// plus the calling convention makes this safe on every supported target.
820 | void NOINLINE gemmkernel_6x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
821 | asm volatile( |
// r14 holds the gp pointer while the parameters are unpacked; it is reused
// as the inner-loop counter afterwards.
822 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
823 | "mov %[gp], %%r14\t\n" |
824 | ".intel_syntax noprefix\t\n" |
825 | #else |
826 | "mov r14, %[gp]\t\n" |
827 | #endif |
828 | |
829 | // Copy parameters |
830 | // k |
831 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is performed outside the inner loop.
832 | "dec r8\t\n" |
833 | // A |
834 | "mov r9, [r14 + 8]\t\n" |
835 | // B |
836 | "mov r10, [r14 + 16]\t\n" |
837 | // beta |
// Only the address of beta is taken here; the value is loaded per outer iteration.
838 | "lea r15, [r14 + 24]\t\n" |
839 | // C |
840 | "mov r12, [r14 + 32]\t\n" |
841 | // ldc |
// r13 is added directly to the C byte address below, i.e. it is used as a byte stride.
842 | "mov r13, [r14 + 40]\t\n" |
843 | // b_block_cols |
844 | "mov rdi, [r14 + 48]\t\n" |
845 | // b_block_size |
// NOTE(review): rsi is loaded but not referenced again in this kernel.
846 | "mov rsi, [r14 + 56]\t\n" |
847 | |
848 | // Make copies of A and C |
849 | "mov rax, r9\t\n" |
850 | "mov rcx, r12\t\n" |
851 | |
// rbx = index of the current column block (compared against rdi at the bottom).
852 | "xor ebx, ebx\t\n" |
853 | "loop_outter%=:\t\n" |
// r14 = number of k steps remaining after the first one.
854 | "mov r14, r8\t\n" |
// zmm31 first holds beta broadcast to all lanes; later it is reused as the
// preload register for the next B values.
855 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
// Load the B values for the first k step (2 x 16 fp16 -> fp32).
856 | "vcvtph2ps zmm13,YMMWORD PTR [r10 + 0]\t\n" |
857 | "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0 and take the zero_regs path on equality, so C is not read.
// NOTE(review): vcomiss also sets ZF for an unordered (NaN) compare, so a NaN
// beta would take the same branch -- confirm that is intended.
858 | "vxorps xmm0, xmm0, xmm0\t\n" |
859 | "vcomiss xmm31, xmm0\t\n" |
860 | "jz zero_regs%=\t\n" |
861 | |
862 | // Setup values with beta multiplication |
// r12 steps through the C rows (ldc apart) while beta * C is loaded.
863 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
864 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
865 | "add r12, r13\t\n" |
866 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
867 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
868 | "add r12, r13\t\n" |
869 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
870 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
871 | "add r12, r13\t\n" |
872 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
873 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
874 | "add r12, r13\t\n" |
875 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
876 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
877 | "add r12, r13\t\n" |
878 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
879 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
// If more k steps follow, preload the first 16 B values of the next step into
// zmm31 (beta is no longer needed at this point).
880 | "test r14,r14\t\n" |
881 | "jz skip_preload%=\t\n" |
882 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
883 | "skip_preload%=:\t\n" |
// First k step: one broadcast A element per row, accumulated onto beta * C.
884 | "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n" |
885 | "vfmadd231ps zmm0,zmm13,zmm12\t\n" |
886 | "vfmadd231ps zmm1,zmm14,zmm12\t\n" |
887 | "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n" |
888 | "vfmadd231ps zmm2,zmm13,zmm12\t\n" |
889 | "vfmadd231ps zmm3,zmm14,zmm12\t\n" |
890 | "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n" |
891 | "vfmadd231ps zmm4,zmm13,zmm12\t\n" |
892 | "vfmadd231ps zmm5,zmm14,zmm12\t\n" |
893 | "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n" |
894 | "vfmadd231ps zmm6,zmm13,zmm12\t\n" |
895 | "vfmadd231ps zmm7,zmm14,zmm12\t\n" |
896 | "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n" |
897 | "vfmadd231ps zmm8,zmm13,zmm12\t\n" |
898 | "vfmadd231ps zmm9,zmm14,zmm12\t\n" |
899 | "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n" |
900 | "vfmadd231ps zmm10,zmm13,zmm12\t\n" |
901 | "vfmadd231ps zmm11,zmm14,zmm12\t\n" |
// Rewind r12 to the first C row of this column block for the store at dump_C.
902 | "mov r12, rcx\t\n" |
// When k == 1 (r14 == 0) skip the loops: advance B past this step and store.
903 | "test r14,r14\t\n" |
904 | "jnz next_inner%=\t\n" |
905 | "add r10,64\t\n" |
906 | "jmp dump_C%=\t\n" |
907 | |
// beta == 0 path: the accumulators are initialised with the first product
// instead of being loaded from C. r12 is still advanced by ldc here although
// C is not read on this path; it is reset from rcx before it is used again.
908 | "zero_regs%=:\t\n" |
909 | |
910 | "test r14,r14\t\n" |
911 | "jz skip_preload_b_zero%=\t\n" |
912 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
913 | "skip_preload_b_zero%=:\t\n" |
914 | "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n" |
915 | "vmulps zmm0,zmm13,zmm12\t\n" |
916 | "vmulps zmm1,zmm14,zmm12\t\n" |
917 | "add r12, r13\t\n" |
918 | "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n" |
919 | "vmulps zmm2,zmm13,zmm12\t\n" |
920 | "vmulps zmm3,zmm14,zmm12\t\n" |
921 | "add r12, r13\t\n" |
922 | "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n" |
923 | "vmulps zmm4,zmm13,zmm12\t\n" |
924 | "vmulps zmm5,zmm14,zmm12\t\n" |
925 | "add r12, r13\t\n" |
926 | "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n" |
927 | "vmulps zmm6,zmm13,zmm12\t\n" |
928 | "vmulps zmm7,zmm14,zmm12\t\n" |
929 | "add r12, r13\t\n" |
930 | "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n" |
931 | "vmulps zmm8,zmm13,zmm12\t\n" |
932 | "vmulps zmm9,zmm14,zmm12\t\n" |
933 | "add r12, r13\t\n" |
934 | "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n" |
935 | "vmulps zmm10,zmm13,zmm12\t\n" |
936 | "vmulps zmm11,zmm14,zmm12\t\n" |
937 | "mov r12, rcx\t\n" |
938 | "test r14,r14\t\n" |
939 | "jnz next_inner%=\t\n" |
940 | "add r10,64\t\n" |
941 | "jmp dump_C%=\t\n" |
942 | |
// Steady-state k step: take the preloaded B half from zmm31, load the other
// half, preload the first half of the following step, then accumulate.
943 | "loop_inner%=:\t\n" |
944 | |
945 | "vmovaps zmm13,zmm31\t\n" |
946 | "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n" |
947 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
948 | "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n" |
949 | "vfmadd231ps zmm0,zmm13,zmm12\t\n" |
950 | "vfmadd231ps zmm1,zmm14,zmm12\t\n" |
951 | "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n" |
952 | "vfmadd231ps zmm2,zmm13,zmm12\t\n" |
953 | "vfmadd231ps zmm3,zmm14,zmm12\t\n" |
954 | "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n" |
955 | "vfmadd231ps zmm4,zmm13,zmm12\t\n" |
956 | "vfmadd231ps zmm5,zmm14,zmm12\t\n" |
957 | "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n" |
958 | "vfmadd231ps zmm6,zmm13,zmm12\t\n" |
959 | "vfmadd231ps zmm7,zmm14,zmm12\t\n" |
960 | "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n" |
961 | "vfmadd231ps zmm8,zmm13,zmm12\t\n" |
962 | "vfmadd231ps zmm9,zmm14,zmm12\t\n" |
963 | "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n" |
964 | "vfmadd231ps zmm10,zmm13,zmm12\t\n" |
965 | "vfmadd231ps zmm11,zmm14,zmm12\t\n" |
966 | |
// Advance A by 6 floats and B by 32 fp16 values (64 bytes) per k step.
967 | "next_inner%=:\t\n" |
968 | "add r9,24\t\n" |
969 | "add r10,64\t\n" |
970 | "dec r14\t\n" |
971 | "jnz loop_inner%=\t\n" |
972 | |
// Final k step: same as the loop body but without a further preload.
973 | "vmovaps zmm13,zmm31\t\n" |
974 | "vcvtph2ps zmm14,YMMWORD PTR [r10 + 32]\t\n" |
975 | "vbroadcastss zmm12,DWORD PTR [r9+0]\t\n" |
976 | "vfmadd231ps zmm0,zmm13,zmm12\t\n" |
977 | "vfmadd231ps zmm1,zmm14,zmm12\t\n" |
978 | "vbroadcastss zmm12,DWORD PTR [r9+4]\t\n" |
979 | "vfmadd231ps zmm2,zmm13,zmm12\t\n" |
980 | "vfmadd231ps zmm3,zmm14,zmm12\t\n" |
981 | "vbroadcastss zmm12,DWORD PTR [r9+8]\t\n" |
982 | "vfmadd231ps zmm4,zmm13,zmm12\t\n" |
983 | "vfmadd231ps zmm5,zmm14,zmm12\t\n" |
984 | "vbroadcastss zmm12,DWORD PTR [r9+12]\t\n" |
985 | "vfmadd231ps zmm6,zmm13,zmm12\t\n" |
986 | "vfmadd231ps zmm7,zmm14,zmm12\t\n" |
987 | "vbroadcastss zmm12,DWORD PTR [r9+16]\t\n" |
988 | "vfmadd231ps zmm8,zmm13,zmm12\t\n" |
989 | "vfmadd231ps zmm9,zmm14,zmm12\t\n" |
990 | "vbroadcastss zmm12,DWORD PTR [r9+20]\t\n" |
991 | "vfmadd231ps zmm10,zmm13,zmm12\t\n" |
992 | "vfmadd231ps zmm11,zmm14,zmm12\t\n" |
993 | "add r9,24\t\n" |
994 | "add r10,64\t\n" |
995 | // Dump C |
996 | "dump_C%=:\t\n" |
997 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
998 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
999 | "add r12, r13\t\n" |
1000 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
1001 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
1002 | "add r12, r13\t\n" |
1003 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
1004 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
1005 | "add r12, r13\t\n" |
1006 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
1007 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
1008 | "add r12, r13\t\n" |
1009 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
1010 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
1011 | "add r12, r13\t\n" |
1012 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
1013 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
1014 | |
1015 | // next outer iteration |
// C moves on by 128 bytes (32 floats) and A is rewound to its start; the B
// pointer (r10) is not reset and keeps advancing.
1016 | "add rcx, 128\t\n" |
1017 | "mov r12, rcx\t\n" |
1018 | "mov r9, rax\t\n" |
1019 | "inc rbx\t\n" |
1020 | "cmp rbx, rdi\t\n" |
1021 | "jl loop_outter%=\t\n" |
1022 | : |
1023 | : [gp] "rm" (gp) |
// NOTE(review): r11 is listed as clobbered but is not referenced in the asm above.
1024 | : "r8" , |
1025 | "r9" , |
1026 | "r10" , |
1027 | "r11" , |
1028 | "r13" , |
1029 | "r14" , |
1030 | "rax" , |
1031 | "rcx" , |
1032 | "rsi" , |
1033 | "rdi" , |
1034 | "rbx" , |
1035 | "r12" , |
1036 | "r15" , |
1037 | "memory" ); |
1038 | } |
// gemmkernel_7x2_Avx512_fp16_fA0fB0fC0
//
// AVX-512 inline-assembly GEMM micro-kernel. Each outer iteration produces a
// tile of 7 rows x 2 zmm vectors (2 x 16 fp32 values) of C:
//   C_row = beta * C_row + sum over k of A[k][row] * B[k][0..31]
// B is read as fp16 and widened to fp32 (vcvtph2ps), A is read as fp32
// scalars (vbroadcastss), C is read and written as fp32.
//
// gp points to the parameter block. Its fields are read at fixed byte
// offsets 0..56; the field names used here are the ones given by the
// comments already present in this block. Nothing is returned: results are
// stored through the C pointer.
//
// Register roles, as used by the code below:
//   r8        k - 1
//   r14       gp during setup, then the remaining-k counter
//   r9 / rax  running A pointer / saved start of A
//   r10       running B pointer (never rewound inside this block)
//   r15       address of the beta field inside *gp
//   r12 / rcx running C row pointer / C pointer of the current column block
//   r13       ldc, added to r12 as-is (so it is used as a byte stride)
//   rdi / rbx b_block_cols (outer trip count) / outer loop counter
//   rsi       b_block_size (loaded, not otherwise referenced in this block)
//   zmm0-13   accumulators (row i uses zmm(2i) and zmm(2i+1))
//   zmm14     broadcast A value
//   zmm15/16  first / second B vector of the current k step
//   zmm31     beta at first, then the preloaded first B vector of the next
//             k step
//
// NOTE(review): the k loop is peeled (first step + loop + last step) and the
// counter starts at k - 1, so this appears to require k to be at least 1 -
// confirm with callers.
// NOTE(review): no directive after the asm text restores the assembler
// dialect selected by ".intel_syntax noprefix" - confirm the toolchains that
// take that branch do not need it.
1039 | void NOINLINE gemmkernel_7x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
1040 | asm volatile( |
// Load gp into r14. The hack branch does the load with AT&T operand order
// and then switches to Intel syntax; the other branch is written in Intel
// syntax from the first instruction.
1041 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
1042 | "mov %[gp], %%r14\t\n" |
1043 | ".intel_syntax noprefix\t\n" |
1044 | #else |
1045 | "mov r14, %[gp]\t\n" |
1046 | #endif |
1047 | |
1048 | // Copy parameters |
1049 | // k |
1050 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1 = number of k steps left after the peeled first step
1051 | "dec r8\t\n" |
1052 | // A |
1053 | "mov r9, [r14 + 8]\t\n" |
1054 | // B |
1055 | "mov r10, [r14 + 16]\t\n" |
1056 | // beta |
// lea takes the address of the field; the value is re-read from memory at
// the top of every outer iteration
1057 | "lea r15, [r14 + 24]\t\n" |
1058 | // C |
1059 | "mov r12, [r14 + 32]\t\n" |
1060 | // ldc |
1061 | "mov r13, [r14 + 40]\t\n" |
1062 | // b_block_cols |
1063 | "mov rdi, [r14 + 48]\t\n" |
1064 | // b_block_size |
1065 | "mov rsi, [r14 + 56]\t\n" |
1066 | |
1067 | // Make copies of A and C |
1068 | "mov rax, r9\t\n" |
1069 | "mov rcx, r12\t\n" |
1070 | |
// rbx = index of the current column block
1071 | "xor ebx, ebx\t\n" |
1072 | "loop_outter%=:\t\n" |
// Restart the k counter, broadcast beta, and convert the two 16-element
// fp16 B vectors of the first k step to fp32
1073 | "mov r14, r8\t\n" |
1074 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
1075 | "vcvtph2ps zmm15,YMMWORD PTR [r10 + 0]\t\n" |
1076 | "vcvtph2ps zmm16,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0.0. ZF is set when beta == 0 (vcomiss also sets ZF for
// an unordered compare, so a NaN beta takes the same branch).
1077 | "vxorps xmm0, xmm0, xmm0\t\n" |
1078 | "vcomiss xmm31, xmm0\t\n" |
1079 | "jz zero_regs%=\t\n" |
1080 | |
1081 | // Setup values with beta multiplication |
// accumulators = beta * C for the 7 rows, stepping r12 by r13 per row
1082 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
1083 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
1084 | "add r12, r13\t\n" |
1085 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
1086 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
1087 | "add r12, r13\t\n" |
1088 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
1089 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
1090 | "add r12, r13\t\n" |
1091 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
1092 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
1093 | "add r12, r13\t\n" |
1094 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
1095 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
1096 | "add r12, r13\t\n" |
1097 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
1098 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
1099 | "add r12, r13\t\n" |
1100 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
1101 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
// If more k steps remain, preload the first B vector of the next step into
// zmm31 (beta has been consumed). Skipped when this is the only k step.
1102 | "test r14,r14\t\n" |
1103 | "jz skip_preload%=\t\n" |
1104 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1105 | "skip_preload%=:\t\n" |
// First k step: for each row, broadcast one A value and accumulate B * A
1106 | "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n" |
1107 | "vfmadd231ps zmm0,zmm15,zmm14\t\n" |
1108 | "vfmadd231ps zmm1,zmm16,zmm14\t\n" |
1109 | "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n" |
1110 | "vfmadd231ps zmm2,zmm15,zmm14\t\n" |
1111 | "vfmadd231ps zmm3,zmm16,zmm14\t\n" |
1112 | "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n" |
1113 | "vfmadd231ps zmm4,zmm15,zmm14\t\n" |
1114 | "vfmadd231ps zmm5,zmm16,zmm14\t\n" |
1115 | "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n" |
1116 | "vfmadd231ps zmm6,zmm15,zmm14\t\n" |
1117 | "vfmadd231ps zmm7,zmm16,zmm14\t\n" |
1118 | "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n" |
1119 | "vfmadd231ps zmm8,zmm15,zmm14\t\n" |
1120 | "vfmadd231ps zmm9,zmm16,zmm14\t\n" |
1121 | "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n" |
1122 | "vfmadd231ps zmm10,zmm15,zmm14\t\n" |
1123 | "vfmadd231ps zmm11,zmm16,zmm14\t\n" |
1124 | "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n" |
1125 | "vfmadd231ps zmm12,zmm15,zmm14\t\n" |
1126 | "vfmadd231ps zmm13,zmm16,zmm14\t\n" |
// Rewind r12 to the first C row of this column block for the stores
1127 | "mov r12, rcx\t\n" |
// More k steps left: enter the loop at its pointer increment. Otherwise
// step B past this row and go straight to the stores.
1128 | "test r14,r14\t\n" |
1129 | "jnz next_inner%=\t\n" |
1130 | "add r10,64\t\n" |
1131 | "jmp dump_C%=\t\n" |
1132 | |
// beta == 0 path: C is not read. The accumulators are initialised directly
// with the products of the first k step (vmulps instead of vfmadd231ps).
// The "add r12, r13" steps in this path do not affect the result: r12 is
// reloaded from rcx before it is used again.
1133 | "zero_regs%=:\t\n" |
1134 | |
1135 | "test r14,r14\t\n" |
1136 | "jz skip_preload_b_zero%=\t\n" |
1137 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1138 | "skip_preload_b_zero%=:\t\n" |
1139 | "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n" |
1140 | "vmulps zmm0,zmm15,zmm14\t\n" |
1141 | "vmulps zmm1,zmm16,zmm14\t\n" |
1142 | "add r12, r13\t\n" |
1143 | "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n" |
1144 | "vmulps zmm2,zmm15,zmm14\t\n" |
1145 | "vmulps zmm3,zmm16,zmm14\t\n" |
1146 | "add r12, r13\t\n" |
1147 | "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n" |
1148 | "vmulps zmm4,zmm15,zmm14\t\n" |
1149 | "vmulps zmm5,zmm16,zmm14\t\n" |
1150 | "add r12, r13\t\n" |
1151 | "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n" |
1152 | "vmulps zmm6,zmm15,zmm14\t\n" |
1153 | "vmulps zmm7,zmm16,zmm14\t\n" |
1154 | "add r12, r13\t\n" |
1155 | "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n" |
1156 | "vmulps zmm8,zmm15,zmm14\t\n" |
1157 | "vmulps zmm9,zmm16,zmm14\t\n" |
1158 | "add r12, r13\t\n" |
1159 | "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n" |
1160 | "vmulps zmm10,zmm15,zmm14\t\n" |
1161 | "vmulps zmm11,zmm16,zmm14\t\n" |
1162 | "add r12, r13\t\n" |
1163 | "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n" |
1164 | "vmulps zmm12,zmm15,zmm14\t\n" |
1165 | "vmulps zmm13,zmm16,zmm14\t\n" |
1166 | "mov r12, rcx\t\n" |
1167 | "test r14,r14\t\n" |
1168 | "jnz next_inner%=\t\n" |
1169 | "add r10,64\t\n" |
1170 | "jmp dump_C%=\t\n" |
1171 | |
// Steady-state k step: take the preloaded first B vector from zmm31, convert
// the second B vector, preload the first vector of the following step, then
// update all 7 rows.
1172 | "loop_inner%=:\t\n" |
1173 | |
1174 | "vmovaps zmm15,zmm31\t\n" |
1175 | "vcvtph2ps zmm16,YMMWORD PTR [r10 + 32]\t\n" |
1176 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1177 | "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n" |
1178 | "vfmadd231ps zmm0,zmm15,zmm14\t\n" |
1179 | "vfmadd231ps zmm1,zmm16,zmm14\t\n" |
1180 | "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n" |
1181 | "vfmadd231ps zmm2,zmm15,zmm14\t\n" |
1182 | "vfmadd231ps zmm3,zmm16,zmm14\t\n" |
1183 | "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n" |
1184 | "vfmadd231ps zmm4,zmm15,zmm14\t\n" |
1185 | "vfmadd231ps zmm5,zmm16,zmm14\t\n" |
1186 | "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n" |
1187 | "vfmadd231ps zmm6,zmm15,zmm14\t\n" |
1188 | "vfmadd231ps zmm7,zmm16,zmm14\t\n" |
1189 | "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n" |
1190 | "vfmadd231ps zmm8,zmm15,zmm14\t\n" |
1191 | "vfmadd231ps zmm9,zmm16,zmm14\t\n" |
1192 | "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n" |
1193 | "vfmadd231ps zmm10,zmm15,zmm14\t\n" |
1194 | "vfmadd231ps zmm11,zmm16,zmm14\t\n" |
1195 | "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n" |
1196 | "vfmadd231ps zmm12,zmm15,zmm14\t\n" |
1197 | "vfmadd231ps zmm13,zmm16,zmm14\t\n" |
1198 | |
// Advance A by 7 fp32 values (28 bytes) and B by 32 fp16 values (64 bytes),
// then loop while k steps other than the last one remain
1199 | "next_inner%=:\t\n" |
1200 | "add r9,28\t\n" |
1201 | "add r10,64\t\n" |
1202 | "dec r14\t\n" |
1203 | "jnz loop_inner%=\t\n" |
1204 | |
// Last k step: same as the loop body but without the preload from
// [r10 + 64]
1205 | "vmovaps zmm15,zmm31\t\n" |
1206 | "vcvtph2ps zmm16,YMMWORD PTR [r10 + 32]\t\n" |
1207 | "vbroadcastss zmm14,DWORD PTR [r9+0]\t\n" |
1208 | "vfmadd231ps zmm0,zmm15,zmm14\t\n" |
1209 | "vfmadd231ps zmm1,zmm16,zmm14\t\n" |
1210 | "vbroadcastss zmm14,DWORD PTR [r9+4]\t\n" |
1211 | "vfmadd231ps zmm2,zmm15,zmm14\t\n" |
1212 | "vfmadd231ps zmm3,zmm16,zmm14\t\n" |
1213 | "vbroadcastss zmm14,DWORD PTR [r9+8]\t\n" |
1214 | "vfmadd231ps zmm4,zmm15,zmm14\t\n" |
1215 | "vfmadd231ps zmm5,zmm16,zmm14\t\n" |
1216 | "vbroadcastss zmm14,DWORD PTR [r9+12]\t\n" |
1217 | "vfmadd231ps zmm6,zmm15,zmm14\t\n" |
1218 | "vfmadd231ps zmm7,zmm16,zmm14\t\n" |
1219 | "vbroadcastss zmm14,DWORD PTR [r9+16]\t\n" |
1220 | "vfmadd231ps zmm8,zmm15,zmm14\t\n" |
1221 | "vfmadd231ps zmm9,zmm16,zmm14\t\n" |
1222 | "vbroadcastss zmm14,DWORD PTR [r9+20]\t\n" |
1223 | "vfmadd231ps zmm10,zmm15,zmm14\t\n" |
1224 | "vfmadd231ps zmm11,zmm16,zmm14\t\n" |
1225 | "vbroadcastss zmm14,DWORD PTR [r9+24]\t\n" |
1226 | "vfmadd231ps zmm12,zmm15,zmm14\t\n" |
1227 | "vfmadd231ps zmm13,zmm16,zmm14\t\n" |
1228 | "add r9,28\t\n" |
1229 | "add r10,64\t\n" |
1230 | // Dump C |
// Store the 7 x 2 accumulators to C (unaligned stores), one row every r13
// bytes starting at r12
1231 | "dump_C%=:\t\n" |
1232 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
1233 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
1234 | "add r12, r13\t\n" |
1235 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
1236 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
1237 | "add r12, r13\t\n" |
1238 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
1239 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
1240 | "add r12, r13\t\n" |
1241 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
1242 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
1243 | "add r12, r13\t\n" |
1244 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
1245 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
1246 | "add r12, r13\t\n" |
1247 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
1248 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
1249 | "add r12, r13\t\n" |
1250 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
1251 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
1252 | |
1253 | // next outer iteration |
// C moves right by 128 bytes (32 fp32 values), A is rewound to its start,
// B keeps advancing. The block index is compared with b_block_cols (signed)
// only after the body, so the body always runs at least once.
1254 | "add rcx, 128\t\n" |
1255 | "mov r12, rcx\t\n" |
1256 | "mov r9, rax\t\n" |
1257 | "inc rbx\t\n" |
1258 | "cmp rbx, rdi\t\n" |
1259 | "jl loop_outter%=\t\n" |
// Operands: no outputs, gp is the only input. The clobber list names the
// general-purpose registers written above plus "memory" (C is written
// through r12). r11 is listed although it is not referenced in this block.
// NOTE(review): the zmm registers written above are not listed as
// clobbers - confirm this is intentional.
1260 | : |
1261 | : [gp] "rm" (gp) |
1262 | : "r8" , |
1263 | "r9" , |
1264 | "r10" , |
1265 | "r11" , |
1266 | "r13" , |
1267 | "r14" , |
1268 | "rax" , |
1269 | "rcx" , |
1270 | "rsi" , |
1271 | "rdi" , |
1272 | "rbx" , |
1273 | "r12" , |
1274 | "r15" , |
1275 | "memory" ); |
1276 | } |
// gemmkernel_8x2_Avx512_fp16_fA0fB0fC0
//
// AVX-512 inline-assembly GEMM micro-kernel. Each outer iteration produces a
// tile of 8 rows x 2 zmm vectors (2 x 16 fp32 values) of C:
//   C_row = beta * C_row + sum over k of A[k][row] * B[k][0..31]
// B is read as fp16 and widened to fp32 (vcvtph2ps), A is read as fp32
// scalars (vbroadcastss), C is read and written as fp32.
//
// gp points to the parameter block. Its fields are read at fixed byte
// offsets 0..56; the field names used here are the ones given by the
// comments already present in this block. Nothing is returned: results are
// stored through the C pointer.
//
// Register roles, as used by the code below:
//   r8        k - 1
//   r14       gp during setup, then the remaining-k counter
//   r9 / rax  running A pointer / saved start of A
//   r10       running B pointer (never rewound inside this block)
//   r15       address of the beta field inside *gp
//   r12 / rcx running C row pointer / C pointer of the current column block
//   r13       ldc, added to r12 as-is (so it is used as a byte stride)
//   rdi / rbx b_block_cols (outer trip count) / outer loop counter
//   rsi       b_block_size (loaded, not otherwise referenced in this block)
//   zmm0-15   accumulators (row i uses zmm(2i) and zmm(2i+1))
//   zmm16     broadcast A value
//   zmm17/18  first / second B vector of the current k step
//   zmm31     beta at first, then the preloaded first B vector of the next
//             k step
//
// NOTE(review): the k loop is peeled (first step + loop + last step) and the
// counter starts at k - 1, so this appears to require k to be at least 1 -
// confirm with callers.
// NOTE(review): no directive after the asm text restores the assembler
// dialect selected by ".intel_syntax noprefix" - confirm the toolchains that
// take that branch do not need it.
1277 | void NOINLINE gemmkernel_8x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
1278 | asm volatile( |
// Load gp into r14. The hack branch does the load with AT&T operand order
// and then switches to Intel syntax; the other branch is written in Intel
// syntax from the first instruction.
1279 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
1280 | "mov %[gp], %%r14\t\n" |
1281 | ".intel_syntax noprefix\t\n" |
1282 | #else |
1283 | "mov r14, %[gp]\t\n" |
1284 | #endif |
1285 | |
1286 | // Copy parameters |
1287 | // k |
1288 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1 = number of k steps left after the peeled first step
1289 | "dec r8\t\n" |
1290 | // A |
1291 | "mov r9, [r14 + 8]\t\n" |
1292 | // B |
1293 | "mov r10, [r14 + 16]\t\n" |
1294 | // beta |
// lea takes the address of the field; the value is re-read from memory at
// the top of every outer iteration
1295 | "lea r15, [r14 + 24]\t\n" |
1296 | // C |
1297 | "mov r12, [r14 + 32]\t\n" |
1298 | // ldc |
1299 | "mov r13, [r14 + 40]\t\n" |
1300 | // b_block_cols |
1301 | "mov rdi, [r14 + 48]\t\n" |
1302 | // b_block_size |
1303 | "mov rsi, [r14 + 56]\t\n" |
1304 | |
1305 | // Make copies of A and C |
1306 | "mov rax, r9\t\n" |
1307 | "mov rcx, r12\t\n" |
1308 | |
// rbx = index of the current column block
1309 | "xor ebx, ebx\t\n" |
1310 | "loop_outter%=:\t\n" |
// Restart the k counter, broadcast beta, and convert the two 16-element
// fp16 B vectors of the first k step to fp32
1311 | "mov r14, r8\t\n" |
1312 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
1313 | "vcvtph2ps zmm17,YMMWORD PTR [r10 + 0]\t\n" |
1314 | "vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0.0. ZF is set when beta == 0 (vcomiss also sets ZF for
// an unordered compare, so a NaN beta takes the same branch).
1315 | "vxorps xmm0, xmm0, xmm0\t\n" |
1316 | "vcomiss xmm31, xmm0\t\n" |
1317 | "jz zero_regs%=\t\n" |
1318 | |
1319 | // Setup values with beta multiplication |
// accumulators = beta * C for the 8 rows, stepping r12 by r13 per row
1320 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
1321 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
1322 | "add r12, r13\t\n" |
1323 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
1324 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
1325 | "add r12, r13\t\n" |
1326 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
1327 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
1328 | "add r12, r13\t\n" |
1329 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
1330 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
1331 | "add r12, r13\t\n" |
1332 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
1333 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
1334 | "add r12, r13\t\n" |
1335 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
1336 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
1337 | "add r12, r13\t\n" |
1338 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
1339 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
1340 | "add r12, r13\t\n" |
1341 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
1342 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
// If more k steps remain, preload the first B vector of the next step into
// zmm31 (beta has been consumed). Skipped when this is the only k step.
1343 | "test r14,r14\t\n" |
1344 | "jz skip_preload%=\t\n" |
1345 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1346 | "skip_preload%=:\t\n" |
// First k step: for each row, broadcast one A value and accumulate B * A
1347 | "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n" |
1348 | "vfmadd231ps zmm0,zmm17,zmm16\t\n" |
1349 | "vfmadd231ps zmm1,zmm18,zmm16\t\n" |
1350 | "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n" |
1351 | "vfmadd231ps zmm2,zmm17,zmm16\t\n" |
1352 | "vfmadd231ps zmm3,zmm18,zmm16\t\n" |
1353 | "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n" |
1354 | "vfmadd231ps zmm4,zmm17,zmm16\t\n" |
1355 | "vfmadd231ps zmm5,zmm18,zmm16\t\n" |
1356 | "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n" |
1357 | "vfmadd231ps zmm6,zmm17,zmm16\t\n" |
1358 | "vfmadd231ps zmm7,zmm18,zmm16\t\n" |
1359 | "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n" |
1360 | "vfmadd231ps zmm8,zmm17,zmm16\t\n" |
1361 | "vfmadd231ps zmm9,zmm18,zmm16\t\n" |
1362 | "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n" |
1363 | "vfmadd231ps zmm10,zmm17,zmm16\t\n" |
1364 | "vfmadd231ps zmm11,zmm18,zmm16\t\n" |
1365 | "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n" |
1366 | "vfmadd231ps zmm12,zmm17,zmm16\t\n" |
1367 | "vfmadd231ps zmm13,zmm18,zmm16\t\n" |
1368 | "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n" |
1369 | "vfmadd231ps zmm14,zmm17,zmm16\t\n" |
1370 | "vfmadd231ps zmm15,zmm18,zmm16\t\n" |
// Rewind r12 to the first C row of this column block for the stores
1371 | "mov r12, rcx\t\n" |
// More k steps left: enter the loop at its pointer increment. Otherwise
// step B past this row and go straight to the stores.
1372 | "test r14,r14\t\n" |
1373 | "jnz next_inner%=\t\n" |
1374 | "add r10,64\t\n" |
1375 | "jmp dump_C%=\t\n" |
1376 | |
// beta == 0 path: C is not read. The accumulators are initialised directly
// with the products of the first k step (vmulps instead of vfmadd231ps).
// The "add r12, r13" steps in this path do not affect the result: r12 is
// reloaded from rcx before it is used again.
1377 | "zero_regs%=:\t\n" |
1378 | |
1379 | "test r14,r14\t\n" |
1380 | "jz skip_preload_b_zero%=\t\n" |
1381 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1382 | "skip_preload_b_zero%=:\t\n" |
1383 | "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n" |
1384 | "vmulps zmm0,zmm17,zmm16\t\n" |
1385 | "vmulps zmm1,zmm18,zmm16\t\n" |
1386 | "add r12, r13\t\n" |
1387 | "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n" |
1388 | "vmulps zmm2,zmm17,zmm16\t\n" |
1389 | "vmulps zmm3,zmm18,zmm16\t\n" |
1390 | "add r12, r13\t\n" |
1391 | "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n" |
1392 | "vmulps zmm4,zmm17,zmm16\t\n" |
1393 | "vmulps zmm5,zmm18,zmm16\t\n" |
1394 | "add r12, r13\t\n" |
1395 | "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n" |
1396 | "vmulps zmm6,zmm17,zmm16\t\n" |
1397 | "vmulps zmm7,zmm18,zmm16\t\n" |
1398 | "add r12, r13\t\n" |
1399 | "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n" |
1400 | "vmulps zmm8,zmm17,zmm16\t\n" |
1401 | "vmulps zmm9,zmm18,zmm16\t\n" |
1402 | "add r12, r13\t\n" |
1403 | "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n" |
1404 | "vmulps zmm10,zmm17,zmm16\t\n" |
1405 | "vmulps zmm11,zmm18,zmm16\t\n" |
1406 | "add r12, r13\t\n" |
1407 | "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n" |
1408 | "vmulps zmm12,zmm17,zmm16\t\n" |
1409 | "vmulps zmm13,zmm18,zmm16\t\n" |
1410 | "add r12, r13\t\n" |
1411 | "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n" |
1412 | "vmulps zmm14,zmm17,zmm16\t\n" |
1413 | "vmulps zmm15,zmm18,zmm16\t\n" |
1414 | "mov r12, rcx\t\n" |
1415 | "test r14,r14\t\n" |
1416 | "jnz next_inner%=\t\n" |
1417 | "add r10,64\t\n" |
1418 | "jmp dump_C%=\t\n" |
1419 | |
// Steady-state k step: take the preloaded first B vector from zmm31, convert
// the second B vector, preload the first vector of the following step, then
// update all 8 rows.
1420 | "loop_inner%=:\t\n" |
1421 | |
1422 | "vmovaps zmm17,zmm31\t\n" |
1423 | "vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n" |
1424 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1425 | "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n" |
1426 | "vfmadd231ps zmm0,zmm17,zmm16\t\n" |
1427 | "vfmadd231ps zmm1,zmm18,zmm16\t\n" |
1428 | "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n" |
1429 | "vfmadd231ps zmm2,zmm17,zmm16\t\n" |
1430 | "vfmadd231ps zmm3,zmm18,zmm16\t\n" |
1431 | "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n" |
1432 | "vfmadd231ps zmm4,zmm17,zmm16\t\n" |
1433 | "vfmadd231ps zmm5,zmm18,zmm16\t\n" |
1434 | "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n" |
1435 | "vfmadd231ps zmm6,zmm17,zmm16\t\n" |
1436 | "vfmadd231ps zmm7,zmm18,zmm16\t\n" |
1437 | "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n" |
1438 | "vfmadd231ps zmm8,zmm17,zmm16\t\n" |
1439 | "vfmadd231ps zmm9,zmm18,zmm16\t\n" |
1440 | "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n" |
1441 | "vfmadd231ps zmm10,zmm17,zmm16\t\n" |
1442 | "vfmadd231ps zmm11,zmm18,zmm16\t\n" |
1443 | "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n" |
1444 | "vfmadd231ps zmm12,zmm17,zmm16\t\n" |
1445 | "vfmadd231ps zmm13,zmm18,zmm16\t\n" |
1446 | "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n" |
1447 | "vfmadd231ps zmm14,zmm17,zmm16\t\n" |
1448 | "vfmadd231ps zmm15,zmm18,zmm16\t\n" |
1449 | |
// Advance A by 8 fp32 values (32 bytes) and B by 32 fp16 values (64 bytes),
// then loop while k steps other than the last one remain
1450 | "next_inner%=:\t\n" |
1451 | "add r9,32\t\n" |
1452 | "add r10,64\t\n" |
1453 | "dec r14\t\n" |
1454 | "jnz loop_inner%=\t\n" |
1455 | |
// Last k step: same as the loop body but without the preload from
// [r10 + 64]
1456 | "vmovaps zmm17,zmm31\t\n" |
1457 | "vcvtph2ps zmm18,YMMWORD PTR [r10 + 32]\t\n" |
1458 | "vbroadcastss zmm16,DWORD PTR [r9+0]\t\n" |
1459 | "vfmadd231ps zmm0,zmm17,zmm16\t\n" |
1460 | "vfmadd231ps zmm1,zmm18,zmm16\t\n" |
1461 | "vbroadcastss zmm16,DWORD PTR [r9+4]\t\n" |
1462 | "vfmadd231ps zmm2,zmm17,zmm16\t\n" |
1463 | "vfmadd231ps zmm3,zmm18,zmm16\t\n" |
1464 | "vbroadcastss zmm16,DWORD PTR [r9+8]\t\n" |
1465 | "vfmadd231ps zmm4,zmm17,zmm16\t\n" |
1466 | "vfmadd231ps zmm5,zmm18,zmm16\t\n" |
1467 | "vbroadcastss zmm16,DWORD PTR [r9+12]\t\n" |
1468 | "vfmadd231ps zmm6,zmm17,zmm16\t\n" |
1469 | "vfmadd231ps zmm7,zmm18,zmm16\t\n" |
1470 | "vbroadcastss zmm16,DWORD PTR [r9+16]\t\n" |
1471 | "vfmadd231ps zmm8,zmm17,zmm16\t\n" |
1472 | "vfmadd231ps zmm9,zmm18,zmm16\t\n" |
1473 | "vbroadcastss zmm16,DWORD PTR [r9+20]\t\n" |
1474 | "vfmadd231ps zmm10,zmm17,zmm16\t\n" |
1475 | "vfmadd231ps zmm11,zmm18,zmm16\t\n" |
1476 | "vbroadcastss zmm16,DWORD PTR [r9+24]\t\n" |
1477 | "vfmadd231ps zmm12,zmm17,zmm16\t\n" |
1478 | "vfmadd231ps zmm13,zmm18,zmm16\t\n" |
1479 | "vbroadcastss zmm16,DWORD PTR [r9+28]\t\n" |
1480 | "vfmadd231ps zmm14,zmm17,zmm16\t\n" |
1481 | "vfmadd231ps zmm15,zmm18,zmm16\t\n" |
1482 | "add r9,32\t\n" |
1483 | "add r10,64\t\n" |
1484 | // Dump C |
// Store the 8 x 2 accumulators to C (unaligned stores), one row every r13
// bytes starting at r12
1485 | "dump_C%=:\t\n" |
1486 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
1487 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
1488 | "add r12, r13\t\n" |
1489 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
1490 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
1491 | "add r12, r13\t\n" |
1492 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
1493 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
1494 | "add r12, r13\t\n" |
1495 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
1496 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
1497 | "add r12, r13\t\n" |
1498 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
1499 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
1500 | "add r12, r13\t\n" |
1501 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
1502 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
1503 | "add r12, r13\t\n" |
1504 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
1505 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
1506 | "add r12, r13\t\n" |
1507 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
1508 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
1509 | |
1510 | // next outer iteration |
// C moves right by 128 bytes (32 fp32 values), A is rewound to its start,
// B keeps advancing. The block index is compared with b_block_cols (signed)
// only after the body, so the body always runs at least once.
1511 | "add rcx, 128\t\n" |
1512 | "mov r12, rcx\t\n" |
1513 | "mov r9, rax\t\n" |
1514 | "inc rbx\t\n" |
1515 | "cmp rbx, rdi\t\n" |
1516 | "jl loop_outter%=\t\n" |
// Operands: no outputs, gp is the only input. The clobber list names the
// general-purpose registers written above plus "memory" (C is written
// through r12). r11 is listed although it is not referenced in this block.
// NOTE(review): the zmm registers written above are not listed as
// clobbers - confirm this is intentional.
1517 | : |
1518 | : [gp] "rm" (gp) |
1519 | : "r8" , |
1520 | "r9" , |
1521 | "r10" , |
1522 | "r11" , |
1523 | "r13" , |
1524 | "r14" , |
1525 | "rax" , |
1526 | "rcx" , |
1527 | "rsi" , |
1528 | "rdi" , |
1529 | "rbx" , |
1530 | "r12" , |
1531 | "r15" , |
1532 | "memory" ); |
1533 | } |
// gemmkernel_9x2_Avx512_fp16_fA0fB0fC0
//
// AVX-512 inline-assembly GEMM micro-kernel. Each outer iteration produces a
// tile of 9 rows x 2 zmm vectors (2 x 16 fp32 values) of C:
//   C_row = beta * C_row + sum over k of A[k][row] * B[k][0..31]
// B is read as fp16 and widened to fp32 (vcvtph2ps), A is read as fp32
// scalars (vbroadcastss), C is read and written as fp32.
//
// gp points to the parameter block. Its fields are read at fixed byte
// offsets 0..56; the field names used here are the ones given by the
// comments already present in this block. Nothing is returned: results are
// stored through the C pointer.
//
// Register roles, as used by the code below:
//   r8        k - 1
//   r14       gp during setup, then the remaining-k counter
//   r9 / rax  running A pointer / saved start of A
//   r10       running B pointer (never rewound inside this block)
//   r15       address of the beta field inside *gp
//   r12 / rcx running C row pointer / C pointer of the current column block
//   r13       ldc, added to r12 as-is (so it is used as a byte stride)
//   rdi / rbx b_block_cols (outer trip count) / outer loop counter
//   rsi       b_block_size (loaded, not otherwise referenced in this block)
//   zmm0-17   accumulators (row i uses zmm(2i) and zmm(2i+1))
//   zmm18     broadcast A value
//   zmm19/20  first / second B vector of the current k step
//   zmm31     beta at first, then the preloaded first B vector of the next
//             k step
//
// NOTE(review): the k loop is peeled (first step + loop + last step) and the
// counter starts at k - 1, so this appears to require k to be at least 1 -
// confirm with callers.
// NOTE(review): no directive after the asm text restores the assembler
// dialect selected by ".intel_syntax noprefix" - confirm the toolchains that
// take that branch do not need it.
1534 | void NOINLINE gemmkernel_9x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
1535 | asm volatile( |
// Load gp into r14. The hack branch does the load with AT&T operand order
// and then switches to Intel syntax; the other branch is written in Intel
// syntax from the first instruction.
1536 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
1537 | "mov %[gp], %%r14\t\n" |
1538 | ".intel_syntax noprefix\t\n" |
1539 | #else |
1540 | "mov r14, %[gp]\t\n" |
1541 | #endif |
1542 | |
1543 | // Copy parameters |
1544 | // k |
1545 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1 = number of k steps left after the peeled first step
1546 | "dec r8\t\n" |
1547 | // A |
1548 | "mov r9, [r14 + 8]\t\n" |
1549 | // B |
1550 | "mov r10, [r14 + 16]\t\n" |
1551 | // beta |
// lea takes the address of the field; the value is re-read from memory at
// the top of every outer iteration
1552 | "lea r15, [r14 + 24]\t\n" |
1553 | // C |
1554 | "mov r12, [r14 + 32]\t\n" |
1555 | // ldc |
1556 | "mov r13, [r14 + 40]\t\n" |
1557 | // b_block_cols |
1558 | "mov rdi, [r14 + 48]\t\n" |
1559 | // b_block_size |
1560 | "mov rsi, [r14 + 56]\t\n" |
1561 | |
1562 | // Make copies of A and C |
1563 | "mov rax, r9\t\n" |
1564 | "mov rcx, r12\t\n" |
1565 | |
// rbx = index of the current column block
1566 | "xor ebx, ebx\t\n" |
1567 | "loop_outter%=:\t\n" |
// Restart the k counter, broadcast beta, and convert the two 16-element
// fp16 B vectors of the first k step to fp32
1568 | "mov r14, r8\t\n" |
1569 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
1570 | "vcvtph2ps zmm19,YMMWORD PTR [r10 + 0]\t\n" |
1571 | "vcvtph2ps zmm20,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0.0. ZF is set when beta == 0 (vcomiss also sets ZF for
// an unordered compare, so a NaN beta takes the same branch).
1572 | "vxorps xmm0, xmm0, xmm0\t\n" |
1573 | "vcomiss xmm31, xmm0\t\n" |
1574 | "jz zero_regs%=\t\n" |
1575 | |
1576 | // Setup values with beta multiplication |
// accumulators = beta * C for the 9 rows, stepping r12 by r13 per row
1577 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
1578 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
1579 | "add r12, r13\t\n" |
1580 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
1581 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
1582 | "add r12, r13\t\n" |
1583 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
1584 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
1585 | "add r12, r13\t\n" |
1586 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
1587 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
1588 | "add r12, r13\t\n" |
1589 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
1590 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
1591 | "add r12, r13\t\n" |
1592 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
1593 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
1594 | "add r12, r13\t\n" |
1595 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
1596 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
1597 | "add r12, r13\t\n" |
1598 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
1599 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
1600 | "add r12, r13\t\n" |
1601 | "vmulps zmm16, zmm31, [r12 + 0]\t\n" |
1602 | "vmulps zmm17, zmm31, [r12 + 64]\t\n" |
// If more k steps remain, preload the first B vector of the next step into
// zmm31 (beta has been consumed). Skipped when this is the only k step.
1603 | "test r14,r14\t\n" |
1604 | "jz skip_preload%=\t\n" |
1605 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1606 | "skip_preload%=:\t\n" |
// First k step: for each row, broadcast one A value and accumulate B * A
1607 | "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n" |
1608 | "vfmadd231ps zmm0,zmm19,zmm18\t\n" |
1609 | "vfmadd231ps zmm1,zmm20,zmm18\t\n" |
1610 | "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n" |
1611 | "vfmadd231ps zmm2,zmm19,zmm18\t\n" |
1612 | "vfmadd231ps zmm3,zmm20,zmm18\t\n" |
1613 | "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n" |
1614 | "vfmadd231ps zmm4,zmm19,zmm18\t\n" |
1615 | "vfmadd231ps zmm5,zmm20,zmm18\t\n" |
1616 | "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n" |
1617 | "vfmadd231ps zmm6,zmm19,zmm18\t\n" |
1618 | "vfmadd231ps zmm7,zmm20,zmm18\t\n" |
1619 | "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n" |
1620 | "vfmadd231ps zmm8,zmm19,zmm18\t\n" |
1621 | "vfmadd231ps zmm9,zmm20,zmm18\t\n" |
1622 | "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n" |
1623 | "vfmadd231ps zmm10,zmm19,zmm18\t\n" |
1624 | "vfmadd231ps zmm11,zmm20,zmm18\t\n" |
1625 | "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n" |
1626 | "vfmadd231ps zmm12,zmm19,zmm18\t\n" |
1627 | "vfmadd231ps zmm13,zmm20,zmm18\t\n" |
1628 | "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n" |
1629 | "vfmadd231ps zmm14,zmm19,zmm18\t\n" |
1630 | "vfmadd231ps zmm15,zmm20,zmm18\t\n" |
1631 | "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n" |
1632 | "vfmadd231ps zmm16,zmm19,zmm18\t\n" |
1633 | "vfmadd231ps zmm17,zmm20,zmm18\t\n" |
// Rewind r12 to the first C row of this column block for the stores
1634 | "mov r12, rcx\t\n" |
// More k steps left: enter the loop at its pointer increment. Otherwise
// step B past this row and go straight to the stores.
1635 | "test r14,r14\t\n" |
1636 | "jnz next_inner%=\t\n" |
1637 | "add r10,64\t\n" |
1638 | "jmp dump_C%=\t\n" |
1639 | |
// beta == 0 path: C is not read. The accumulators are initialised directly
// with the products of the first k step (vmulps instead of vfmadd231ps).
// The "add r12, r13" steps in this path do not affect the result: r12 is
// reloaded from rcx before it is used again.
1640 | "zero_regs%=:\t\n" |
1641 | |
1642 | "test r14,r14\t\n" |
1643 | "jz skip_preload_b_zero%=\t\n" |
1644 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1645 | "skip_preload_b_zero%=:\t\n" |
1646 | "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n" |
1647 | "vmulps zmm0,zmm19,zmm18\t\n" |
1648 | "vmulps zmm1,zmm20,zmm18\t\n" |
1649 | "add r12, r13\t\n" |
1650 | "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n" |
1651 | "vmulps zmm2,zmm19,zmm18\t\n" |
1652 | "vmulps zmm3,zmm20,zmm18\t\n" |
1653 | "add r12, r13\t\n" |
1654 | "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n" |
1655 | "vmulps zmm4,zmm19,zmm18\t\n" |
1656 | "vmulps zmm5,zmm20,zmm18\t\n" |
1657 | "add r12, r13\t\n" |
1658 | "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n" |
1659 | "vmulps zmm6,zmm19,zmm18\t\n" |
1660 | "vmulps zmm7,zmm20,zmm18\t\n" |
1661 | "add r12, r13\t\n" |
1662 | "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n" |
1663 | "vmulps zmm8,zmm19,zmm18\t\n" |
1664 | "vmulps zmm9,zmm20,zmm18\t\n" |
1665 | "add r12, r13\t\n" |
1666 | "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n" |
1667 | "vmulps zmm10,zmm19,zmm18\t\n" |
1668 | "vmulps zmm11,zmm20,zmm18\t\n" |
1669 | "add r12, r13\t\n" |
1670 | "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n" |
1671 | "vmulps zmm12,zmm19,zmm18\t\n" |
1672 | "vmulps zmm13,zmm20,zmm18\t\n" |
1673 | "add r12, r13\t\n" |
1674 | "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n" |
1675 | "vmulps zmm14,zmm19,zmm18\t\n" |
1676 | "vmulps zmm15,zmm20,zmm18\t\n" |
1677 | "add r12, r13\t\n" |
1678 | "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n" |
1679 | "vmulps zmm16,zmm19,zmm18\t\n" |
1680 | "vmulps zmm17,zmm20,zmm18\t\n" |
1681 | "mov r12, rcx\t\n" |
1682 | "test r14,r14\t\n" |
1683 | "jnz next_inner%=\t\n" |
1684 | "add r10,64\t\n" |
1685 | "jmp dump_C%=\t\n" |
1686 | |
// Steady-state k step: take the preloaded first B vector from zmm31, convert
// the second B vector, preload the first vector of the following step, then
// update all 9 rows.
1687 | "loop_inner%=:\t\n" |
1688 | |
1689 | "vmovaps zmm19,zmm31\t\n" |
1690 | "vcvtph2ps zmm20,YMMWORD PTR [r10 + 32]\t\n" |
1691 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1692 | "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n" |
1693 | "vfmadd231ps zmm0,zmm19,zmm18\t\n" |
1694 | "vfmadd231ps zmm1,zmm20,zmm18\t\n" |
1695 | "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n" |
1696 | "vfmadd231ps zmm2,zmm19,zmm18\t\n" |
1697 | "vfmadd231ps zmm3,zmm20,zmm18\t\n" |
1698 | "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n" |
1699 | "vfmadd231ps zmm4,zmm19,zmm18\t\n" |
1700 | "vfmadd231ps zmm5,zmm20,zmm18\t\n" |
1701 | "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n" |
1702 | "vfmadd231ps zmm6,zmm19,zmm18\t\n" |
1703 | "vfmadd231ps zmm7,zmm20,zmm18\t\n" |
1704 | "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n" |
1705 | "vfmadd231ps zmm8,zmm19,zmm18\t\n" |
1706 | "vfmadd231ps zmm9,zmm20,zmm18\t\n" |
1707 | "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n" |
1708 | "vfmadd231ps zmm10,zmm19,zmm18\t\n" |
1709 | "vfmadd231ps zmm11,zmm20,zmm18\t\n" |
1710 | "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n" |
1711 | "vfmadd231ps zmm12,zmm19,zmm18\t\n" |
1712 | "vfmadd231ps zmm13,zmm20,zmm18\t\n" |
1713 | "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n" |
1714 | "vfmadd231ps zmm14,zmm19,zmm18\t\n" |
1715 | "vfmadd231ps zmm15,zmm20,zmm18\t\n" |
1716 | "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n" |
1717 | "vfmadd231ps zmm16,zmm19,zmm18\t\n" |
1718 | "vfmadd231ps zmm17,zmm20,zmm18\t\n" |
1719 | |
// Advance A by 9 fp32 values (36 bytes) and B by 32 fp16 values (64 bytes),
// then loop while k steps other than the last one remain
1720 | "next_inner%=:\t\n" |
1721 | "add r9,36\t\n" |
1722 | "add r10,64\t\n" |
1723 | "dec r14\t\n" |
1724 | "jnz loop_inner%=\t\n" |
1725 | |
// Last k step: same as the loop body but without the preload from
// [r10 + 64]
1726 | "vmovaps zmm19,zmm31\t\n" |
1727 | "vcvtph2ps zmm20,YMMWORD PTR [r10 + 32]\t\n" |
1728 | "vbroadcastss zmm18,DWORD PTR [r9+0]\t\n" |
1729 | "vfmadd231ps zmm0,zmm19,zmm18\t\n" |
1730 | "vfmadd231ps zmm1,zmm20,zmm18\t\n" |
1731 | "vbroadcastss zmm18,DWORD PTR [r9+4]\t\n" |
1732 | "vfmadd231ps zmm2,zmm19,zmm18\t\n" |
1733 | "vfmadd231ps zmm3,zmm20,zmm18\t\n" |
1734 | "vbroadcastss zmm18,DWORD PTR [r9+8]\t\n" |
1735 | "vfmadd231ps zmm4,zmm19,zmm18\t\n" |
1736 | "vfmadd231ps zmm5,zmm20,zmm18\t\n" |
1737 | "vbroadcastss zmm18,DWORD PTR [r9+12]\t\n" |
1738 | "vfmadd231ps zmm6,zmm19,zmm18\t\n" |
1739 | "vfmadd231ps zmm7,zmm20,zmm18\t\n" |
1740 | "vbroadcastss zmm18,DWORD PTR [r9+16]\t\n" |
1741 | "vfmadd231ps zmm8,zmm19,zmm18\t\n" |
1742 | "vfmadd231ps zmm9,zmm20,zmm18\t\n" |
1743 | "vbroadcastss zmm18,DWORD PTR [r9+20]\t\n" |
1744 | "vfmadd231ps zmm10,zmm19,zmm18\t\n" |
1745 | "vfmadd231ps zmm11,zmm20,zmm18\t\n" |
1746 | "vbroadcastss zmm18,DWORD PTR [r9+24]\t\n" |
1747 | "vfmadd231ps zmm12,zmm19,zmm18\t\n" |
1748 | "vfmadd231ps zmm13,zmm20,zmm18\t\n" |
1749 | "vbroadcastss zmm18,DWORD PTR [r9+28]\t\n" |
1750 | "vfmadd231ps zmm14,zmm19,zmm18\t\n" |
1751 | "vfmadd231ps zmm15,zmm20,zmm18\t\n" |
1752 | "vbroadcastss zmm18,DWORD PTR [r9+32]\t\n" |
1753 | "vfmadd231ps zmm16,zmm19,zmm18\t\n" |
1754 | "vfmadd231ps zmm17,zmm20,zmm18\t\n" |
1755 | "add r9,36\t\n" |
1756 | "add r10,64\t\n" |
1757 | // Dump C |
// Store the 9 x 2 accumulators to C (unaligned stores), one row every r13
// bytes starting at r12
1758 | "dump_C%=:\t\n" |
1759 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
1760 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
1761 | "add r12, r13\t\n" |
1762 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
1763 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
1764 | "add r12, r13\t\n" |
1765 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
1766 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
1767 | "add r12, r13\t\n" |
1768 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
1769 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
1770 | "add r12, r13\t\n" |
1771 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
1772 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
1773 | "add r12, r13\t\n" |
1774 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
1775 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
1776 | "add r12, r13\t\n" |
1777 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
1778 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
1779 | "add r12, r13\t\n" |
1780 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
1781 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
1782 | "add r12, r13\t\n" |
1783 | "vmovups zmmword PTR [r12 + 0], zmm16\t\n" |
1784 | "vmovups zmmword PTR [r12 + 64], zmm17\t\n" |
1785 | |
1786 | // next outer iteration |
// C moves right by 128 bytes (32 fp32 values), A is rewound to its start,
// B keeps advancing. The block index is compared with b_block_cols (signed)
// only after the body, so the body always runs at least once.
1787 | "add rcx, 128\t\n" |
1788 | "mov r12, rcx\t\n" |
1789 | "mov r9, rax\t\n" |
1790 | "inc rbx\t\n" |
1791 | "cmp rbx, rdi\t\n" |
1792 | "jl loop_outter%=\t\n" |
// Operands: no outputs, gp is the only input. The clobber list names the
// general-purpose registers written above plus "memory" (C is written
// through r12). r11 is listed although it is not referenced in this block.
// NOTE(review): the zmm registers written above are not listed as
// clobbers - confirm this is intentional.
1793 | : |
1794 | : [gp] "rm" (gp) |
1795 | : "r8" , |
1796 | "r9" , |
1797 | "r10" , |
1798 | "r11" , |
1799 | "r13" , |
1800 | "r14" , |
1801 | "rax" , |
1802 | "rcx" , |
1803 | "rsi" , |
1804 | "rdi" , |
1805 | "rbx" , |
1806 | "r12" , |
1807 | "r15" , |
1808 | "memory" ); |
1809 | } |
1810 | void NOINLINE gemmkernel_10x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
1811 | asm volatile( |
1812 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
1813 | "mov %[gp], %%r14\t\n" |
1814 | ".intel_syntax noprefix\t\n" |
1815 | #else |
1816 | "mov r14, %[gp]\t\n" |
1817 | #endif |
1818 | |
1819 | // Copy parameters |
1820 | // k |
1821 | "mov r8, [r14 + 0]\t\n" |
1822 | "dec r8\t\n" |
1823 | // A |
1824 | "mov r9, [r14 + 8]\t\n" |
1825 | // B |
1826 | "mov r10, [r14 + 16]\t\n" |
1827 | // beta |
1828 | "lea r15, [r14 + 24]\t\n" |
1829 | // C |
1830 | "mov r12, [r14 + 32]\t\n" |
1831 | // ldc |
1832 | "mov r13, [r14 + 40]\t\n" |
1833 | // b_block_cols |
1834 | "mov rdi, [r14 + 48]\t\n" |
1835 | // b_block_size |
1836 | "mov rsi, [r14 + 56]\t\n" |
1837 | |
1838 | // Make copies of A and C |
1839 | "mov rax, r9\t\n" |
1840 | "mov rcx, r12\t\n" |
1841 | |
1842 | "xor ebx, ebx\t\n" |
1843 | "loop_outter%=:\t\n" |
1844 | "mov r14, r8\t\n" |
1845 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
1846 | "vcvtph2ps zmm21,YMMWORD PTR [r10 + 0]\t\n" |
1847 | "vcvtph2ps zmm22,YMMWORD PTR [r10 + 32]\t\n" |
1848 | "vxorps xmm0, xmm0, xmm0\t\n" |
1849 | "vcomiss xmm31, xmm0\t\n" |
1850 | "jz zero_regs%=\t\n" |
1851 | |
1852 | // Setup values with beta multiplication |
1853 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
1854 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
1855 | "add r12, r13\t\n" |
1856 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
1857 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
1858 | "add r12, r13\t\n" |
1859 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
1860 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
1861 | "add r12, r13\t\n" |
1862 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
1863 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
1864 | "add r12, r13\t\n" |
1865 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
1866 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
1867 | "add r12, r13\t\n" |
1868 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
1869 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
1870 | "add r12, r13\t\n" |
1871 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
1872 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
1873 | "add r12, r13\t\n" |
1874 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
1875 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
1876 | "add r12, r13\t\n" |
1877 | "vmulps zmm16, zmm31, [r12 + 0]\t\n" |
1878 | "vmulps zmm17, zmm31, [r12 + 64]\t\n" |
1879 | "add r12, r13\t\n" |
1880 | "vmulps zmm18, zmm31, [r12 + 0]\t\n" |
1881 | "vmulps zmm19, zmm31, [r12 + 64]\t\n" |
1882 | "test r14,r14\t\n" |
1883 | "jz skip_preload%=\t\n" |
1884 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1885 | "skip_preload%=:\t\n" |
1886 | "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n" |
1887 | "vfmadd231ps zmm0,zmm21,zmm20\t\n" |
1888 | "vfmadd231ps zmm1,zmm22,zmm20\t\n" |
1889 | "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n" |
1890 | "vfmadd231ps zmm2,zmm21,zmm20\t\n" |
1891 | "vfmadd231ps zmm3,zmm22,zmm20\t\n" |
1892 | "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n" |
1893 | "vfmadd231ps zmm4,zmm21,zmm20\t\n" |
1894 | "vfmadd231ps zmm5,zmm22,zmm20\t\n" |
1895 | "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n" |
1896 | "vfmadd231ps zmm6,zmm21,zmm20\t\n" |
1897 | "vfmadd231ps zmm7,zmm22,zmm20\t\n" |
1898 | "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n" |
1899 | "vfmadd231ps zmm8,zmm21,zmm20\t\n" |
1900 | "vfmadd231ps zmm9,zmm22,zmm20\t\n" |
1901 | "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n" |
1902 | "vfmadd231ps zmm10,zmm21,zmm20\t\n" |
1903 | "vfmadd231ps zmm11,zmm22,zmm20\t\n" |
1904 | "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n" |
1905 | "vfmadd231ps zmm12,zmm21,zmm20\t\n" |
1906 | "vfmadd231ps zmm13,zmm22,zmm20\t\n" |
1907 | "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n" |
1908 | "vfmadd231ps zmm14,zmm21,zmm20\t\n" |
1909 | "vfmadd231ps zmm15,zmm22,zmm20\t\n" |
1910 | "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n" |
1911 | "vfmadd231ps zmm16,zmm21,zmm20\t\n" |
1912 | "vfmadd231ps zmm17,zmm22,zmm20\t\n" |
1913 | "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n" |
1914 | "vfmadd231ps zmm18,zmm21,zmm20\t\n" |
1915 | "vfmadd231ps zmm19,zmm22,zmm20\t\n" |
1916 | "mov r12, rcx\t\n" |
1917 | "test r14,r14\t\n" |
1918 | "jnz next_inner%=\t\n" |
1919 | "add r10,64\t\n" |
1920 | "jmp dump_C%=\t\n" |
1921 | |
1922 | "zero_regs%=:\t\n" |
1923 | |
1924 | "test r14,r14\t\n" |
1925 | "jz skip_preload_b_zero%=\t\n" |
1926 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1927 | "skip_preload_b_zero%=:\t\n" |
1928 | "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n" |
1929 | "vmulps zmm0,zmm21,zmm20\t\n" |
1930 | "vmulps zmm1,zmm22,zmm20\t\n" |
1931 | "add r12, r13\t\n" |
1932 | "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n" |
1933 | "vmulps zmm2,zmm21,zmm20\t\n" |
1934 | "vmulps zmm3,zmm22,zmm20\t\n" |
1935 | "add r12, r13\t\n" |
1936 | "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n" |
1937 | "vmulps zmm4,zmm21,zmm20\t\n" |
1938 | "vmulps zmm5,zmm22,zmm20\t\n" |
1939 | "add r12, r13\t\n" |
1940 | "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n" |
1941 | "vmulps zmm6,zmm21,zmm20\t\n" |
1942 | "vmulps zmm7,zmm22,zmm20\t\n" |
1943 | "add r12, r13\t\n" |
1944 | "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n" |
1945 | "vmulps zmm8,zmm21,zmm20\t\n" |
1946 | "vmulps zmm9,zmm22,zmm20\t\n" |
1947 | "add r12, r13\t\n" |
1948 | "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n" |
1949 | "vmulps zmm10,zmm21,zmm20\t\n" |
1950 | "vmulps zmm11,zmm22,zmm20\t\n" |
1951 | "add r12, r13\t\n" |
1952 | "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n" |
1953 | "vmulps zmm12,zmm21,zmm20\t\n" |
1954 | "vmulps zmm13,zmm22,zmm20\t\n" |
1955 | "add r12, r13\t\n" |
1956 | "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n" |
1957 | "vmulps zmm14,zmm21,zmm20\t\n" |
1958 | "vmulps zmm15,zmm22,zmm20\t\n" |
1959 | "add r12, r13\t\n" |
1960 | "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n" |
1961 | "vmulps zmm16,zmm21,zmm20\t\n" |
1962 | "vmulps zmm17,zmm22,zmm20\t\n" |
1963 | "add r12, r13\t\n" |
1964 | "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n" |
1965 | "vmulps zmm18,zmm21,zmm20\t\n" |
1966 | "vmulps zmm19,zmm22,zmm20\t\n" |
1967 | "mov r12, rcx\t\n" |
1968 | "test r14,r14\t\n" |
1969 | "jnz next_inner%=\t\n" |
1970 | "add r10,64\t\n" |
1971 | "jmp dump_C%=\t\n" |
1972 | |
1973 | "loop_inner%=:\t\n" |
1974 | |
1975 | "vmovaps zmm21,zmm31\t\n" |
1976 | "vcvtph2ps zmm22,YMMWORD PTR [r10 + 32]\t\n" |
1977 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
1978 | "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n" |
1979 | "vfmadd231ps zmm0,zmm21,zmm20\t\n" |
1980 | "vfmadd231ps zmm1,zmm22,zmm20\t\n" |
1981 | "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n" |
1982 | "vfmadd231ps zmm2,zmm21,zmm20\t\n" |
1983 | "vfmadd231ps zmm3,zmm22,zmm20\t\n" |
1984 | "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n" |
1985 | "vfmadd231ps zmm4,zmm21,zmm20\t\n" |
1986 | "vfmadd231ps zmm5,zmm22,zmm20\t\n" |
1987 | "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n" |
1988 | "vfmadd231ps zmm6,zmm21,zmm20\t\n" |
1989 | "vfmadd231ps zmm7,zmm22,zmm20\t\n" |
1990 | "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n" |
1991 | "vfmadd231ps zmm8,zmm21,zmm20\t\n" |
1992 | "vfmadd231ps zmm9,zmm22,zmm20\t\n" |
1993 | "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n" |
1994 | "vfmadd231ps zmm10,zmm21,zmm20\t\n" |
1995 | "vfmadd231ps zmm11,zmm22,zmm20\t\n" |
1996 | "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n" |
1997 | "vfmadd231ps zmm12,zmm21,zmm20\t\n" |
1998 | "vfmadd231ps zmm13,zmm22,zmm20\t\n" |
1999 | "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n" |
2000 | "vfmadd231ps zmm14,zmm21,zmm20\t\n" |
2001 | "vfmadd231ps zmm15,zmm22,zmm20\t\n" |
2002 | "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n" |
2003 | "vfmadd231ps zmm16,zmm21,zmm20\t\n" |
2004 | "vfmadd231ps zmm17,zmm22,zmm20\t\n" |
2005 | "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n" |
2006 | "vfmadd231ps zmm18,zmm21,zmm20\t\n" |
2007 | "vfmadd231ps zmm19,zmm22,zmm20\t\n" |
2008 | |
2009 | "next_inner%=:\t\n" |
2010 | "add r9,40\t\n" |
2011 | "add r10,64\t\n" |
2012 | "dec r14\t\n" |
2013 | "jnz loop_inner%=\t\n" |
2014 | |
2015 | "vmovaps zmm21,zmm31\t\n" |
2016 | "vcvtph2ps zmm22,YMMWORD PTR [r10 + 32]\t\n" |
2017 | "vbroadcastss zmm20,DWORD PTR [r9+0]\t\n" |
2018 | "vfmadd231ps zmm0,zmm21,zmm20\t\n" |
2019 | "vfmadd231ps zmm1,zmm22,zmm20\t\n" |
2020 | "vbroadcastss zmm20,DWORD PTR [r9+4]\t\n" |
2021 | "vfmadd231ps zmm2,zmm21,zmm20\t\n" |
2022 | "vfmadd231ps zmm3,zmm22,zmm20\t\n" |
2023 | "vbroadcastss zmm20,DWORD PTR [r9+8]\t\n" |
2024 | "vfmadd231ps zmm4,zmm21,zmm20\t\n" |
2025 | "vfmadd231ps zmm5,zmm22,zmm20\t\n" |
2026 | "vbroadcastss zmm20,DWORD PTR [r9+12]\t\n" |
2027 | "vfmadd231ps zmm6,zmm21,zmm20\t\n" |
2028 | "vfmadd231ps zmm7,zmm22,zmm20\t\n" |
2029 | "vbroadcastss zmm20,DWORD PTR [r9+16]\t\n" |
2030 | "vfmadd231ps zmm8,zmm21,zmm20\t\n" |
2031 | "vfmadd231ps zmm9,zmm22,zmm20\t\n" |
2032 | "vbroadcastss zmm20,DWORD PTR [r9+20]\t\n" |
2033 | "vfmadd231ps zmm10,zmm21,zmm20\t\n" |
2034 | "vfmadd231ps zmm11,zmm22,zmm20\t\n" |
2035 | "vbroadcastss zmm20,DWORD PTR [r9+24]\t\n" |
2036 | "vfmadd231ps zmm12,zmm21,zmm20\t\n" |
2037 | "vfmadd231ps zmm13,zmm22,zmm20\t\n" |
2038 | "vbroadcastss zmm20,DWORD PTR [r9+28]\t\n" |
2039 | "vfmadd231ps zmm14,zmm21,zmm20\t\n" |
2040 | "vfmadd231ps zmm15,zmm22,zmm20\t\n" |
2041 | "vbroadcastss zmm20,DWORD PTR [r9+32]\t\n" |
2042 | "vfmadd231ps zmm16,zmm21,zmm20\t\n" |
2043 | "vfmadd231ps zmm17,zmm22,zmm20\t\n" |
2044 | "vbroadcastss zmm20,DWORD PTR [r9+36]\t\n" |
2045 | "vfmadd231ps zmm18,zmm21,zmm20\t\n" |
2046 | "vfmadd231ps zmm19,zmm22,zmm20\t\n" |
2047 | "add r9,40\t\n" |
2048 | "add r10,64\t\n" |
2049 | // Dump C |
2050 | "dump_C%=:\t\n" |
2051 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
2052 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
2053 | "add r12, r13\t\n" |
2054 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
2055 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
2056 | "add r12, r13\t\n" |
2057 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
2058 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
2059 | "add r12, r13\t\n" |
2060 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
2061 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
2062 | "add r12, r13\t\n" |
2063 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
2064 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
2065 | "add r12, r13\t\n" |
2066 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
2067 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
2068 | "add r12, r13\t\n" |
2069 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
2070 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
2071 | "add r12, r13\t\n" |
2072 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
2073 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
2074 | "add r12, r13\t\n" |
2075 | "vmovups zmmword PTR [r12 + 0], zmm16\t\n" |
2076 | "vmovups zmmword PTR [r12 + 64], zmm17\t\n" |
2077 | "add r12, r13\t\n" |
2078 | "vmovups zmmword PTR [r12 + 0], zmm18\t\n" |
2079 | "vmovups zmmword PTR [r12 + 64], zmm19\t\n" |
2080 | |
2081 | // next outer iteration |
2082 | "add rcx, 128\t\n" |
2083 | "mov r12, rcx\t\n" |
2084 | "mov r9, rax\t\n" |
2085 | "inc rbx\t\n" |
2086 | "cmp rbx, rdi\t\n" |
2087 | "jl loop_outter%=\t\n" |
2088 | : |
2089 | : [gp] "rm" (gp) |
2090 | : "r8" , |
2091 | "r9" , |
2092 | "r10" , |
2093 | "r11" , |
2094 | "r13" , |
2095 | "r14" , |
2096 | "rax" , |
2097 | "rcx" , |
2098 | "rsi" , |
2099 | "rdi" , |
2100 | "rbx" , |
2101 | "r12" , |
2102 | "r15" , |
2103 | "memory" ); |
2104 | } |
// gemmkernel_11x2_Avx512_fp16_fA0fB0fC0: the whole function body is a single
// inline-asm statement written in Intel syntax.
//
// What the assembly below does:
//  * gp is read at fixed byte offsets: +0 k, +8 A, +16 B, +24 beta (only its
//    address is taken; the value is loaded as one 32-bit float), +32 C,
//    +40 ldc, +48 b_block_cols, +56 b_block_size. The field names are the ones
//    used by the comments in the code; the struct is declared elsewhere.
//  * One outer iteration works on a tile of C that is 11 rows by 32 floats,
//    kept in zmm0..zmm21 (two registers per row). Consecutive rows are r13
//    bytes apart, because r13 (ldc) is added to the C pointer unscaled.
//  * Each k step reads 11 consecutive floats of A (r9 then advances by 44
//    bytes) and 64 bytes of B, which vcvtph2ps converts from half to single
//    precision into zmm23/zmm24 (r10 then advances by 64 bytes).
//  * zmm22 holds the A element that is currently broadcast. zmm31 first holds
//    beta and is afterwards reused to preload the next 32 bytes of B.
//  * When beta compares equal to 0.0f, C is never read and the tile starts
//    from the first product; otherwise the tile starts as beta * C.
//  * The outer loop body runs before its condition is tested, so it executes
//    at least once. It repeats while rbx < b_block_cols (signed compare),
//    moving the C tile 128 bytes to the right each time. A is rewound to its
//    start on every outer iteration; B keeps advancing.
//
// NOTE(review): assumes k >= 1. With k == 0 the "dec r8" wraps around and the
// inner countdown would start from -1 -- TODO confirm callers guarantee this.
// NOTE(review): the clobber list names no vector registers although
// zmm0..zmm24 and zmm31 are written, it lists r11 which the template never
// references, and rsi is loaded but never read. Confirm whether that is
// intentional.
2105 | void NOINLINE gemmkernel_11x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
2106 | asm volatile( |
// With the hack macro set, the first move is written in AT&T form (source
// first, %%-prefixed register) and the assembler is then switched to Intel
// syntax for the rest of the template. Without it the same move is written
// in Intel operand order. Either way r14 ends up holding gp.
2107 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
2108 | "mov %[gp], %%r14\t\n" |
2109 | ".intel_syntax noprefix\t\n" |
2110 | #else |
2111 | "mov r14, %[gp]\t\n" |
2112 | #endif |
2113 | |
2114 | // Copy parameters out of *gp (r14) into registers |
2115 | // k: r8 = k - 1, the number of k steps left after the first one |
2116 | "mov r8, [r14 + 0]\t\n" |
2117 | "dec r8\t\n" |
2118 | // A: r9 is the running A pointer |
2119 | "mov r9, [r14 + 8]\t\n" |
2120 | // B: r10 is the running B pointer |
2121 | "mov r10, [r14 + 16]\t\n" |
2122 | // beta: only the address is taken here; the value is loaded per outer iteration |
2123 | "lea r15, [r14 + 24]\t\n" |
2124 | // C: r12 is the running C row pointer |
2125 | "mov r12, [r14 + 32]\t\n" |
2126 | // ldc: added unscaled to r12, i.e. used as the byte distance between C rows |
2127 | "mov r13, [r14 + 40]\t\n" |
2128 | // b_block_cols: outer-loop bound, compared against rbx |
2129 | "mov rdi, [r14 + 48]\t\n" |
2130 | // b_block_size: loaded into rsi, which is not read again in this statement |
2131 | "mov rsi, [r14 + 56]\t\n" |
2132 | |
2133 | // Make copies of A and C: rax = start of A, rcx = start of the current C tile |
2134 | "mov rax, r9\t\n" |
2135 | "mov rcx, r12\t\n" |
2136 | |
// rbx is the outer-loop counter. %= expands to a number unique to this asm
// statement, which keeps these labels distinct from the other kernels' labels.
2137 | "xor ebx, ebx\t\n" |
2138 | "loop_outter%=:\t\n" |
// r14 is no longer needed as gp; it becomes the inner countdown (k - 1).
2139 | "mov r14, r8\t\n" |
// zmm31 = beta in every lane; zmm23/zmm24 = the first 2 x 16 values of B,
// converted from half to single precision.
2140 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
2141 | "vcvtph2ps zmm23,YMMWORD PTR [r10 + 0]\t\n" |
2142 | "vcvtph2ps zmm24,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0.0f. ZF is set when they are equal (vcomiss also sets
// ZF for an unordered compare, i.e. a NaN beta); in that case take the path
// that never reads C.
2143 | "vxorps xmm0, xmm0, xmm0\t\n" |
2144 | "vcomiss xmm31, xmm0\t\n" |
2145 | "jz zero_regs%=\t\n" |
2146 | |
2147 | // Setup values with beta multiplication: row i of the tile = beta * C row i |
2148 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
2149 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
2150 | "add r12, r13\t\n" |
2151 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
2152 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
2153 | "add r12, r13\t\n" |
2154 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
2155 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
2156 | "add r12, r13\t\n" |
2157 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
2158 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
2159 | "add r12, r13\t\n" |
2160 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
2161 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
2162 | "add r12, r13\t\n" |
2163 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
2164 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
2165 | "add r12, r13\t\n" |
2166 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
2167 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
2168 | "add r12, r13\t\n" |
2169 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
2170 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
2171 | "add r12, r13\t\n" |
2172 | "vmulps zmm16, zmm31, [r12 + 0]\t\n" |
2173 | "vmulps zmm17, zmm31, [r12 + 64]\t\n" |
2174 | "add r12, r13\t\n" |
2175 | "vmulps zmm18, zmm31, [r12 + 0]\t\n" |
2176 | "vmulps zmm19, zmm31, [r12 + 64]\t\n" |
2177 | "add r12, r13\t\n" |
2178 | "vmulps zmm20, zmm31, [r12 + 0]\t\n" |
2179 | "vmulps zmm21, zmm31, [r12 + 64]\t\n" |
// beta is no longer needed. Unless this is the only k step (r14 == 0),
// reuse zmm31 to preload the next 16 B values.
2180 | "test r14,r14\t\n" |
2181 | "jz skip_preload%=\t\n" |
2182 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2183 | "skip_preload%=:\t\n" |
// First k step: for each of the 11 rows, broadcast one A element into zmm22
// and accumulate zmm22 * B into that row's two registers.
2184 | "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n" |
2185 | "vfmadd231ps zmm0,zmm23,zmm22\t\n" |
2186 | "vfmadd231ps zmm1,zmm24,zmm22\t\n" |
2187 | "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n" |
2188 | "vfmadd231ps zmm2,zmm23,zmm22\t\n" |
2189 | "vfmadd231ps zmm3,zmm24,zmm22\t\n" |
2190 | "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n" |
2191 | "vfmadd231ps zmm4,zmm23,zmm22\t\n" |
2192 | "vfmadd231ps zmm5,zmm24,zmm22\t\n" |
2193 | "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n" |
2194 | "vfmadd231ps zmm6,zmm23,zmm22\t\n" |
2195 | "vfmadd231ps zmm7,zmm24,zmm22\t\n" |
2196 | "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n" |
2197 | "vfmadd231ps zmm8,zmm23,zmm22\t\n" |
2198 | "vfmadd231ps zmm9,zmm24,zmm22\t\n" |
2199 | "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n" |
2200 | "vfmadd231ps zmm10,zmm23,zmm22\t\n" |
2201 | "vfmadd231ps zmm11,zmm24,zmm22\t\n" |
2202 | "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n" |
2203 | "vfmadd231ps zmm12,zmm23,zmm22\t\n" |
2204 | "vfmadd231ps zmm13,zmm24,zmm22\t\n" |
2205 | "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n" |
2206 | "vfmadd231ps zmm14,zmm23,zmm22\t\n" |
2207 | "vfmadd231ps zmm15,zmm24,zmm22\t\n" |
2208 | "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n" |
2209 | "vfmadd231ps zmm16,zmm23,zmm22\t\n" |
2210 | "vfmadd231ps zmm17,zmm24,zmm22\t\n" |
2211 | "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n" |
2212 | "vfmadd231ps zmm18,zmm23,zmm22\t\n" |
2213 | "vfmadd231ps zmm19,zmm24,zmm22\t\n" |
2214 | "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n" |
2215 | "vfmadd231ps zmm20,zmm23,zmm22\t\n" |
2216 | "vfmadd231ps zmm21,zmm24,zmm22\t\n" |
// Rewind r12 to the top of the tile for the stores in dump_C. If more k
// steps remain, continue at next_inner; otherwise step B past this row and
// go straight to the stores.
2217 | "mov r12, rcx\t\n" |
2218 | "test r14,r14\t\n" |
2219 | "jnz next_inner%=\t\n" |
2220 | "add r10,64\t\n" |
2221 | "jmp dump_C%=\t\n" |
2222 | |
// beta == 0 path: same first k step, but vmulps writes the products directly
// into the accumulators, so C is not read.
// NOTE(review): the "add r12, r13" instructions in this path look redundant;
// r12 is not dereferenced here and is overwritten by "mov r12, rcx" below.
2223 | "zero_regs%=:\t\n" |
2224 | |
2225 | "test r14,r14\t\n" |
2226 | "jz skip_preload_b_zero%=\t\n" |
2227 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2228 | "skip_preload_b_zero%=:\t\n" |
2229 | "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n" |
2230 | "vmulps zmm0,zmm23,zmm22\t\n" |
2231 | "vmulps zmm1,zmm24,zmm22\t\n" |
2232 | "add r12, r13\t\n" |
2233 | "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n" |
2234 | "vmulps zmm2,zmm23,zmm22\t\n" |
2235 | "vmulps zmm3,zmm24,zmm22\t\n" |
2236 | "add r12, r13\t\n" |
2237 | "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n" |
2238 | "vmulps zmm4,zmm23,zmm22\t\n" |
2239 | "vmulps zmm5,zmm24,zmm22\t\n" |
2240 | "add r12, r13\t\n" |
2241 | "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n" |
2242 | "vmulps zmm6,zmm23,zmm22\t\n" |
2243 | "vmulps zmm7,zmm24,zmm22\t\n" |
2244 | "add r12, r13\t\n" |
2245 | "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n" |
2246 | "vmulps zmm8,zmm23,zmm22\t\n" |
2247 | "vmulps zmm9,zmm24,zmm22\t\n" |
2248 | "add r12, r13\t\n" |
2249 | "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n" |
2250 | "vmulps zmm10,zmm23,zmm22\t\n" |
2251 | "vmulps zmm11,zmm24,zmm22\t\n" |
2252 | "add r12, r13\t\n" |
2253 | "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n" |
2254 | "vmulps zmm12,zmm23,zmm22\t\n" |
2255 | "vmulps zmm13,zmm24,zmm22\t\n" |
2256 | "add r12, r13\t\n" |
2257 | "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n" |
2258 | "vmulps zmm14,zmm23,zmm22\t\n" |
2259 | "vmulps zmm15,zmm24,zmm22\t\n" |
2260 | "add r12, r13\t\n" |
2261 | "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n" |
2262 | "vmulps zmm16,zmm23,zmm22\t\n" |
2263 | "vmulps zmm17,zmm24,zmm22\t\n" |
2264 | "add r12, r13\t\n" |
2265 | "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n" |
2266 | "vmulps zmm18,zmm23,zmm22\t\n" |
2267 | "vmulps zmm19,zmm24,zmm22\t\n" |
2268 | "add r12, r13\t\n" |
2269 | "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n" |
2270 | "vmulps zmm20,zmm23,zmm22\t\n" |
2271 | "vmulps zmm21,zmm24,zmm22\t\n" |
2272 | "mov r12, rcx\t\n" |
2273 | "test r14,r14\t\n" |
2274 | "jnz next_inner%=\t\n" |
2275 | "add r10,64\t\n" |
2276 | "jmp dump_C%=\t\n" |
2277 | |
// Middle k steps: the values preloaded into zmm31 become the low half of B,
// the high half is converted from [r10 + 32], and the low half of the
// following step is preloaded from [r10 + 64].
2278 | "loop_inner%=:\t\n" |
2279 | |
2280 | "vmovaps zmm23,zmm31\t\n" |
2281 | "vcvtph2ps zmm24,YMMWORD PTR [r10 + 32]\t\n" |
2282 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2283 | "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n" |
2284 | "vfmadd231ps zmm0,zmm23,zmm22\t\n" |
2285 | "vfmadd231ps zmm1,zmm24,zmm22\t\n" |
2286 | "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n" |
2287 | "vfmadd231ps zmm2,zmm23,zmm22\t\n" |
2288 | "vfmadd231ps zmm3,zmm24,zmm22\t\n" |
2289 | "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n" |
2290 | "vfmadd231ps zmm4,zmm23,zmm22\t\n" |
2291 | "vfmadd231ps zmm5,zmm24,zmm22\t\n" |
2292 | "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n" |
2293 | "vfmadd231ps zmm6,zmm23,zmm22\t\n" |
2294 | "vfmadd231ps zmm7,zmm24,zmm22\t\n" |
2295 | "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n" |
2296 | "vfmadd231ps zmm8,zmm23,zmm22\t\n" |
2297 | "vfmadd231ps zmm9,zmm24,zmm22\t\n" |
2298 | "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n" |
2299 | "vfmadd231ps zmm10,zmm23,zmm22\t\n" |
2300 | "vfmadd231ps zmm11,zmm24,zmm22\t\n" |
2301 | "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n" |
2302 | "vfmadd231ps zmm12,zmm23,zmm22\t\n" |
2303 | "vfmadd231ps zmm13,zmm24,zmm22\t\n" |
2304 | "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n" |
2305 | "vfmadd231ps zmm14,zmm23,zmm22\t\n" |
2306 | "vfmadd231ps zmm15,zmm24,zmm22\t\n" |
2307 | "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n" |
2308 | "vfmadd231ps zmm16,zmm23,zmm22\t\n" |
2309 | "vfmadd231ps zmm17,zmm24,zmm22\t\n" |
2310 | "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n" |
2311 | "vfmadd231ps zmm18,zmm23,zmm22\t\n" |
2312 | "vfmadd231ps zmm19,zmm24,zmm22\t\n" |
2313 | "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n" |
2314 | "vfmadd231ps zmm20,zmm23,zmm22\t\n" |
2315 | "vfmadd231ps zmm21,zmm24,zmm22\t\n" |
2316 | |
// Advance A by 11 floats and B by 64 bytes, then count the step down. This
// label is also the entry point after the first k step. Falls through to
// the last step once r14 reaches zero.
2317 | "next_inner%=:\t\n" |
2318 | "add r9,44\t\n" |
2319 | "add r10,64\t\n" |
2320 | "dec r14\t\n" |
2321 | "jnz loop_inner%=\t\n" |
2322 | |
// Last k step: same as loop_inner, minus the [r10 + 64] preload.
2323 | "vmovaps zmm23,zmm31\t\n" |
2324 | "vcvtph2ps zmm24,YMMWORD PTR [r10 + 32]\t\n" |
2325 | "vbroadcastss zmm22,DWORD PTR [r9+0]\t\n" |
2326 | "vfmadd231ps zmm0,zmm23,zmm22\t\n" |
2327 | "vfmadd231ps zmm1,zmm24,zmm22\t\n" |
2328 | "vbroadcastss zmm22,DWORD PTR [r9+4]\t\n" |
2329 | "vfmadd231ps zmm2,zmm23,zmm22\t\n" |
2330 | "vfmadd231ps zmm3,zmm24,zmm22\t\n" |
2331 | "vbroadcastss zmm22,DWORD PTR [r9+8]\t\n" |
2332 | "vfmadd231ps zmm4,zmm23,zmm22\t\n" |
2333 | "vfmadd231ps zmm5,zmm24,zmm22\t\n" |
2334 | "vbroadcastss zmm22,DWORD PTR [r9+12]\t\n" |
2335 | "vfmadd231ps zmm6,zmm23,zmm22\t\n" |
2336 | "vfmadd231ps zmm7,zmm24,zmm22\t\n" |
2337 | "vbroadcastss zmm22,DWORD PTR [r9+16]\t\n" |
2338 | "vfmadd231ps zmm8,zmm23,zmm22\t\n" |
2339 | "vfmadd231ps zmm9,zmm24,zmm22\t\n" |
2340 | "vbroadcastss zmm22,DWORD PTR [r9+20]\t\n" |
2341 | "vfmadd231ps zmm10,zmm23,zmm22\t\n" |
2342 | "vfmadd231ps zmm11,zmm24,zmm22\t\n" |
2343 | "vbroadcastss zmm22,DWORD PTR [r9+24]\t\n" |
2344 | "vfmadd231ps zmm12,zmm23,zmm22\t\n" |
2345 | "vfmadd231ps zmm13,zmm24,zmm22\t\n" |
2346 | "vbroadcastss zmm22,DWORD PTR [r9+28]\t\n" |
2347 | "vfmadd231ps zmm14,zmm23,zmm22\t\n" |
2348 | "vfmadd231ps zmm15,zmm24,zmm22\t\n" |
2349 | "vbroadcastss zmm22,DWORD PTR [r9+32]\t\n" |
2350 | "vfmadd231ps zmm16,zmm23,zmm22\t\n" |
2351 | "vfmadd231ps zmm17,zmm24,zmm22\t\n" |
2352 | "vbroadcastss zmm22,DWORD PTR [r9+36]\t\n" |
2353 | "vfmadd231ps zmm18,zmm23,zmm22\t\n" |
2354 | "vfmadd231ps zmm19,zmm24,zmm22\t\n" |
2355 | "vbroadcastss zmm22,DWORD PTR [r9+40]\t\n" |
2356 | "vfmadd231ps zmm20,zmm23,zmm22\t\n" |
2357 | "vfmadd231ps zmm21,zmm24,zmm22\t\n" |
2358 | "add r9,44\t\n" |
2359 | "add r10,64\t\n" |
2360 | // Dump C: store the 11 rows of the tile (2 x 64 bytes each, unaligned stores) |
2361 | "dump_C%=:\t\n" |
2362 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
2363 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
2364 | "add r12, r13\t\n" |
2365 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
2366 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
2367 | "add r12, r13\t\n" |
2368 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
2369 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
2370 | "add r12, r13\t\n" |
2371 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
2372 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
2373 | "add r12, r13\t\n" |
2374 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
2375 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
2376 | "add r12, r13\t\n" |
2377 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
2378 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
2379 | "add r12, r13\t\n" |
2380 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
2381 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
2382 | "add r12, r13\t\n" |
2383 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
2384 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
2385 | "add r12, r13\t\n" |
2386 | "vmovups zmmword PTR [r12 + 0], zmm16\t\n" |
2387 | "vmovups zmmword PTR [r12 + 64], zmm17\t\n" |
2388 | "add r12, r13\t\n" |
2389 | "vmovups zmmword PTR [r12 + 0], zmm18\t\n" |
2390 | "vmovups zmmword PTR [r12 + 64], zmm19\t\n" |
2391 | "add r12, r13\t\n" |
2392 | "vmovups zmmword PTR [r12 + 0], zmm20\t\n" |
2393 | "vmovups zmmword PTR [r12 + 64], zmm21\t\n" |
2394 | |
2395 | // next outer iteration: move the C tile 128 bytes right, rewind A, count the block |
2396 | "add rcx, 128\t\n" |
2397 | "mov r12, rcx\t\n" |
2398 | "mov r9, rax\t\n" |
2399 | "inc rbx\t\n" |
2400 | "cmp rbx, rdi\t\n" |
2401 | "jl loop_outter%=\t\n" |
// No output operands. gp is the only input and may be supplied in a register
// or in memory. The clobber list covers general-purpose registers and memory.
2402 | : |
2403 | : [gp] "rm" (gp) |
2404 | : "r8" , |
2405 | "r9" , |
2406 | "r10" , |
2407 | "r11" , |
2408 | "r13" , |
2409 | "r14" , |
2410 | "rax" , |
2411 | "rcx" , |
2412 | "rsi" , |
2413 | "rdi" , |
2414 | "rbx" , |
2415 | "r12" , |
2416 | "r15" , |
2417 | "memory" ); |
2418 | } |
// gemmkernel_12x2_Avx512_fp16_fA0fB0fC0: the whole function body is a single
// inline-asm statement written in Intel syntax.
//
// What the assembly below does:
//  * gp is read at fixed byte offsets: +0 k, +8 A, +16 B, +24 beta (only its
//    address is taken; the value is loaded as one 32-bit float), +32 C,
//    +40 ldc, +48 b_block_cols, +56 b_block_size. The field names are the ones
//    used by the comments in the code; the struct is declared elsewhere.
//  * One outer iteration works on a tile of C that is 12 rows by 32 floats,
//    kept in zmm0..zmm23 (two registers per row). Consecutive rows are r13
//    bytes apart, because r13 (ldc) is added to the C pointer unscaled.
//  * Each k step reads 12 consecutive floats of A (r9 then advances by 48
//    bytes) and 64 bytes of B, which vcvtph2ps converts from half to single
//    precision into zmm25/zmm26 (r10 then advances by 64 bytes).
//  * zmm24 holds the A element that is currently broadcast. zmm31 first holds
//    beta and is afterwards reused to preload the next 32 bytes of B.
//  * When beta compares equal to 0.0f, C is never read and the tile starts
//    from the first product; otherwise the tile starts as beta * C.
//  * The outer loop body runs before its condition is tested, so it executes
//    at least once. It repeats while rbx < b_block_cols (signed compare),
//    moving the C tile 128 bytes to the right each time. A is rewound to its
//    start on every outer iteration; B keeps advancing.
//
// NOTE(review): assumes k >= 1. With k == 0 the "dec r8" wraps around and the
// inner countdown would start from -1 -- TODO confirm callers guarantee this.
// NOTE(review): the clobber list names no vector registers although
// zmm0..zmm26 and zmm31 are written, it lists r11 which the template never
// references, and rsi is loaded but never read. Confirm whether that is
// intentional.
2419 | void NOINLINE gemmkernel_12x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
2420 | asm volatile( |
// With the hack macro set, the first move is written in AT&T form (source
// first, %%-prefixed register) and the assembler is then switched to Intel
// syntax for the rest of the template. Without it the same move is written
// in Intel operand order. Either way r14 ends up holding gp.
2421 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
2422 | "mov %[gp], %%r14\t\n" |
2423 | ".intel_syntax noprefix\t\n" |
2424 | #else |
2425 | "mov r14, %[gp]\t\n" |
2426 | #endif |
2427 | |
2428 | // Copy parameters out of *gp (r14) into registers |
2429 | // k: r8 = k - 1, the number of k steps left after the first one |
2430 | "mov r8, [r14 + 0]\t\n" |
2431 | "dec r8\t\n" |
2432 | // A: r9 is the running A pointer |
2433 | "mov r9, [r14 + 8]\t\n" |
2434 | // B: r10 is the running B pointer |
2435 | "mov r10, [r14 + 16]\t\n" |
2436 | // beta: only the address is taken here; the value is loaded per outer iteration |
2437 | "lea r15, [r14 + 24]\t\n" |
2438 | // C: r12 is the running C row pointer |
2439 | "mov r12, [r14 + 32]\t\n" |
2440 | // ldc: added unscaled to r12, i.e. used as the byte distance between C rows |
2441 | "mov r13, [r14 + 40]\t\n" |
2442 | // b_block_cols: outer-loop bound, compared against rbx |
2443 | "mov rdi, [r14 + 48]\t\n" |
2444 | // b_block_size: loaded into rsi, which is not read again in this statement |
2445 | "mov rsi, [r14 + 56]\t\n" |
2446 | |
2447 | // Make copies of A and C: rax = start of A, rcx = start of the current C tile |
2448 | "mov rax, r9\t\n" |
2449 | "mov rcx, r12\t\n" |
2450 | |
// rbx is the outer-loop counter. %= expands to a number unique to this asm
// statement, which keeps these labels distinct from the other kernels' labels.
2451 | "xor ebx, ebx\t\n" |
2452 | "loop_outter%=:\t\n" |
// r14 is no longer needed as gp; it becomes the inner countdown (k - 1).
2453 | "mov r14, r8\t\n" |
// zmm31 = beta in every lane; zmm25/zmm26 = the first 2 x 16 values of B,
// converted from half to single precision.
2454 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
2455 | "vcvtph2ps zmm25,YMMWORD PTR [r10 + 0]\t\n" |
2456 | "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta with 0.0f. ZF is set when they are equal (vcomiss also sets
// ZF for an unordered compare, i.e. a NaN beta); in that case take the path
// that never reads C.
2457 | "vxorps xmm0, xmm0, xmm0\t\n" |
2458 | "vcomiss xmm31, xmm0\t\n" |
2459 | "jz zero_regs%=\t\n" |
2460 | |
2461 | // Setup values with beta multiplication: row i of the tile = beta * C row i |
2462 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
2463 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
2464 | "add r12, r13\t\n" |
2465 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
2466 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
2467 | "add r12, r13\t\n" |
2468 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
2469 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
2470 | "add r12, r13\t\n" |
2471 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
2472 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
2473 | "add r12, r13\t\n" |
2474 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
2475 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
2476 | "add r12, r13\t\n" |
2477 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
2478 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
2479 | "add r12, r13\t\n" |
2480 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
2481 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
2482 | "add r12, r13\t\n" |
2483 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
2484 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
2485 | "add r12, r13\t\n" |
2486 | "vmulps zmm16, zmm31, [r12 + 0]\t\n" |
2487 | "vmulps zmm17, zmm31, [r12 + 64]\t\n" |
2488 | "add r12, r13\t\n" |
2489 | "vmulps zmm18, zmm31, [r12 + 0]\t\n" |
2490 | "vmulps zmm19, zmm31, [r12 + 64]\t\n" |
2491 | "add r12, r13\t\n" |
2492 | "vmulps zmm20, zmm31, [r12 + 0]\t\n" |
2493 | "vmulps zmm21, zmm31, [r12 + 64]\t\n" |
2494 | "add r12, r13\t\n" |
2495 | "vmulps zmm22, zmm31, [r12 + 0]\t\n" |
2496 | "vmulps zmm23, zmm31, [r12 + 64]\t\n" |
// beta is no longer needed. Unless this is the only k step (r14 == 0),
// reuse zmm31 to preload the next 16 B values.
2497 | "test r14,r14\t\n" |
2498 | "jz skip_preload%=\t\n" |
2499 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2500 | "skip_preload%=:\t\n" |
// First k step: for each of the 12 rows, broadcast one A element into zmm24
// and accumulate zmm24 * B into that row's two registers.
2501 | "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n" |
2502 | "vfmadd231ps zmm0,zmm25,zmm24\t\n" |
2503 | "vfmadd231ps zmm1,zmm26,zmm24\t\n" |
2504 | "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n" |
2505 | "vfmadd231ps zmm2,zmm25,zmm24\t\n" |
2506 | "vfmadd231ps zmm3,zmm26,zmm24\t\n" |
2507 | "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n" |
2508 | "vfmadd231ps zmm4,zmm25,zmm24\t\n" |
2509 | "vfmadd231ps zmm5,zmm26,zmm24\t\n" |
2510 | "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n" |
2511 | "vfmadd231ps zmm6,zmm25,zmm24\t\n" |
2512 | "vfmadd231ps zmm7,zmm26,zmm24\t\n" |
2513 | "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n" |
2514 | "vfmadd231ps zmm8,zmm25,zmm24\t\n" |
2515 | "vfmadd231ps zmm9,zmm26,zmm24\t\n" |
2516 | "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n" |
2517 | "vfmadd231ps zmm10,zmm25,zmm24\t\n" |
2518 | "vfmadd231ps zmm11,zmm26,zmm24\t\n" |
2519 | "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n" |
2520 | "vfmadd231ps zmm12,zmm25,zmm24\t\n" |
2521 | "vfmadd231ps zmm13,zmm26,zmm24\t\n" |
2522 | "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n" |
2523 | "vfmadd231ps zmm14,zmm25,zmm24\t\n" |
2524 | "vfmadd231ps zmm15,zmm26,zmm24\t\n" |
2525 | "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n" |
2526 | "vfmadd231ps zmm16,zmm25,zmm24\t\n" |
2527 | "vfmadd231ps zmm17,zmm26,zmm24\t\n" |
2528 | "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n" |
2529 | "vfmadd231ps zmm18,zmm25,zmm24\t\n" |
2530 | "vfmadd231ps zmm19,zmm26,zmm24\t\n" |
2531 | "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n" |
2532 | "vfmadd231ps zmm20,zmm25,zmm24\t\n" |
2533 | "vfmadd231ps zmm21,zmm26,zmm24\t\n" |
2534 | "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n" |
2535 | "vfmadd231ps zmm22,zmm25,zmm24\t\n" |
2536 | "vfmadd231ps zmm23,zmm26,zmm24\t\n" |
// Rewind r12 to the top of the tile for the stores in dump_C. If more k
// steps remain, continue at next_inner; otherwise step B past this row and
// go straight to the stores.
2537 | "mov r12, rcx\t\n" |
2538 | "test r14,r14\t\n" |
2539 | "jnz next_inner%=\t\n" |
2540 | "add r10,64\t\n" |
2541 | "jmp dump_C%=\t\n" |
2542 | |
// beta == 0 path: same first k step, but vmulps writes the products directly
// into the accumulators, so C is not read.
// NOTE(review): the "add r12, r13" instructions in this path look redundant;
// r12 is not dereferenced here and is overwritten by "mov r12, rcx" below.
2543 | "zero_regs%=:\t\n" |
2544 | |
2545 | "test r14,r14\t\n" |
2546 | "jz skip_preload_b_zero%=\t\n" |
2547 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2548 | "skip_preload_b_zero%=:\t\n" |
2549 | "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n" |
2550 | "vmulps zmm0,zmm25,zmm24\t\n" |
2551 | "vmulps zmm1,zmm26,zmm24\t\n" |
2552 | "add r12, r13\t\n" |
2553 | "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n" |
2554 | "vmulps zmm2,zmm25,zmm24\t\n" |
2555 | "vmulps zmm3,zmm26,zmm24\t\n" |
2556 | "add r12, r13\t\n" |
2557 | "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n" |
2558 | "vmulps zmm4,zmm25,zmm24\t\n" |
2559 | "vmulps zmm5,zmm26,zmm24\t\n" |
2560 | "add r12, r13\t\n" |
2561 | "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n" |
2562 | "vmulps zmm6,zmm25,zmm24\t\n" |
2563 | "vmulps zmm7,zmm26,zmm24\t\n" |
2564 | "add r12, r13\t\n" |
2565 | "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n" |
2566 | "vmulps zmm8,zmm25,zmm24\t\n" |
2567 | "vmulps zmm9,zmm26,zmm24\t\n" |
2568 | "add r12, r13\t\n" |
2569 | "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n" |
2570 | "vmulps zmm10,zmm25,zmm24\t\n" |
2571 | "vmulps zmm11,zmm26,zmm24\t\n" |
2572 | "add r12, r13\t\n" |
2573 | "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n" |
2574 | "vmulps zmm12,zmm25,zmm24\t\n" |
2575 | "vmulps zmm13,zmm26,zmm24\t\n" |
2576 | "add r12, r13\t\n" |
2577 | "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n" |
2578 | "vmulps zmm14,zmm25,zmm24\t\n" |
2579 | "vmulps zmm15,zmm26,zmm24\t\n" |
2580 | "add r12, r13\t\n" |
2581 | "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n" |
2582 | "vmulps zmm16,zmm25,zmm24\t\n" |
2583 | "vmulps zmm17,zmm26,zmm24\t\n" |
2584 | "add r12, r13\t\n" |
2585 | "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n" |
2586 | "vmulps zmm18,zmm25,zmm24\t\n" |
2587 | "vmulps zmm19,zmm26,zmm24\t\n" |
2588 | "add r12, r13\t\n" |
2589 | "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n" |
2590 | "vmulps zmm20,zmm25,zmm24\t\n" |
2591 | "vmulps zmm21,zmm26,zmm24\t\n" |
2592 | "add r12, r13\t\n" |
2593 | "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n" |
2594 | "vmulps zmm22,zmm25,zmm24\t\n" |
2595 | "vmulps zmm23,zmm26,zmm24\t\n" |
2596 | "mov r12, rcx\t\n" |
2597 | "test r14,r14\t\n" |
2598 | "jnz next_inner%=\t\n" |
2599 | "add r10,64\t\n" |
2600 | "jmp dump_C%=\t\n" |
2601 | |
// Middle k steps: the values preloaded into zmm31 become the low half of B,
// the high half is converted from [r10 + 32], and the low half of the
// following step is preloaded from [r10 + 64].
2602 | "loop_inner%=:\t\n" |
2603 | |
2604 | "vmovaps zmm25,zmm31\t\n" |
2605 | "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n" |
2606 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2607 | "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n" |
2608 | "vfmadd231ps zmm0,zmm25,zmm24\t\n" |
2609 | "vfmadd231ps zmm1,zmm26,zmm24\t\n" |
2610 | "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n" |
2611 | "vfmadd231ps zmm2,zmm25,zmm24\t\n" |
2612 | "vfmadd231ps zmm3,zmm26,zmm24\t\n" |
2613 | "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n" |
2614 | "vfmadd231ps zmm4,zmm25,zmm24\t\n" |
2615 | "vfmadd231ps zmm5,zmm26,zmm24\t\n" |
2616 | "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n" |
2617 | "vfmadd231ps zmm6,zmm25,zmm24\t\n" |
2618 | "vfmadd231ps zmm7,zmm26,zmm24\t\n" |
2619 | "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n" |
2620 | "vfmadd231ps zmm8,zmm25,zmm24\t\n" |
2621 | "vfmadd231ps zmm9,zmm26,zmm24\t\n" |
2622 | "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n" |
2623 | "vfmadd231ps zmm10,zmm25,zmm24\t\n" |
2624 | "vfmadd231ps zmm11,zmm26,zmm24\t\n" |
2625 | "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n" |
2626 | "vfmadd231ps zmm12,zmm25,zmm24\t\n" |
2627 | "vfmadd231ps zmm13,zmm26,zmm24\t\n" |
2628 | "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n" |
2629 | "vfmadd231ps zmm14,zmm25,zmm24\t\n" |
2630 | "vfmadd231ps zmm15,zmm26,zmm24\t\n" |
2631 | "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n" |
2632 | "vfmadd231ps zmm16,zmm25,zmm24\t\n" |
2633 | "vfmadd231ps zmm17,zmm26,zmm24\t\n" |
2634 | "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n" |
2635 | "vfmadd231ps zmm18,zmm25,zmm24\t\n" |
2636 | "vfmadd231ps zmm19,zmm26,zmm24\t\n" |
2637 | "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n" |
2638 | "vfmadd231ps zmm20,zmm25,zmm24\t\n" |
2639 | "vfmadd231ps zmm21,zmm26,zmm24\t\n" |
2640 | "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n" |
2641 | "vfmadd231ps zmm22,zmm25,zmm24\t\n" |
2642 | "vfmadd231ps zmm23,zmm26,zmm24\t\n" |
2643 | |
// Advance A by 12 floats and B by 64 bytes, then count the step down. This
// label is also the entry point after the first k step. Falls through to
// the last step once r14 reaches zero.
2644 | "next_inner%=:\t\n" |
2645 | "add r9,48\t\n" |
2646 | "add r10,64\t\n" |
2647 | "dec r14\t\n" |
2648 | "jnz loop_inner%=\t\n" |
2649 | |
// Last k step: same as loop_inner, minus the [r10 + 64] preload.
2650 | "vmovaps zmm25,zmm31\t\n" |
2651 | "vcvtph2ps zmm26,YMMWORD PTR [r10 + 32]\t\n" |
2652 | "vbroadcastss zmm24,DWORD PTR [r9+0]\t\n" |
2653 | "vfmadd231ps zmm0,zmm25,zmm24\t\n" |
2654 | "vfmadd231ps zmm1,zmm26,zmm24\t\n" |
2655 | "vbroadcastss zmm24,DWORD PTR [r9+4]\t\n" |
2656 | "vfmadd231ps zmm2,zmm25,zmm24\t\n" |
2657 | "vfmadd231ps zmm3,zmm26,zmm24\t\n" |
2658 | "vbroadcastss zmm24,DWORD PTR [r9+8]\t\n" |
2659 | "vfmadd231ps zmm4,zmm25,zmm24\t\n" |
2660 | "vfmadd231ps zmm5,zmm26,zmm24\t\n" |
2661 | "vbroadcastss zmm24,DWORD PTR [r9+12]\t\n" |
2662 | "vfmadd231ps zmm6,zmm25,zmm24\t\n" |
2663 | "vfmadd231ps zmm7,zmm26,zmm24\t\n" |
2664 | "vbroadcastss zmm24,DWORD PTR [r9+16]\t\n" |
2665 | "vfmadd231ps zmm8,zmm25,zmm24\t\n" |
2666 | "vfmadd231ps zmm9,zmm26,zmm24\t\n" |
2667 | "vbroadcastss zmm24,DWORD PTR [r9+20]\t\n" |
2668 | "vfmadd231ps zmm10,zmm25,zmm24\t\n" |
2669 | "vfmadd231ps zmm11,zmm26,zmm24\t\n" |
2670 | "vbroadcastss zmm24,DWORD PTR [r9+24]\t\n" |
2671 | "vfmadd231ps zmm12,zmm25,zmm24\t\n" |
2672 | "vfmadd231ps zmm13,zmm26,zmm24\t\n" |
2673 | "vbroadcastss zmm24,DWORD PTR [r9+28]\t\n" |
2674 | "vfmadd231ps zmm14,zmm25,zmm24\t\n" |
2675 | "vfmadd231ps zmm15,zmm26,zmm24\t\n" |
2676 | "vbroadcastss zmm24,DWORD PTR [r9+32]\t\n" |
2677 | "vfmadd231ps zmm16,zmm25,zmm24\t\n" |
2678 | "vfmadd231ps zmm17,zmm26,zmm24\t\n" |
2679 | "vbroadcastss zmm24,DWORD PTR [r9+36]\t\n" |
2680 | "vfmadd231ps zmm18,zmm25,zmm24\t\n" |
2681 | "vfmadd231ps zmm19,zmm26,zmm24\t\n" |
2682 | "vbroadcastss zmm24,DWORD PTR [r9+40]\t\n" |
2683 | "vfmadd231ps zmm20,zmm25,zmm24\t\n" |
2684 | "vfmadd231ps zmm21,zmm26,zmm24\t\n" |
2685 | "vbroadcastss zmm24,DWORD PTR [r9+44]\t\n" |
2686 | "vfmadd231ps zmm22,zmm25,zmm24\t\n" |
2687 | "vfmadd231ps zmm23,zmm26,zmm24\t\n" |
2688 | "add r9,48\t\n" |
2689 | "add r10,64\t\n" |
2690 | // Dump C: store the 12 rows of the tile (2 x 64 bytes each, unaligned stores) |
2691 | "dump_C%=:\t\n" |
2692 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
2693 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
2694 | "add r12, r13\t\n" |
2695 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
2696 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
2697 | "add r12, r13\t\n" |
2698 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
2699 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
2700 | "add r12, r13\t\n" |
2701 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
2702 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
2703 | "add r12, r13\t\n" |
2704 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
2705 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
2706 | "add r12, r13\t\n" |
2707 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
2708 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
2709 | "add r12, r13\t\n" |
2710 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
2711 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
2712 | "add r12, r13\t\n" |
2713 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
2714 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
2715 | "add r12, r13\t\n" |
2716 | "vmovups zmmword PTR [r12 + 0], zmm16\t\n" |
2717 | "vmovups zmmword PTR [r12 + 64], zmm17\t\n" |
2718 | "add r12, r13\t\n" |
2719 | "vmovups zmmword PTR [r12 + 0], zmm18\t\n" |
2720 | "vmovups zmmword PTR [r12 + 64], zmm19\t\n" |
2721 | "add r12, r13\t\n" |
2722 | "vmovups zmmword PTR [r12 + 0], zmm20\t\n" |
2723 | "vmovups zmmword PTR [r12 + 64], zmm21\t\n" |
2724 | "add r12, r13\t\n" |
2725 | "vmovups zmmword PTR [r12 + 0], zmm22\t\n" |
2726 | "vmovups zmmword PTR [r12 + 64], zmm23\t\n" |
2727 | |
2728 | // next outer iteration: move the C tile 128 bytes right, rewind A, count the block |
2729 | "add rcx, 128\t\n" |
2730 | "mov r12, rcx\t\n" |
2731 | "mov r9, rax\t\n" |
2732 | "inc rbx\t\n" |
2733 | "cmp rbx, rdi\t\n" |
2734 | "jl loop_outter%=\t\n" |
// No output operands. gp is the only input and may be supplied in a register
// or in memory. The clobber list covers general-purpose registers and memory.
2735 | : |
2736 | : [gp] "rm" (gp) |
2737 | : "r8" , |
2738 | "r9" , |
2739 | "r10" , |
2740 | "r11" , |
2741 | "r13" , |
2742 | "r14" , |
2743 | "rax" , |
2744 | "rcx" , |
2745 | "rsi" , |
2746 | "rdi" , |
2747 | "rbx" , |
2748 | "r12" , |
2749 | "r15" , |
2750 | "memory" ); |
2751 | } |
// gemmkernel_13x2_Avx512_fp16_fA0fB0fC0
//
// AVX-512 micro-kernel implemented as a single inline-asm block. For every
// outer iteration it holds a 13-row x 2-zmm tile of C (13 x 32 floats) in
// zmm0..zmm25 and accumulates k steps into it. Each step multiplies 13 fp32
// scalars read from A (one vbroadcastss per row) with one 32-element row of B
// that is stored as fp16 and widened to fp32 with vcvtph2ps.
//
// gp: parameter block, read at fixed byte offsets:
//       +0 k, +8 A, +16 B, +24 beta (accessed through its address),
//       +32 C, +40 ldc, +48 b_block_cols, +56 b_block_size.
//
// Register roles inside the asm:
//   r8  = k - 1            r14 = per-outer-iteration countdown copy of r8
//   r9  = A cursor         rax = saved start of A (restored every outer iter)
//   r10 = B cursor (only ever advanced, 64 bytes per k step)
//   r12 = C row cursor     rcx = base of the current C tile
//   r13 = ldc, added to r12 as the byte distance between C rows
//   r15 = address of beta  rdi = b_block_cols   rbx = outer-loop counter
//   zmm27/zmm28 = current B row (low/high 16 floats)
//   zmm31 = beta broadcast, then reused as the preloaded next B low half
//   zmm26 = broadcast A element
//
// NOTE(review): the step sequencing below assumes k >= 1; with k == 0 the
// "dec r8" wraps and the countdown never reaches zero in a sane time.
// NOTE(review): the outer loop is bottom-tested, so its body runs once even
// if b_block_cols is <= 0.
2752 | void NOINLINE gemmkernel_13x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
2753 | asm volatile( |
// Both preprocessor branches load gp into r14. The first one does so with
// AT&T operand order and then switches the assembler to Intel syntax; the
// second one is already written in Intel syntax.
2754 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
2755 | "mov %[gp], %%r14\t\n" |
2756 | ".intel_syntax noprefix\t\n" |
2757 | #else |
2758 | "mov r14, %[gp]\t\n" |
2759 | #endif |
2760 | |
2761 | // Copy parameters |
2762 | // k |
2763 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is peeled off below, so only k - 1 remain.
2764 | "dec r8\t\n" |
2765 | // A |
2766 | "mov r9, [r14 + 8]\t\n" |
2767 | // B |
2768 | "mov r10, [r14 + 16]\t\n" |
2769 | // beta |
// lea, not mov: r15 holds the address of the beta field, which is read later
// with vbroadcastss.
2770 | "lea r15, [r14 + 24]\t\n" |
2771 | // C |
2772 | "mov r12, [r14 + 32]\t\n" |
2773 | // ldc |
2774 | "mov r13, [r14 + 40]\t\n" |
2775 | // b_block_cols |
2776 | "mov rdi, [r14 + 48]\t\n" |
2777 | // b_block_size |
// NOTE(review): rsi is loaded here but not referenced again in this asm block.
2778 | "mov rsi, [r14 + 56]\t\n" |
2779 | |
2780 | // Make copies of A and C |
2781 | "mov rax, r9\t\n" |
2782 | "mov rcx, r12\t\n" |
2783 | |
// rbx = outer-loop counter, starts at 0 (writing ebx zero-extends into rbx).
2784 | "xor ebx, ebx\t\n" |
2785 | "loop_outter%=:\t\n" |
// r14 was only needed to read gp; from here on it is the remaining-step
// countdown, reloaded with k - 1 for every outer iteration.
2786 | "mov r14, r8\t\n" |
// Broadcast beta and convert the first B row (2 x 16 fp16 -> 2 x 16 fp32).
2787 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
2788 | "vcvtph2ps zmm27,YMMWORD PTR [r10 + 0]\t\n" |
2789 | "vcvtph2ps zmm28,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta against 0.0. jz is taken when they compare equal; per the ISA
// definition vcomiss also sets ZF for an unordered (NaN) operand, so a NaN
// beta takes the zero_regs path as well.
2790 | "vxorps xmm0, xmm0, xmm0\t\n" |
2791 | "vcomiss xmm31, xmm0\t\n" |
2792 | "jz zero_regs%=\t\n" |
2793 | |
2794 | // Setup values with beta multiplication |
// beta != 0 path: accumulators start as beta * C. Each C row is 128 bytes
// (two zmm loads); r12 walks the 13 rows in steps of r13.
2795 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
2796 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
2797 | "add r12, r13\t\n" |
2798 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
2799 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
2800 | "add r12, r13\t\n" |
2801 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
2802 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
2803 | "add r12, r13\t\n" |
2804 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
2805 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
2806 | "add r12, r13\t\n" |
2807 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
2808 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
2809 | "add r12, r13\t\n" |
2810 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
2811 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
2812 | "add r12, r13\t\n" |
2813 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
2814 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
2815 | "add r12, r13\t\n" |
2816 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
2817 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
2818 | "add r12, r13\t\n" |
2819 | "vmulps zmm16, zmm31, [r12 + 0]\t\n" |
2820 | "vmulps zmm17, zmm31, [r12 + 64]\t\n" |
2821 | "add r12, r13\t\n" |
2822 | "vmulps zmm18, zmm31, [r12 + 0]\t\n" |
2823 | "vmulps zmm19, zmm31, [r12 + 64]\t\n" |
2824 | "add r12, r13\t\n" |
2825 | "vmulps zmm20, zmm31, [r12 + 0]\t\n" |
2826 | "vmulps zmm21, zmm31, [r12 + 64]\t\n" |
2827 | "add r12, r13\t\n" |
2828 | "vmulps zmm22, zmm31, [r12 + 0]\t\n" |
2829 | "vmulps zmm23, zmm31, [r12 + 64]\t\n" |
2830 | "add r12, r13\t\n" |
2831 | "vmulps zmm24, zmm31, [r12 + 0]\t\n" |
2832 | "vmulps zmm25, zmm31, [r12 + 64]\t\n" |
// beta is no longer needed, so zmm31 is recycled: when more k steps remain
// (r14 != 0) it is preloaded with the low half of the next B row. The preload
// is skipped for k == 1 so that nothing past the single B row is read.
2833 | "test r14,r14\t\n" |
2834 | "jz skip_preload%=\t\n" |
2835 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2836 | "skip_preload%=:\t\n" |
// First (peeled) k step: row i of the tile += A[i] * B row, for i = 0..12.
2837 | "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n" |
2838 | "vfmadd231ps zmm0,zmm27,zmm26\t\n" |
2839 | "vfmadd231ps zmm1,zmm28,zmm26\t\n" |
2840 | "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n" |
2841 | "vfmadd231ps zmm2,zmm27,zmm26\t\n" |
2842 | "vfmadd231ps zmm3,zmm28,zmm26\t\n" |
2843 | "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n" |
2844 | "vfmadd231ps zmm4,zmm27,zmm26\t\n" |
2845 | "vfmadd231ps zmm5,zmm28,zmm26\t\n" |
2846 | "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n" |
2847 | "vfmadd231ps zmm6,zmm27,zmm26\t\n" |
2848 | "vfmadd231ps zmm7,zmm28,zmm26\t\n" |
2849 | "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n" |
2850 | "vfmadd231ps zmm8,zmm27,zmm26\t\n" |
2851 | "vfmadd231ps zmm9,zmm28,zmm26\t\n" |
2852 | "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n" |
2853 | "vfmadd231ps zmm10,zmm27,zmm26\t\n" |
2854 | "vfmadd231ps zmm11,zmm28,zmm26\t\n" |
2855 | "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n" |
2856 | "vfmadd231ps zmm12,zmm27,zmm26\t\n" |
2857 | "vfmadd231ps zmm13,zmm28,zmm26\t\n" |
2858 | "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n" |
2859 | "vfmadd231ps zmm14,zmm27,zmm26\t\n" |
2860 | "vfmadd231ps zmm15,zmm28,zmm26\t\n" |
2861 | "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n" |
2862 | "vfmadd231ps zmm16,zmm27,zmm26\t\n" |
2863 | "vfmadd231ps zmm17,zmm28,zmm26\t\n" |
2864 | "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n" |
2865 | "vfmadd231ps zmm18,zmm27,zmm26\t\n" |
2866 | "vfmadd231ps zmm19,zmm28,zmm26\t\n" |
2867 | "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n" |
2868 | "vfmadd231ps zmm20,zmm27,zmm26\t\n" |
2869 | "vfmadd231ps zmm21,zmm28,zmm26\t\n" |
2870 | "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n" |
2871 | "vfmadd231ps zmm22,zmm27,zmm26\t\n" |
2872 | "vfmadd231ps zmm23,zmm28,zmm26\t\n" |
2873 | "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n" |
2874 | "vfmadd231ps zmm24,zmm27,zmm26\t\n" |
2875 | "vfmadd231ps zmm25,zmm28,zmm26\t\n" |
// Rewind the C cursor to the tile base for the store phase.
2876 | "mov r12, rcx\t\n" |
// More steps left -> advance cursors at next_inner. Otherwise (k == 1) step
// B past the single row consumed and go straight to the store.
2877 | "test r14,r14\t\n" |
2878 | "jnz next_inner%=\t\n" |
2879 | "add r10,64\t\n" |
2880 | "jmp dump_C%=\t\n" |
2881 | |
// beta == 0 path: C is not read at all; the accumulators are initialised
// directly with the product of the first k step (vmulps instead of FMA).
2882 | "zero_regs%=:\t\n" |
2883 | |
2884 | "test r14,r14\t\n" |
2885 | "jz skip_preload_b_zero%=\t\n" |
2886 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2887 | "skip_preload_b_zero%=:\t\n" |
2888 | "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n" |
2889 | "vmulps zmm0,zmm27,zmm26\t\n" |
2890 | "vmulps zmm1,zmm28,zmm26\t\n" |
// The "add r12, r13" lines in this path do not influence the result: no
// [r12] access happens before r12 is reloaded from rcx at the end of the path.
2891 | "add r12, r13\t\n" |
2892 | "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n" |
2893 | "vmulps zmm2,zmm27,zmm26\t\n" |
2894 | "vmulps zmm3,zmm28,zmm26\t\n" |
2895 | "add r12, r13\t\n" |
2896 | "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n" |
2897 | "vmulps zmm4,zmm27,zmm26\t\n" |
2898 | "vmulps zmm5,zmm28,zmm26\t\n" |
2899 | "add r12, r13\t\n" |
2900 | "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n" |
2901 | "vmulps zmm6,zmm27,zmm26\t\n" |
2902 | "vmulps zmm7,zmm28,zmm26\t\n" |
2903 | "add r12, r13\t\n" |
2904 | "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n" |
2905 | "vmulps zmm8,zmm27,zmm26\t\n" |
2906 | "vmulps zmm9,zmm28,zmm26\t\n" |
2907 | "add r12, r13\t\n" |
2908 | "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n" |
2909 | "vmulps zmm10,zmm27,zmm26\t\n" |
2910 | "vmulps zmm11,zmm28,zmm26\t\n" |
2911 | "add r12, r13\t\n" |
2912 | "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n" |
2913 | "vmulps zmm12,zmm27,zmm26\t\n" |
2914 | "vmulps zmm13,zmm28,zmm26\t\n" |
2915 | "add r12, r13\t\n" |
2916 | "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n" |
2917 | "vmulps zmm14,zmm27,zmm26\t\n" |
2918 | "vmulps zmm15,zmm28,zmm26\t\n" |
2919 | "add r12, r13\t\n" |
2920 | "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n" |
2921 | "vmulps zmm16,zmm27,zmm26\t\n" |
2922 | "vmulps zmm17,zmm28,zmm26\t\n" |
2923 | "add r12, r13\t\n" |
2924 | "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n" |
2925 | "vmulps zmm18,zmm27,zmm26\t\n" |
2926 | "vmulps zmm19,zmm28,zmm26\t\n" |
2927 | "add r12, r13\t\n" |
2928 | "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n" |
2929 | "vmulps zmm20,zmm27,zmm26\t\n" |
2930 | "vmulps zmm21,zmm28,zmm26\t\n" |
2931 | "add r12, r13\t\n" |
2932 | "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n" |
2933 | "vmulps zmm22,zmm27,zmm26\t\n" |
2934 | "vmulps zmm23,zmm28,zmm26\t\n" |
2935 | "add r12, r13\t\n" |
2936 | "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n" |
2937 | "vmulps zmm24,zmm27,zmm26\t\n" |
2938 | "vmulps zmm25,zmm28,zmm26\t\n" |
2939 | "mov r12, rcx\t\n" |
2940 | "test r14,r14\t\n" |
2941 | "jnz next_inner%=\t\n" |
2942 | "add r10,64\t\n" |
2943 | "jmp dump_C%=\t\n" |
2944 | |
// Steady-state k step. The low half of the current B row was preloaded into
// zmm31 by the previous step; the high half is converted now, and the low
// half of the following row is preloaded into zmm31 for the next step.
2945 | "loop_inner%=:\t\n" |
2946 | |
2947 | "vmovaps zmm27,zmm31\t\n" |
2948 | "vcvtph2ps zmm28,YMMWORD PTR [r10 + 32]\t\n" |
2949 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
2950 | "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n" |
2951 | "vfmadd231ps zmm0,zmm27,zmm26\t\n" |
2952 | "vfmadd231ps zmm1,zmm28,zmm26\t\n" |
2953 | "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n" |
2954 | "vfmadd231ps zmm2,zmm27,zmm26\t\n" |
2955 | "vfmadd231ps zmm3,zmm28,zmm26\t\n" |
2956 | "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n" |
2957 | "vfmadd231ps zmm4,zmm27,zmm26\t\n" |
2958 | "vfmadd231ps zmm5,zmm28,zmm26\t\n" |
2959 | "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n" |
2960 | "vfmadd231ps zmm6,zmm27,zmm26\t\n" |
2961 | "vfmadd231ps zmm7,zmm28,zmm26\t\n" |
2962 | "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n" |
2963 | "vfmadd231ps zmm8,zmm27,zmm26\t\n" |
2964 | "vfmadd231ps zmm9,zmm28,zmm26\t\n" |
2965 | "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n" |
2966 | "vfmadd231ps zmm10,zmm27,zmm26\t\n" |
2967 | "vfmadd231ps zmm11,zmm28,zmm26\t\n" |
2968 | "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n" |
2969 | "vfmadd231ps zmm12,zmm27,zmm26\t\n" |
2970 | "vfmadd231ps zmm13,zmm28,zmm26\t\n" |
2971 | "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n" |
2972 | "vfmadd231ps zmm14,zmm27,zmm26\t\n" |
2973 | "vfmadd231ps zmm15,zmm28,zmm26\t\n" |
2974 | "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n" |
2975 | "vfmadd231ps zmm16,zmm27,zmm26\t\n" |
2976 | "vfmadd231ps zmm17,zmm28,zmm26\t\n" |
2977 | "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n" |
2978 | "vfmadd231ps zmm18,zmm27,zmm26\t\n" |
2979 | "vfmadd231ps zmm19,zmm28,zmm26\t\n" |
2980 | "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n" |
2981 | "vfmadd231ps zmm20,zmm27,zmm26\t\n" |
2982 | "vfmadd231ps zmm21,zmm28,zmm26\t\n" |
2983 | "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n" |
2984 | "vfmadd231ps zmm22,zmm27,zmm26\t\n" |
2985 | "vfmadd231ps zmm23,zmm28,zmm26\t\n" |
2986 | "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n" |
2987 | "vfmadd231ps zmm24,zmm27,zmm26\t\n" |
2988 | "vfmadd231ps zmm25,zmm28,zmm26\t\n" |
2989 | |
// Advance to the next k step: A by 13 floats (52 bytes), B by one row of
// 32 fp16 values (64 bytes). Falls out of the loop when exactly one step
// is left, which is handled by the tail below.
2990 | "next_inner%=:\t\n" |
2991 | "add r9,52\t\n" |
2992 | "add r10,64\t\n" |
2993 | "dec r14\t\n" |
2994 | "jnz loop_inner%=\t\n" |
2995 | |
// Final k step: identical to loop_inner except that no further B row is
// preloaded, so nothing beyond the last row of this block is read.
2996 | "vmovaps zmm27,zmm31\t\n" |
2997 | "vcvtph2ps zmm28,YMMWORD PTR [r10 + 32]\t\n" |
2998 | "vbroadcastss zmm26,DWORD PTR [r9+0]\t\n" |
2999 | "vfmadd231ps zmm0,zmm27,zmm26\t\n" |
3000 | "vfmadd231ps zmm1,zmm28,zmm26\t\n" |
3001 | "vbroadcastss zmm26,DWORD PTR [r9+4]\t\n" |
3002 | "vfmadd231ps zmm2,zmm27,zmm26\t\n" |
3003 | "vfmadd231ps zmm3,zmm28,zmm26\t\n" |
3004 | "vbroadcastss zmm26,DWORD PTR [r9+8]\t\n" |
3005 | "vfmadd231ps zmm4,zmm27,zmm26\t\n" |
3006 | "vfmadd231ps zmm5,zmm28,zmm26\t\n" |
3007 | "vbroadcastss zmm26,DWORD PTR [r9+12]\t\n" |
3008 | "vfmadd231ps zmm6,zmm27,zmm26\t\n" |
3009 | "vfmadd231ps zmm7,zmm28,zmm26\t\n" |
3010 | "vbroadcastss zmm26,DWORD PTR [r9+16]\t\n" |
3011 | "vfmadd231ps zmm8,zmm27,zmm26\t\n" |
3012 | "vfmadd231ps zmm9,zmm28,zmm26\t\n" |
3013 | "vbroadcastss zmm26,DWORD PTR [r9+20]\t\n" |
3014 | "vfmadd231ps zmm10,zmm27,zmm26\t\n" |
3015 | "vfmadd231ps zmm11,zmm28,zmm26\t\n" |
3016 | "vbroadcastss zmm26,DWORD PTR [r9+24]\t\n" |
3017 | "vfmadd231ps zmm12,zmm27,zmm26\t\n" |
3018 | "vfmadd231ps zmm13,zmm28,zmm26\t\n" |
3019 | "vbroadcastss zmm26,DWORD PTR [r9+28]\t\n" |
3020 | "vfmadd231ps zmm14,zmm27,zmm26\t\n" |
3021 | "vfmadd231ps zmm15,zmm28,zmm26\t\n" |
3022 | "vbroadcastss zmm26,DWORD PTR [r9+32]\t\n" |
3023 | "vfmadd231ps zmm16,zmm27,zmm26\t\n" |
3024 | "vfmadd231ps zmm17,zmm28,zmm26\t\n" |
3025 | "vbroadcastss zmm26,DWORD PTR [r9+36]\t\n" |
3026 | "vfmadd231ps zmm18,zmm27,zmm26\t\n" |
3027 | "vfmadd231ps zmm19,zmm28,zmm26\t\n" |
3028 | "vbroadcastss zmm26,DWORD PTR [r9+40]\t\n" |
3029 | "vfmadd231ps zmm20,zmm27,zmm26\t\n" |
3030 | "vfmadd231ps zmm21,zmm28,zmm26\t\n" |
3031 | "vbroadcastss zmm26,DWORD PTR [r9+44]\t\n" |
3032 | "vfmadd231ps zmm22,zmm27,zmm26\t\n" |
3033 | "vfmadd231ps zmm23,zmm28,zmm26\t\n" |
3034 | "vbroadcastss zmm26,DWORD PTR [r9+48]\t\n" |
3035 | "vfmadd231ps zmm24,zmm27,zmm26\t\n" |
3036 | "vfmadd231ps zmm25,zmm28,zmm26\t\n" |
// r10 now points at the B rows of the next outer iteration. The r9 update is
// overwritten by "mov r9, rax" before A is read again.
3037 | "add r9,52\t\n" |
3038 | "add r10,64\t\n" |
3039 | // Dump C |
// Store the 13 x 32 float tile; r12 starts at the tile base (rcx) and moves
// down one row (r13 bytes) between row stores. Unaligned stores are used.
3040 | "dump_C%=:\t\n" |
3041 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
3042 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
3043 | "add r12, r13\t\n" |
3044 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
3045 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
3046 | "add r12, r13\t\n" |
3047 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
3048 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
3049 | "add r12, r13\t\n" |
3050 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
3051 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
3052 | "add r12, r13\t\n" |
3053 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
3054 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
3055 | "add r12, r13\t\n" |
3056 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
3057 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
3058 | "add r12, r13\t\n" |
3059 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
3060 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
3061 | "add r12, r13\t\n" |
3062 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
3063 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
3064 | "add r12, r13\t\n" |
3065 | "vmovups zmmword PTR [r12 + 0], zmm16\t\n" |
3066 | "vmovups zmmword PTR [r12 + 64], zmm17\t\n" |
3067 | "add r12, r13\t\n" |
3068 | "vmovups zmmword PTR [r12 + 0], zmm18\t\n" |
3069 | "vmovups zmmword PTR [r12 + 64], zmm19\t\n" |
3070 | "add r12, r13\t\n" |
3071 | "vmovups zmmword PTR [r12 + 0], zmm20\t\n" |
3072 | "vmovups zmmword PTR [r12 + 64], zmm21\t\n" |
3073 | "add r12, r13\t\n" |
3074 | "vmovups zmmword PTR [r12 + 0], zmm22\t\n" |
3075 | "vmovups zmmword PTR [r12 + 64], zmm23\t\n" |
3076 | "add r12, r13\t\n" |
3077 | "vmovups zmmword PTR [r12 + 0], zmm24\t\n" |
3078 | "vmovups zmmword PTR [r12 + 64], zmm25\t\n" |
3079 | |
3080 | // next outer iteration |
// Move the tile base 128 bytes (32 floats) to the right, rewind A to its
// start, and repeat while rbx < b_block_cols (signed compare). B is not
// rewound: r10 keeps advancing through consecutive rows.
3081 | "add rcx, 128\t\n" |
3082 | "mov r12, rcx\t\n" |
3083 | "mov r9, rax\t\n" |
3084 | "inc rbx\t\n" |
3085 | "cmp rbx, rdi\t\n" |
3086 | "jl loop_outter%=\t\n" |
3087 | : |
3088 | : [gp] "rm" (gp) |
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written above (zmm0-zmm28, zmm31) are not
// listed. r11 is listed although the asm body never references it.
3089 | : "r8" , |
3090 | "r9" , |
3091 | "r10" , |
3092 | "r11" , |
3093 | "r13" , |
3094 | "r14" , |
3095 | "rax" , |
3096 | "rcx" , |
3097 | "rsi" , |
3098 | "rdi" , |
3099 | "rbx" , |
3100 | "r12" , |
3101 | "r15" , |
3102 | "memory" ); |
3103 | } |
// gemmkernel_14x2_Avx512_fp16_fA0fB0fC0
//
// AVX-512 micro-kernel implemented as a single inline-asm block. For every
// outer iteration it holds a 14-row x 2-zmm tile of C (14 x 32 floats) in
// zmm0..zmm27 and accumulates k steps into it. Each step multiplies 14 fp32
// scalars read from A (one vbroadcastss per row) with one 32-element row of B
// that is stored as fp16 and widened to fp32 with vcvtph2ps.
//
// gp: parameter block, read at fixed byte offsets:
//       +0 k, +8 A, +16 B, +24 beta (accessed through its address),
//       +32 C, +40 ldc, +48 b_block_cols, +56 b_block_size.
//
// Register roles inside the asm:
//   r8  = k - 1            r14 = per-outer-iteration countdown copy of r8
//   r9  = A cursor         rax = saved start of A (restored every outer iter)
//   r10 = B cursor (only ever advanced, 64 bytes per k step)
//   r12 = C row cursor     rcx = base of the current C tile
//   r13 = ldc, added to r12 as the byte distance between C rows
//   r15 = address of beta  rdi = b_block_cols   rbx = outer-loop counter
//   zmm29/zmm30 = current B row (low/high 16 floats)
//   zmm31 = beta broadcast, then reused as the preloaded next B low half
//   zmm28 = broadcast A element
//
// NOTE(review): the step sequencing below assumes k >= 1; with k == 0 the
// "dec r8" wraps and the countdown never reaches zero in a sane time.
// NOTE(review): the outer loop is bottom-tested, so its body runs once even
// if b_block_cols is <= 0.
3104 | void NOINLINE gemmkernel_14x2_Avx512_fp16_fA0fB0fC0(GemmParamsFP16* gp) { |
3105 | asm volatile( |
// Both preprocessor branches load gp into r14. The first one does so with
// AT&T operand order and then switches the assembler to Intel syntax; the
// second one is already written in Intel syntax.
3106 | #if FBGEMM_USE_CLANG_INTEL_SYNTAX_ASM_HACK |
3107 | "mov %[gp], %%r14\t\n" |
3108 | ".intel_syntax noprefix\t\n" |
3109 | #else |
3110 | "mov r14, %[gp]\t\n" |
3111 | #endif |
3112 | |
3113 | // Copy parameters |
3114 | // k |
3115 | "mov r8, [r14 + 0]\t\n" |
// r8 = k - 1: the first k step is peeled off below, so only k - 1 remain.
3116 | "dec r8\t\n" |
3117 | // A |
3118 | "mov r9, [r14 + 8]\t\n" |
3119 | // B |
3120 | "mov r10, [r14 + 16]\t\n" |
3121 | // beta |
// lea, not mov: r15 holds the address of the beta field, which is read later
// with vbroadcastss.
3122 | "lea r15, [r14 + 24]\t\n" |
3123 | // C |
3124 | "mov r12, [r14 + 32]\t\n" |
3125 | // ldc |
3126 | "mov r13, [r14 + 40]\t\n" |
3127 | // b_block_cols |
3128 | "mov rdi, [r14 + 48]\t\n" |
3129 | // b_block_size |
// NOTE(review): rsi is loaded here but not referenced again in this asm block.
3130 | "mov rsi, [r14 + 56]\t\n" |
3131 | |
3132 | // Make copies of A and C |
3133 | "mov rax, r9\t\n" |
3134 | "mov rcx, r12\t\n" |
3135 | |
// rbx = outer-loop counter, starts at 0 (writing ebx zero-extends into rbx).
3136 | "xor ebx, ebx\t\n" |
3137 | "loop_outter%=:\t\n" |
// r14 was only needed to read gp; from here on it is the remaining-step
// countdown, reloaded with k - 1 for every outer iteration.
3138 | "mov r14, r8\t\n" |
// Broadcast beta and convert the first B row (2 x 16 fp16 -> 2 x 16 fp32).
3139 | "vbroadcastss zmm31,DWORD PTR [r15]\t\n" |
3140 | "vcvtph2ps zmm29,YMMWORD PTR [r10 + 0]\t\n" |
3141 | "vcvtph2ps zmm30,YMMWORD PTR [r10 + 32]\t\n" |
// Compare beta against 0.0. jz is taken when they compare equal; per the ISA
// definition vcomiss also sets ZF for an unordered (NaN) operand, so a NaN
// beta takes the zero_regs path as well.
3142 | "vxorps xmm0, xmm0, xmm0\t\n" |
3143 | "vcomiss xmm31, xmm0\t\n" |
3144 | "jz zero_regs%=\t\n" |
3145 | |
3146 | // Setup values with beta multiplication |
// beta != 0 path: accumulators start as beta * C. Each C row is 128 bytes
// (two zmm loads); r12 walks the 14 rows in steps of r13.
3147 | "vmulps zmm0, zmm31, [r12 + 0]\t\n" |
3148 | "vmulps zmm1, zmm31, [r12 + 64]\t\n" |
3149 | "add r12, r13\t\n" |
3150 | "vmulps zmm2, zmm31, [r12 + 0]\t\n" |
3151 | "vmulps zmm3, zmm31, [r12 + 64]\t\n" |
3152 | "add r12, r13\t\n" |
3153 | "vmulps zmm4, zmm31, [r12 + 0]\t\n" |
3154 | "vmulps zmm5, zmm31, [r12 + 64]\t\n" |
3155 | "add r12, r13\t\n" |
3156 | "vmulps zmm6, zmm31, [r12 + 0]\t\n" |
3157 | "vmulps zmm7, zmm31, [r12 + 64]\t\n" |
3158 | "add r12, r13\t\n" |
3159 | "vmulps zmm8, zmm31, [r12 + 0]\t\n" |
3160 | "vmulps zmm9, zmm31, [r12 + 64]\t\n" |
3161 | "add r12, r13\t\n" |
3162 | "vmulps zmm10, zmm31, [r12 + 0]\t\n" |
3163 | "vmulps zmm11, zmm31, [r12 + 64]\t\n" |
3164 | "add r12, r13\t\n" |
3165 | "vmulps zmm12, zmm31, [r12 + 0]\t\n" |
3166 | "vmulps zmm13, zmm31, [r12 + 64]\t\n" |
3167 | "add r12, r13\t\n" |
3168 | "vmulps zmm14, zmm31, [r12 + 0]\t\n" |
3169 | "vmulps zmm15, zmm31, [r12 + 64]\t\n" |
3170 | "add r12, r13\t\n" |
3171 | "vmulps zmm16, zmm31, [r12 + 0]\t\n" |
3172 | "vmulps zmm17, zmm31, [r12 + 64]\t\n" |
3173 | "add r12, r13\t\n" |
3174 | "vmulps zmm18, zmm31, [r12 + 0]\t\n" |
3175 | "vmulps zmm19, zmm31, [r12 + 64]\t\n" |
3176 | "add r12, r13\t\n" |
3177 | "vmulps zmm20, zmm31, [r12 + 0]\t\n" |
3178 | "vmulps zmm21, zmm31, [r12 + 64]\t\n" |
3179 | "add r12, r13\t\n" |
3180 | "vmulps zmm22, zmm31, [r12 + 0]\t\n" |
3181 | "vmulps zmm23, zmm31, [r12 + 64]\t\n" |
3182 | "add r12, r13\t\n" |
3183 | "vmulps zmm24, zmm31, [r12 + 0]\t\n" |
3184 | "vmulps zmm25, zmm31, [r12 + 64]\t\n" |
3185 | "add r12, r13\t\n" |
3186 | "vmulps zmm26, zmm31, [r12 + 0]\t\n" |
3187 | "vmulps zmm27, zmm31, [r12 + 64]\t\n" |
// beta is no longer needed, so zmm31 is recycled: when more k steps remain
// (r14 != 0) it is preloaded with the low half of the next B row. The preload
// is skipped for k == 1 so that nothing past the single B row is read.
3188 | "test r14,r14\t\n" |
3189 | "jz skip_preload%=\t\n" |
3190 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
3191 | "skip_preload%=:\t\n" |
// First (peeled) k step: row i of the tile += A[i] * B row, for i = 0..13.
3192 | "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n" |
3193 | "vfmadd231ps zmm0,zmm29,zmm28\t\n" |
3194 | "vfmadd231ps zmm1,zmm30,zmm28\t\n" |
3195 | "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n" |
3196 | "vfmadd231ps zmm2,zmm29,zmm28\t\n" |
3197 | "vfmadd231ps zmm3,zmm30,zmm28\t\n" |
3198 | "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n" |
3199 | "vfmadd231ps zmm4,zmm29,zmm28\t\n" |
3200 | "vfmadd231ps zmm5,zmm30,zmm28\t\n" |
3201 | "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n" |
3202 | "vfmadd231ps zmm6,zmm29,zmm28\t\n" |
3203 | "vfmadd231ps zmm7,zmm30,zmm28\t\n" |
3204 | "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n" |
3205 | "vfmadd231ps zmm8,zmm29,zmm28\t\n" |
3206 | "vfmadd231ps zmm9,zmm30,zmm28\t\n" |
3207 | "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n" |
3208 | "vfmadd231ps zmm10,zmm29,zmm28\t\n" |
3209 | "vfmadd231ps zmm11,zmm30,zmm28\t\n" |
3210 | "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n" |
3211 | "vfmadd231ps zmm12,zmm29,zmm28\t\n" |
3212 | "vfmadd231ps zmm13,zmm30,zmm28\t\n" |
3213 | "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n" |
3214 | "vfmadd231ps zmm14,zmm29,zmm28\t\n" |
3215 | "vfmadd231ps zmm15,zmm30,zmm28\t\n" |
3216 | "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n" |
3217 | "vfmadd231ps zmm16,zmm29,zmm28\t\n" |
3218 | "vfmadd231ps zmm17,zmm30,zmm28\t\n" |
3219 | "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n" |
3220 | "vfmadd231ps zmm18,zmm29,zmm28\t\n" |
3221 | "vfmadd231ps zmm19,zmm30,zmm28\t\n" |
3222 | "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n" |
3223 | "vfmadd231ps zmm20,zmm29,zmm28\t\n" |
3224 | "vfmadd231ps zmm21,zmm30,zmm28\t\n" |
3225 | "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n" |
3226 | "vfmadd231ps zmm22,zmm29,zmm28\t\n" |
3227 | "vfmadd231ps zmm23,zmm30,zmm28\t\n" |
3228 | "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n" |
3229 | "vfmadd231ps zmm24,zmm29,zmm28\t\n" |
3230 | "vfmadd231ps zmm25,zmm30,zmm28\t\n" |
3231 | "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n" |
3232 | "vfmadd231ps zmm26,zmm29,zmm28\t\n" |
3233 | "vfmadd231ps zmm27,zmm30,zmm28\t\n" |
// Rewind the C cursor to the tile base for the store phase.
3234 | "mov r12, rcx\t\n" |
// More steps left -> advance cursors at next_inner. Otherwise (k == 1) step
// B past the single row consumed and go straight to the store.
3235 | "test r14,r14\t\n" |
3236 | "jnz next_inner%=\t\n" |
3237 | "add r10,64\t\n" |
3238 | "jmp dump_C%=\t\n" |
3239 | |
// beta == 0 path: C is not read at all; the accumulators are initialised
// directly with the product of the first k step (vmulps instead of FMA).
3240 | "zero_regs%=:\t\n" |
3241 | |
3242 | "test r14,r14\t\n" |
3243 | "jz skip_preload_b_zero%=\t\n" |
3244 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
3245 | "skip_preload_b_zero%=:\t\n" |
3246 | "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n" |
3247 | "vmulps zmm0,zmm29,zmm28\t\n" |
3248 | "vmulps zmm1,zmm30,zmm28\t\n" |
// The "add r12, r13" lines in this path do not influence the result: no
// [r12] access happens before r12 is reloaded from rcx at the end of the path.
3249 | "add r12, r13\t\n" |
3250 | "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n" |
3251 | "vmulps zmm2,zmm29,zmm28\t\n" |
3252 | "vmulps zmm3,zmm30,zmm28\t\n" |
3253 | "add r12, r13\t\n" |
3254 | "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n" |
3255 | "vmulps zmm4,zmm29,zmm28\t\n" |
3256 | "vmulps zmm5,zmm30,zmm28\t\n" |
3257 | "add r12, r13\t\n" |
3258 | "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n" |
3259 | "vmulps zmm6,zmm29,zmm28\t\n" |
3260 | "vmulps zmm7,zmm30,zmm28\t\n" |
3261 | "add r12, r13\t\n" |
3262 | "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n" |
3263 | "vmulps zmm8,zmm29,zmm28\t\n" |
3264 | "vmulps zmm9,zmm30,zmm28\t\n" |
3265 | "add r12, r13\t\n" |
3266 | "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n" |
3267 | "vmulps zmm10,zmm29,zmm28\t\n" |
3268 | "vmulps zmm11,zmm30,zmm28\t\n" |
3269 | "add r12, r13\t\n" |
3270 | "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n" |
3271 | "vmulps zmm12,zmm29,zmm28\t\n" |
3272 | "vmulps zmm13,zmm30,zmm28\t\n" |
3273 | "add r12, r13\t\n" |
3274 | "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n" |
3275 | "vmulps zmm14,zmm29,zmm28\t\n" |
3276 | "vmulps zmm15,zmm30,zmm28\t\n" |
3277 | "add r12, r13\t\n" |
3278 | "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n" |
3279 | "vmulps zmm16,zmm29,zmm28\t\n" |
3280 | "vmulps zmm17,zmm30,zmm28\t\n" |
3281 | "add r12, r13\t\n" |
3282 | "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n" |
3283 | "vmulps zmm18,zmm29,zmm28\t\n" |
3284 | "vmulps zmm19,zmm30,zmm28\t\n" |
3285 | "add r12, r13\t\n" |
3286 | "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n" |
3287 | "vmulps zmm20,zmm29,zmm28\t\n" |
3288 | "vmulps zmm21,zmm30,zmm28\t\n" |
3289 | "add r12, r13\t\n" |
3290 | "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n" |
3291 | "vmulps zmm22,zmm29,zmm28\t\n" |
3292 | "vmulps zmm23,zmm30,zmm28\t\n" |
3293 | "add r12, r13\t\n" |
3294 | "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n" |
3295 | "vmulps zmm24,zmm29,zmm28\t\n" |
3296 | "vmulps zmm25,zmm30,zmm28\t\n" |
3297 | "add r12, r13\t\n" |
3298 | "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n" |
3299 | "vmulps zmm26,zmm29,zmm28\t\n" |
3300 | "vmulps zmm27,zmm30,zmm28\t\n" |
3301 | "mov r12, rcx\t\n" |
3302 | "test r14,r14\t\n" |
3303 | "jnz next_inner%=\t\n" |
3304 | "add r10,64\t\n" |
3305 | "jmp dump_C%=\t\n" |
3306 | |
// Steady-state k step. The low half of the current B row was preloaded into
// zmm31 by the previous step; the high half is converted now, and the low
// half of the following row is preloaded into zmm31 for the next step.
3307 | "loop_inner%=:\t\n" |
3308 | |
3309 | "vmovaps zmm29,zmm31\t\n" |
3310 | "vcvtph2ps zmm30,YMMWORD PTR [r10 + 32]\t\n" |
3311 | "vcvtph2ps zmm31,YMMWORD PTR [r10 + 64]\t\n" |
3312 | "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n" |
3313 | "vfmadd231ps zmm0,zmm29,zmm28\t\n" |
3314 | "vfmadd231ps zmm1,zmm30,zmm28\t\n" |
3315 | "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n" |
3316 | "vfmadd231ps zmm2,zmm29,zmm28\t\n" |
3317 | "vfmadd231ps zmm3,zmm30,zmm28\t\n" |
3318 | "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n" |
3319 | "vfmadd231ps zmm4,zmm29,zmm28\t\n" |
3320 | "vfmadd231ps zmm5,zmm30,zmm28\t\n" |
3321 | "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n" |
3322 | "vfmadd231ps zmm6,zmm29,zmm28\t\n" |
3323 | "vfmadd231ps zmm7,zmm30,zmm28\t\n" |
3324 | "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n" |
3325 | "vfmadd231ps zmm8,zmm29,zmm28\t\n" |
3326 | "vfmadd231ps zmm9,zmm30,zmm28\t\n" |
3327 | "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n" |
3328 | "vfmadd231ps zmm10,zmm29,zmm28\t\n" |
3329 | "vfmadd231ps zmm11,zmm30,zmm28\t\n" |
3330 | "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n" |
3331 | "vfmadd231ps zmm12,zmm29,zmm28\t\n" |
3332 | "vfmadd231ps zmm13,zmm30,zmm28\t\n" |
3333 | "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n" |
3334 | "vfmadd231ps zmm14,zmm29,zmm28\t\n" |
3335 | "vfmadd231ps zmm15,zmm30,zmm28\t\n" |
3336 | "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n" |
3337 | "vfmadd231ps zmm16,zmm29,zmm28\t\n" |
3338 | "vfmadd231ps zmm17,zmm30,zmm28\t\n" |
3339 | "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n" |
3340 | "vfmadd231ps zmm18,zmm29,zmm28\t\n" |
3341 | "vfmadd231ps zmm19,zmm30,zmm28\t\n" |
3342 | "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n" |
3343 | "vfmadd231ps zmm20,zmm29,zmm28\t\n" |
3344 | "vfmadd231ps zmm21,zmm30,zmm28\t\n" |
3345 | "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n" |
3346 | "vfmadd231ps zmm22,zmm29,zmm28\t\n" |
3347 | "vfmadd231ps zmm23,zmm30,zmm28\t\n" |
3348 | "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n" |
3349 | "vfmadd231ps zmm24,zmm29,zmm28\t\n" |
3350 | "vfmadd231ps zmm25,zmm30,zmm28\t\n" |
3351 | "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n" |
3352 | "vfmadd231ps zmm26,zmm29,zmm28\t\n" |
3353 | "vfmadd231ps zmm27,zmm30,zmm28\t\n" |
3354 | |
// Advance to the next k step: A by 14 floats (56 bytes), B by one row of
// 32 fp16 values (64 bytes). Falls out of the loop when exactly one step
// is left, which is handled by the tail below.
3355 | "next_inner%=:\t\n" |
3356 | "add r9,56\t\n" |
3357 | "add r10,64\t\n" |
3358 | "dec r14\t\n" |
3359 | "jnz loop_inner%=\t\n" |
3360 | |
// Final k step: identical to loop_inner except that no further B row is
// preloaded, so nothing beyond the last row of this block is read.
3361 | "vmovaps zmm29,zmm31\t\n" |
3362 | "vcvtph2ps zmm30,YMMWORD PTR [r10 + 32]\t\n" |
3363 | "vbroadcastss zmm28,DWORD PTR [r9+0]\t\n" |
3364 | "vfmadd231ps zmm0,zmm29,zmm28\t\n" |
3365 | "vfmadd231ps zmm1,zmm30,zmm28\t\n" |
3366 | "vbroadcastss zmm28,DWORD PTR [r9+4]\t\n" |
3367 | "vfmadd231ps zmm2,zmm29,zmm28\t\n" |
3368 | "vfmadd231ps zmm3,zmm30,zmm28\t\n" |
3369 | "vbroadcastss zmm28,DWORD PTR [r9+8]\t\n" |
3370 | "vfmadd231ps zmm4,zmm29,zmm28\t\n" |
3371 | "vfmadd231ps zmm5,zmm30,zmm28\t\n" |
3372 | "vbroadcastss zmm28,DWORD PTR [r9+12]\t\n" |
3373 | "vfmadd231ps zmm6,zmm29,zmm28\t\n" |
3374 | "vfmadd231ps zmm7,zmm30,zmm28\t\n" |
3375 | "vbroadcastss zmm28,DWORD PTR [r9+16]\t\n" |
3376 | "vfmadd231ps zmm8,zmm29,zmm28\t\n" |
3377 | "vfmadd231ps zmm9,zmm30,zmm28\t\n" |
3378 | "vbroadcastss zmm28,DWORD PTR [r9+20]\t\n" |
3379 | "vfmadd231ps zmm10,zmm29,zmm28\t\n" |
3380 | "vfmadd231ps zmm11,zmm30,zmm28\t\n" |
3381 | "vbroadcastss zmm28,DWORD PTR [r9+24]\t\n" |
3382 | "vfmadd231ps zmm12,zmm29,zmm28\t\n" |
3383 | "vfmadd231ps zmm13,zmm30,zmm28\t\n" |
3384 | "vbroadcastss zmm28,DWORD PTR [r9+28]\t\n" |
3385 | "vfmadd231ps zmm14,zmm29,zmm28\t\n" |
3386 | "vfmadd231ps zmm15,zmm30,zmm28\t\n" |
3387 | "vbroadcastss zmm28,DWORD PTR [r9+32]\t\n" |
3388 | "vfmadd231ps zmm16,zmm29,zmm28\t\n" |
3389 | "vfmadd231ps zmm17,zmm30,zmm28\t\n" |
3390 | "vbroadcastss zmm28,DWORD PTR [r9+36]\t\n" |
3391 | "vfmadd231ps zmm18,zmm29,zmm28\t\n" |
3392 | "vfmadd231ps zmm19,zmm30,zmm28\t\n" |
3393 | "vbroadcastss zmm28,DWORD PTR [r9+40]\t\n" |
3394 | "vfmadd231ps zmm20,zmm29,zmm28\t\n" |
3395 | "vfmadd231ps zmm21,zmm30,zmm28\t\n" |
3396 | "vbroadcastss zmm28,DWORD PTR [r9+44]\t\n" |
3397 | "vfmadd231ps zmm22,zmm29,zmm28\t\n" |
3398 | "vfmadd231ps zmm23,zmm30,zmm28\t\n" |
3399 | "vbroadcastss zmm28,DWORD PTR [r9+48]\t\n" |
3400 | "vfmadd231ps zmm24,zmm29,zmm28\t\n" |
3401 | "vfmadd231ps zmm25,zmm30,zmm28\t\n" |
3402 | "vbroadcastss zmm28,DWORD PTR [r9+52]\t\n" |
3403 | "vfmadd231ps zmm26,zmm29,zmm28\t\n" |
3404 | "vfmadd231ps zmm27,zmm30,zmm28\t\n" |
// r10 now points at the B rows of the next outer iteration. The r9 update is
// overwritten by "mov r9, rax" before A is read again.
3405 | "add r9,56\t\n" |
3406 | "add r10,64\t\n" |
3407 | // Dump C |
// Store the 14 x 32 float tile; r12 starts at the tile base (rcx) and moves
// down one row (r13 bytes) between row stores. Unaligned stores are used.
3408 | "dump_C%=:\t\n" |
3409 | "vmovups zmmword PTR [r12 + 0], zmm0\t\n" |
3410 | "vmovups zmmword PTR [r12 + 64], zmm1\t\n" |
3411 | "add r12, r13\t\n" |
3412 | "vmovups zmmword PTR [r12 + 0], zmm2\t\n" |
3413 | "vmovups zmmword PTR [r12 + 64], zmm3\t\n" |
3414 | "add r12, r13\t\n" |
3415 | "vmovups zmmword PTR [r12 + 0], zmm4\t\n" |
3416 | "vmovups zmmword PTR [r12 + 64], zmm5\t\n" |
3417 | "add r12, r13\t\n" |
3418 | "vmovups zmmword PTR [r12 + 0], zmm6\t\n" |
3419 | "vmovups zmmword PTR [r12 + 64], zmm7\t\n" |
3420 | "add r12, r13\t\n" |
3421 | "vmovups zmmword PTR [r12 + 0], zmm8\t\n" |
3422 | "vmovups zmmword PTR [r12 + 64], zmm9\t\n" |
3423 | "add r12, r13\t\n" |
3424 | "vmovups zmmword PTR [r12 + 0], zmm10\t\n" |
3425 | "vmovups zmmword PTR [r12 + 64], zmm11\t\n" |
3426 | "add r12, r13\t\n" |
3427 | "vmovups zmmword PTR [r12 + 0], zmm12\t\n" |
3428 | "vmovups zmmword PTR [r12 + 64], zmm13\t\n" |
3429 | "add r12, r13\t\n" |
3430 | "vmovups zmmword PTR [r12 + 0], zmm14\t\n" |
3431 | "vmovups zmmword PTR [r12 + 64], zmm15\t\n" |
3432 | "add r12, r13\t\n" |
3433 | "vmovups zmmword PTR [r12 + 0], zmm16\t\n" |
3434 | "vmovups zmmword PTR [r12 + 64], zmm17\t\n" |
3435 | "add r12, r13\t\n" |
3436 | "vmovups zmmword PTR [r12 + 0], zmm18\t\n" |
3437 | "vmovups zmmword PTR [r12 + 64], zmm19\t\n" |
3438 | "add r12, r13\t\n" |
3439 | "vmovups zmmword PTR [r12 + 0], zmm20\t\n" |
3440 | "vmovups zmmword PTR [r12 + 64], zmm21\t\n" |
3441 | "add r12, r13\t\n" |
3442 | "vmovups zmmword PTR [r12 + 0], zmm22\t\n" |
3443 | "vmovups zmmword PTR [r12 + 64], zmm23\t\n" |
3444 | "add r12, r13\t\n" |
3445 | "vmovups zmmword PTR [r12 + 0], zmm24\t\n" |
3446 | "vmovups zmmword PTR [r12 + 64], zmm25\t\n" |
3447 | "add r12, r13\t\n" |
3448 | "vmovups zmmword PTR [r12 + 0], zmm26\t\n" |
3449 | "vmovups zmmword PTR [r12 + 64], zmm27\t\n" |
3450 | |
3451 | // next outer iteration |
// Move the tile base 128 bytes (32 floats) to the right, rewind A to its
// start, and repeat while rbx < b_block_cols (signed compare). B is not
// rewound: r10 keeps advancing through consecutive rows.
3452 | "add rcx, 128\t\n" |
3453 | "mov r12, rcx\t\n" |
3454 | "mov r9, rax\t\n" |
3455 | "inc rbx\t\n" |
3456 | "cmp rbx, rdi\t\n" |
3457 | "jl loop_outter%=\t\n" |
3458 | : |
3459 | : [gp] "rm" (gp) |
// NOTE(review): only general-purpose registers and "memory" are declared as
// clobbers; the zmm registers written above (zmm0-zmm31) are not listed.
// r11 is listed although the asm body never references it.
3460 | : "r8" , |
3461 | "r9" , |
3462 | "r10" , |
3463 | "r11" , |
3464 | "r13" , |
3465 | "r14" , |
3466 | "rax" , |
3467 | "rcx" , |
3468 | "rsi" , |
3469 | "rdi" , |
3470 | "rbx" , |
3471 | "r12" , |
3472 | "r15" , |
3473 | "memory" ); |
3474 | } |
3475 | |
3476 | } // namespace fbgemm |
3477 | |