1// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// kernel_SSE.h: a collection of Intel SSE optimized kernels.
16// Check in kernel_default.h which one(s) are actually used by default.
17// Others are mere experiments; they are still covered by tests
18// in case they might be useful some day.
19//
20
21#ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_
22#define GEMMLOWP_INTERNAL_KERNEL_SSE_H_
23
24#include "kernel.h"
25
26#include <string.h>
27#include <cassert>
28
29namespace gemmlowp {
30
31#ifdef GEMMLOWP_SSE4_32
32struct SSE4_32_Kernel4x4Depth2 : KernelBase {
33 typedef KernelFormat<
34 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>,
35 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
36 Format;
37
38 const char* Name() const override { return "SSE, 4x4, depth 2"; }
39
40 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
41 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
42 const std::uint8_t* rhs_ptr, std::size_t start_depth,
43 std::size_t run_depth) const override {
44 ScopedProfilingLabel label("optimized kernel");
45 assert(dst_row_stride == 1);
46 (void)dst_row_stride;
47 std::int32_t run_depth_cells = run_depth / Format::kDepth;
48 /* Main loop */
49
50 // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
51 // A 4x2 block Lhs is stored in 16bit in xmm0.
52 // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7.
53 //
54 // +-------+-------+-------+-------+
55 // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
56 // Rhs +-------+---------------+-------+
57 // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
58 // +-------+-------+-------+-------+
59 //
60 // | | | | |
61 //
62 // Lhs | | | | |
63 //
64 // +--+--+ - - - - +-------+-------+-------+-------+
65 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 |
66 // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 |
67 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 |
68 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 |
69 // +--+--+ - - - - +-------+-------+-------+-------+
70 //
71 // Accumulator
72
73 asm volatile(
74
75 // set accumulators to zero.
76 "pxor %%xmm4 , %%xmm4 \n\t"
77 "pxor %%xmm5 , %%xmm5 \n\t"
78 "pxor %%xmm6 , %%xmm6 \n\t"
79 "pxor %%xmm7 , %%xmm7 \n\t"
80
81 "movl %[run_depth_cells], %%eax\n\t"
82 "subl $2, %%eax\n\t"
83 "js outerLoop1%=\n\t"
84
85 // Loop for K unrolled by 4
86 "outerLoop2%=:\n\t"
87
88 // K = 1,2
89 // RHS cell to xmm1
90 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
91
92 // LHS cell
93 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
94 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
95 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
96 "pmaddwd %%xmm0, %%xmm2 \n\t"
97 "pmaddwd %%xmm0, %%xmm3 \n\t"
98 "paddd %%xmm2, %%xmm4 \n\t"
99 "paddd %%xmm3, %%xmm5 \n\t"
100
101 "prefetcht0 0x80(%[lhs_ptr]) \n\t"
102
103 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
104 "pmaddwd %%xmm0, %%xmm2 \n\t"
105 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
106 "pmaddwd %%xmm0, %%xmm3 \n\t"
107
108 "prefetcht0 0x80(%[rhs_ptr]) \n\t"
109
110 // K = 3,4
111 // RHS cell to xmm1
112 "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
113
114 "paddd %%xmm2, %%xmm6 \n\t"
115 "paddd %%xmm3, %%xmm7 \n\t"
116
117 // LHS cell
118 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
119 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
120 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
121 "pmaddwd %%xmm0, %%xmm2 \n\t"
122 "pmaddwd %%xmm0, %%xmm3 \n\t"
123 "paddd %%xmm2, %%xmm4 \n\t"
124 "paddd %%xmm3, %%xmm5 \n\t"
125 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
126 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
127
128 "addl $0x10, %[lhs_ptr] \n\t"
129 "addl $0x10, %[rhs_ptr] \n\t"
130
131 "pmaddwd %%xmm0, %%xmm3 \n\t"
132 "paddd %%xmm3, %%xmm7 \n\t"
133 "pmaddwd %%xmm0, %%xmm2 \n\t"
134 "paddd %%xmm2, %%xmm6 \n\t"
135
136 "subl $2, %[run_depth_cells]\n\t"
137 "ja outerLoop2%=\n\t"
138
139 "movl %[run_depth_cells], %%eax\n\t"
140 "decl %%eax\n\t"
141 "js finish%=\n\t"
142
143 // Loop for K unrolled by 2
144 "outerLoop1%=:\n\t"
145
146 // RHS cell to xmm1
147 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
148
149 // LHS cell
150 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
151 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
152 "pmaddwd %%xmm0, %%xmm2 \n\t"
153 "paddd %%xmm2, %%xmm4 \n\t"
154 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
155 "pmaddwd %%xmm0, %%xmm3 \n\t"
156 "paddd %%xmm3, %%xmm5 \n\t"
157
158 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
159 "pmaddwd %%xmm0, %%xmm2 \n\t"
160 "paddd %%xmm2, %%xmm6 \n\t"
161 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
162 "pmaddwd %%xmm0, %%xmm3 \n\t"
163 "paddd %%xmm3, %%xmm7 \n\t"
164
165 "addl $0x08, %[lhs_ptr]\n\t"
166 "addl $0x08, %[rhs_ptr]\n\t"
167
168 "decl %[run_depth_cells]\n\t"
169 "jnz outerLoop1%=\n\t"
170
171 "finish%=:\n\t"
172
173 "movl %[dst_col_stride], %%eax\n\t"
174 "shll $2, %%eax\n\t"
175
176 "movl %[start_depth], %%ecx\n\t"
177 "test %%ecx, %%ecx\n\t"
178 "jz storeDst%=\n\t"
179
180 "leal (%%eax,%%eax,0x2), %%ecx\n\t"
181 "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t"
182 "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t"
183 "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t"
184 "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t"
185
186 "storeDst%=:\n\t"
187
188 "leal (%%eax,%%eax,0x2), %%ecx\n\t"
189 "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t"
190 "movdqu %%xmm5 , 0x00(%[dst_ptr], %%eax, 1)\n\t"
191 "movdqu %%xmm6 , 0x00(%[dst_ptr], %%eax, 2)\n\t"
192 "movdqu %%xmm7 , 0x00(%[dst_ptr], %%ecx, 1)\n\t"
193
194 : // outputs
195 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
196 [dst_ptr] "+r"(dst_ptr)
197 : // inputs
198 [start_depth] "g"(start_depth), [dst_col_stride] "g"(dst_col_stride),
199 [run_depth_cells] "g"(run_depth_cells)
200 : // clobbers
201 "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
202 "%xmm6", "%xmm7", "%eax", "%ecx");
203 }
204};
205#endif
206#ifdef GEMMLOWP_SSE4_64
207struct SSE4_64_Kernel12x4Depth2 : KernelBase {
208 typedef KernelFormat<
209 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>,
210 KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> >
211 Format;
212
213 const char* Name() const override { return "SSE, 12x4, depth 2"; }
214
215 void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride,
216 std::size_t dst_col_stride, const std::uint8_t* lhs_ptr,
217 const std::uint8_t* rhs_ptr, std::size_t start_depth,
218 std::size_t run_depth) const override {
219 ScopedProfilingLabel label("optimized kernel");
220 assert(dst_row_stride == 1);
221 (void)dst_row_stride;
222 const std::int64_t run_depth_cells = run_depth / Format::kDepth;
223 const std::int64_t dst_col_stride_q = dst_col_stride;
224
225 /* Main loop */
226
227 // A 2x4 cell of Rhs is stored in 16bit in xmm1 .
228 // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced
229 // every Iteration.
230 // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15.
231 //
232 // +-------+-------+-------+-------+
233 // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]|
234 // Rhs +-------+---------------+-------+
235 // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]|
236 // +-------+-------+-------+-------+
237 //
238 // | | | | |
239 //
240 // Lhs | | | | |
241 //
242 // +--+--+ - - - - +-------+-------+-------+-------+
243 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 |
244 // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 |
245 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 |
246 // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 |
247 // +--+--+ - - - - +-------+-------+-------+-------+
248 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 |
249 // |xmm0 | (Iter2) | xmm8 | xmm9 | xmm10 | xmm11 |
250 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 |
251 // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 |
252 // +--+--+ - - - - +-------+-------+-------+-------+
253 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 |
254 // |xmm0 | (Iter3) | xmm12 | xmm13 | xmm14 | xmm15 |
255 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 |
256 // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 |
257 // +--+--+ - - - - +-------+-------+-------+-------+
258 //
259 // Accumulator
260
261 asm volatile(
262
263 // Set registers for destination
264 "movq %[dst_col_stride_q], %%r12\n\t"
265 "shlq $2, %%r12\n\t"
266 "leaq (%%r12,%%r12,0x2), %%r13\n\t"
267
268 // Set accumulators to zero.
269 "pxor %%xmm4 , %%xmm4 \n\t"
270 "pxor %%xmm5 , %%xmm5 \n\t"
271 "pxor %%xmm6 , %%xmm6 \n\t"
272 "pxor %%xmm7 , %%xmm7 \n\t"
273 "pxor %%xmm8 , %%xmm8 \n\t"
274 "pxor %%xmm9 , %%xmm9 \n\t"
275 "pxor %%xmm10 , %%xmm10\n\t"
276 "pxor %%xmm11 , %%xmm11\n\t"
277 "pxor %%xmm12 , %%xmm12\n\t"
278 "pxor %%xmm13 , %%xmm13\n\t"
279 "pxor %%xmm14 , %%xmm14\n\t"
280 "pxor %%xmm15 , %%xmm15\n\t"
281
282 "movq %[run_depth_cells], %%r14\n\t"
283 "subq $2, %%r14\n\t"
284 "js outerLoop1%=\n\t"
285
286 // Loop for K unrolled by 4
287 "outerLoop2%=:\n\t"
288
289 // K = 1,2
290 // RHS cell to xmm1
291
292 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
293
294 // LHS cell
295 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
296 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
297 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
298 "pmaddwd %%xmm0, %%xmm2 \n\t"
299 "pmaddwd %%xmm0, %%xmm3 \n\t"
300 "paddd %%xmm2, %%xmm4 \n\t"
301 "paddd %%xmm3, %%xmm5 \n\t"
302
303 "prefetcht0 0x80(%[lhs_ptr]) \n\t"
304
305 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
306 "pmaddwd %%xmm0, %%xmm2 \n\t"
307 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
308 "pmaddwd %%xmm0, %%xmm3 \n\t"
309
310 // next LHS cell
311 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
312
313 "paddd %%xmm2, %%xmm6 \n\t"
314 "paddd %%xmm3, %%xmm7 \n\t"
315
316 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
317 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
318 "pmaddwd %%xmm0, %%xmm2 \n\t"
319 "pmaddwd %%xmm0, %%xmm3 \n\t"
320 "paddd %%xmm2, %%xmm8 \n\t"
321 "paddd %%xmm3, %%xmm9 \n\t"
322
323 "prefetcht0 0x80(%[rhs_ptr]) \n\t"
324
325 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
326 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
327 "pmaddwd %%xmm0, %%xmm2 \n\t"
328 "pmaddwd %%xmm0, %%xmm3 \n\t"
329 "paddd %%xmm2, %%xmm10 \n\t"
330 "paddd %%xmm3, %%xmm11 \n\t"
331
332 // next LHS cell
333 "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
334 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
335 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
336 "pmaddwd %%xmm0, %%xmm2 \n\t"
337 "pmaddwd %%xmm0, %%xmm3 \n\t"
338 "paddd %%xmm2, %%xmm12 \n\t"
339 "paddd %%xmm3, %%xmm13 \n\t"
340
341 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
342 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
343 "pmaddwd %%xmm0, %%xmm2 \n\t"
344 "pmaddwd %%xmm0, %%xmm3 \n\t"
345 "paddd %%xmm2, %%xmm14 \n\t"
346 "paddd %%xmm3, %%xmm15 \n\t"
347
348 // K = 3,4
349 // RHS cell to xmm1
350 "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t"
351
352 // LHS cell
353 "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t"
354 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
355 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
356 "pmaddwd %%xmm0, %%xmm2 \n\t"
357 "pmaddwd %%xmm0, %%xmm3 \n\t"
358 "paddd %%xmm2, %%xmm4 \n\t"
359 "paddd %%xmm3, %%xmm5 \n\t"
360
361 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
362 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
363 "pmaddwd %%xmm0, %%xmm2 \n\t"
364 "pmaddwd %%xmm0, %%xmm3 \n\t"
365 "paddd %%xmm2, %%xmm6 \n\t"
366 "paddd %%xmm3, %%xmm7 \n\t"
367
368 // next LHS cell
369 "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t"
370 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
371 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
372 "pmaddwd %%xmm0, %%xmm2 \n\t"
373 "pmaddwd %%xmm0, %%xmm3 \n\t"
374 "paddd %%xmm2, %%xmm8 \n\t"
375 "paddd %%xmm3, %%xmm9 \n\t"
376
377 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
378 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
379 "pmaddwd %%xmm0, %%xmm2 \n\t"
380 "pmaddwd %%xmm0, %%xmm3 \n\t"
381 "paddd %%xmm2, %%xmm10 \n\t"
382 "paddd %%xmm3, %%xmm11 \n\t"
383
384 // next LHS cell
385 "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t"
386
387 "addq $0x30, %[lhs_ptr] \n\t"
388 "addq $0x10, %[rhs_ptr] \n\t"
389
390 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
391 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
392 "pmaddwd %%xmm0, %%xmm2 \n\t"
393 "pmaddwd %%xmm0, %%xmm3 \n\t"
394 "paddd %%xmm2, %%xmm12 \n\t"
395 "paddd %%xmm3, %%xmm13 \n\t"
396
397 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
398 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
399 "pmaddwd %%xmm0, %%xmm2 \n\t"
400 "pmaddwd %%xmm0, %%xmm3 \n\t"
401 "paddd %%xmm2, %%xmm14 \n\t"
402 "paddd %%xmm3, %%xmm15 \n\t"
403
404 "subq $2, %[run_depth_cells]\n\t"
405 "ja outerLoop2%=\n\t"
406
407 "movq %[run_depth_cells], %%r14\n\t"
408 "decq %%r14\n\t"
409 "js finish%=\n\t"
410
411 // Loop for K unrolled by 2
412 "outerLoop1%=:\n\t"
413
414 // RHS cell to xmm1
415 "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t"
416
417 // LHS cell
418 "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t"
419 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
420 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
421 "pmaddwd %%xmm0, %%xmm2 \n\t"
422 "pmaddwd %%xmm0, %%xmm3 \n\t"
423 "paddd %%xmm2, %%xmm4 \n\t"
424 "paddd %%xmm3, %%xmm5 \n\t"
425 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
426 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
427 "pmaddwd %%xmm0, %%xmm2 \n\t"
428 "pmaddwd %%xmm0, %%xmm3 \n\t"
429 "paddd %%xmm2, %%xmm6 \n\t"
430 "paddd %%xmm3, %%xmm7 \n\t"
431
432 // next LHS cell
433 "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t"
434 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
435 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
436 "pmaddwd %%xmm0, %%xmm2 \n\t"
437 "pmaddwd %%xmm0, %%xmm3 \n\t"
438 "paddd %%xmm2, %%xmm8 \n\t"
439 "paddd %%xmm3, %%xmm9 \n\t"
440 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
441 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
442 "pmaddwd %%xmm0, %%xmm2 \n\t"
443 "pmaddwd %%xmm0, %%xmm3 \n\t"
444 "paddd %%xmm2, %%xmm10 \n\t"
445 "paddd %%xmm3, %%xmm11 \n\t"
446
447 // next LHS cell
448 "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t"
449
450 "addq $0x18, %[lhs_ptr] \n\t"
451 "addq $0x08, %[rhs_ptr] \n\t"
452
453 "pshufd $0x00,%%xmm1,%%xmm2 \n\t"
454 "pshufd $0x55,%%xmm1,%%xmm3 \n\t"
455 "pmaddwd %%xmm0, %%xmm2 \n\t"
456 "pmaddwd %%xmm0, %%xmm3 \n\t"
457 "paddd %%xmm2, %%xmm12 \n\t"
458 "paddd %%xmm3, %%xmm13 \n\t"
459 "pshufd $0xaa,%%xmm1,%%xmm2 \n\t"
460 "pshufd $0xff,%%xmm1,%%xmm3 \n\t"
461 "pmaddwd %%xmm0, %%xmm2 \n\t"
462 "pmaddwd %%xmm0, %%xmm3 \n\t"
463 "paddd %%xmm2, %%xmm14 \n\t"
464 "paddd %%xmm3, %%xmm15 \n\t"
465
466 "decq %[run_depth_cells]\n\t"
467 "jnz outerLoop1%=\n\t"
468
469 "finish%=:\n\t"
470
471 "test %[start_depth], %[start_depth]\n\t"
472 "jz storeDst%=\n\t"
473
474 "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t"
475 "paddd 0x10(%[dst_ptr]) , %%xmm8 \n\t"
476 "paddd 0x20(%[dst_ptr]) , %%xmm12\n\t"
477 "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t"
478 "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t"
479 "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t"
480 "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t"
481 "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t"
482 "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t"
483 "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t"
484 "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t"
485 "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t"
486
487 "storeDst%=:\n\t"
488
489 "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t"
490 "movdqu %%xmm8 , 0x10(%[dst_ptr]) \n\t"
491 "movdqu %%xmm12 , 0x20(%[dst_ptr]) \n\t"
492 "movdqu %%xmm5 , 0x00(%[dst_ptr], %%r12, 1)\n\t"
493 "movdqu %%xmm9 , 0x10(%[dst_ptr], %%r12, 1)\n\t"
494 "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t"
495 "movdqu %%xmm6 , 0x00(%[dst_ptr], %%r12, 2)\n\t"
496 "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t"
497 "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t"
498 "movdqu %%xmm7 , 0x00(%[dst_ptr], %%r13, 1)\n\t"
499 "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t"
500 "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t"
501
502 : // outputs
503 [lhs_ptr] "+r"(lhs_ptr), [rhs_ptr] "+r"(rhs_ptr),
504 [dst_ptr] "+r"(dst_ptr)
505 : // inputs
506 [start_depth] "r"(start_depth),
507 [dst_col_stride_q] "r"(dst_col_stride_q),
508 [run_depth_cells] "r"(run_depth_cells)
509 : // clobbers
510 "cc", "memory", "%xmm0", "%xmm1", "%xmm3", "%xmm2", "%xmm4", "%xmm5",
511 "%xmm6", "%xmm7", "%xmm8", "%xmm9", "%xmm10", "%r12", "%r13", "%r14",
512 "%xmm11", "%xmm12", "%xmm13", "%xmm14", "%xmm15");
513 }
514};
515#endif
516
517} // namespace gemmlowp
518
519#endif // GEMMLOWP_INTERNAL_KERNEL_SSE_H_
520