1 | // Copyright 2015 The Gemmlowp Authors. All Rights Reserved. |
2 | // |
3 | // Licensed under the Apache License, Version 2.0 (the "License"); |
4 | // you may not use this file except in compliance with the License. |
5 | // You may obtain a copy of the License at |
6 | // |
7 | // http://www.apache.org/licenses/LICENSE-2.0 |
8 | // |
9 | // Unless required by applicable law or agreed to in writing, software |
10 | // distributed under the License is distributed on an "AS IS" BASIS, |
11 | // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
12 | // See the License for the specific language governing permissions and |
13 | // limitations under the License. |
14 | |
15 | // kernel_SSE.h: a collection of Intel SSE optimized kernels. |
16 | // Check in kernel_default.h which one(s) are actually used by default. |
17 | // Others are mere experiments; they are still covered by tests |
18 | // in case they might be useful some day. |
19 | // |
20 | |
21 | #ifndef GEMMLOWP_INTERNAL_KERNEL_SSE_H_ |
22 | #define GEMMLOWP_INTERNAL_KERNEL_SSE_H_ |
23 | |
24 | #include "kernel.h" |
25 | |
26 | #include <string.h> |
27 | #include <cassert> |
28 | |
29 | namespace gemmlowp { |
30 | |
31 | #ifdef GEMMLOWP_SSE4_32 |
32 | struct SSE4_32_Kernel4x4Depth2 : KernelBase { |
33 | typedef KernelFormat< |
34 | KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1>, |
35 | KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > |
36 | Format; |
37 | |
38 | const char* Name() const override { return "SSE, 4x4, depth 2" ; } |
39 | |
40 | void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, |
41 | std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, |
42 | const std::uint8_t* rhs_ptr, std::size_t start_depth, |
43 | std::size_t run_depth) const override { |
44 | ScopedProfilingLabel label("optimized kernel" ); |
45 | assert(dst_row_stride == 1); |
46 | (void)dst_row_stride; |
47 | std::int32_t run_depth_cells = run_depth / Format::kDepth; |
48 | /* Main loop */ |
49 | |
50 | // A 2x4 cell of Rhs is stored in 16bit in xmm1 . |
51 | // A 4x2 block Lhs is stored in 16bit in xmm0. |
52 | // A 4x4 block of accumulators is stored in 32bit in xmm4--xmm7. |
53 | // |
54 | // +-------+-------+-------+-------+ |
55 | // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| |
56 | // Rhs +-------+---------------+-------+ |
57 | // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| |
58 | // +-------+-------+-------+-------+ |
59 | // |
60 | // | | | | | |
61 | // |
62 | // Lhs | | | | | |
63 | // |
64 | // +--+--+ - - - - +-------+-------+-------+-------+ |
65 | // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | |
66 | // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | |
67 | // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | |
68 | // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | |
69 | // +--+--+ - - - - +-------+-------+-------+-------+ |
70 | // |
71 | // Accumulator |
72 | |
73 | asm volatile( |
74 | |
75 | // set accumulators to zero. |
76 | "pxor %%xmm4 , %%xmm4 \n\t" |
77 | "pxor %%xmm5 , %%xmm5 \n\t" |
78 | "pxor %%xmm6 , %%xmm6 \n\t" |
79 | "pxor %%xmm7 , %%xmm7 \n\t" |
80 | |
81 | "movl %[run_depth_cells], %%eax\n\t" |
82 | "subl $2, %%eax\n\t" |
83 | "js outerLoop1%=\n\t" |
84 | |
85 | // Loop for K unrolled by 4 |
86 | "outerLoop2%=:\n\t" |
87 | |
88 | // K = 1,2 |
89 | // RHS cell to xmm1 |
90 | "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" |
91 | |
92 | // LHS cell |
93 | "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" |
94 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
95 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
96 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
97 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
98 | "paddd %%xmm2, %%xmm4 \n\t" |
99 | "paddd %%xmm3, %%xmm5 \n\t" |
100 | |
101 | "prefetcht0 0x80(%[lhs_ptr]) \n\t" |
102 | |
103 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
104 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
105 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
106 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
107 | |
108 | "prefetcht0 0x80(%[rhs_ptr]) \n\t" |
109 | |
110 | // K = 3,4 |
111 | // RHS cell to xmm1 |
112 | "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" |
113 | |
114 | "paddd %%xmm2, %%xmm6 \n\t" |
115 | "paddd %%xmm3, %%xmm7 \n\t" |
116 | |
117 | // LHS cell |
118 | "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" |
119 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
120 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
121 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
122 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
123 | "paddd %%xmm2, %%xmm4 \n\t" |
124 | "paddd %%xmm3, %%xmm5 \n\t" |
125 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
126 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
127 | |
128 | "addl $0x10, %[lhs_ptr] \n\t" |
129 | "addl $0x10, %[rhs_ptr] \n\t" |
130 | |
131 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
132 | "paddd %%xmm3, %%xmm7 \n\t" |
133 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
134 | "paddd %%xmm2, %%xmm6 \n\t" |
135 | |
136 | "subl $2, %[run_depth_cells]\n\t" |
137 | "ja outerLoop2%=\n\t" |
138 | |
139 | "movl %[run_depth_cells], %%eax\n\t" |
140 | "decl %%eax\n\t" |
141 | "js finish%=\n\t" |
142 | |
143 | // Loop for K unrolled by 2 |
144 | "outerLoop1%=:\n\t" |
145 | |
146 | // RHS cell to xmm1 |
147 | "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" |
148 | |
149 | // LHS cell |
150 | "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" |
151 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
152 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
153 | "paddd %%xmm2, %%xmm4 \n\t" |
154 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
155 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
156 | "paddd %%xmm3, %%xmm5 \n\t" |
157 | |
158 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
159 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
160 | "paddd %%xmm2, %%xmm6 \n\t" |
161 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
162 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
163 | "paddd %%xmm3, %%xmm7 \n\t" |
164 | |
165 | "addl $0x08, %[lhs_ptr]\n\t" |
166 | "addl $0x08, %[rhs_ptr]\n\t" |
167 | |
168 | "decl %[run_depth_cells]\n\t" |
169 | "jnz outerLoop1%=\n\t" |
170 | |
171 | "finish%=:\n\t" |
172 | |
173 | "movl %[dst_col_stride], %%eax\n\t" |
174 | "shll $2, %%eax\n\t" |
175 | |
176 | "movl %[start_depth], %%ecx\n\t" |
177 | "test %%ecx, %%ecx\n\t" |
178 | "jz storeDst%=\n\t" |
179 | |
180 | "leal (%%eax,%%eax,0x2), %%ecx\n\t" |
181 | "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" |
182 | "paddd 0x00(%[dst_ptr], %%eax, 1) , %%xmm5 \n\t" |
183 | "paddd 0x00(%[dst_ptr], %%eax, 2) , %%xmm6 \n\t" |
184 | "paddd 0x00(%[dst_ptr], %%ecx, 1) , %%xmm7 \n\t" |
185 | |
186 | "storeDst%=:\n\t" |
187 | |
188 | "leal (%%eax,%%eax,0x2), %%ecx\n\t" |
189 | "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" |
190 | "movdqu %%xmm5 , 0x00(%[dst_ptr], %%eax, 1)\n\t" |
191 | "movdqu %%xmm6 , 0x00(%[dst_ptr], %%eax, 2)\n\t" |
192 | "movdqu %%xmm7 , 0x00(%[dst_ptr], %%ecx, 1)\n\t" |
193 | |
194 | : // outputs |
195 | [lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), |
196 | [dst_ptr] "+r" (dst_ptr) |
197 | : // inputs |
198 | [start_depth] "g" (start_depth), [dst_col_stride] "g" (dst_col_stride), |
199 | [run_depth_cells] "g" (run_depth_cells) |
200 | : // clobbers |
201 | "cc" , "memory" , "%xmm0" , "%xmm1" , "%xmm3" , "%xmm2" , "%xmm4" , "%xmm5" , |
202 | "%xmm6" , "%xmm7" , "%eax" , "%ecx" ); |
203 | } |
204 | }; |
205 | #endif |
206 | #ifdef GEMMLOWP_SSE4_64 |
207 | struct SSE4_64_Kernel12x4Depth2 : KernelBase { |
208 | typedef KernelFormat< |
209 | KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 3>, |
210 | KernelSideFormat<CellFormat<4, 2, CellOrder::WidthMajor>, 1> > |
211 | Format; |
212 | |
213 | const char* Name() const override { return "SSE, 12x4, depth 2" ; } |
214 | |
215 | void Run(std::int32_t* dst_ptr, std::size_t dst_row_stride, |
216 | std::size_t dst_col_stride, const std::uint8_t* lhs_ptr, |
217 | const std::uint8_t* rhs_ptr, std::size_t start_depth, |
218 | std::size_t run_depth) const override { |
219 | ScopedProfilingLabel label("optimized kernel" ); |
220 | assert(dst_row_stride == 1); |
221 | (void)dst_row_stride; |
222 | const std::int64_t run_depth_cells = run_depth / Format::kDepth; |
223 | const std::int64_t dst_col_stride_q = dst_col_stride; |
224 | |
225 | /* Main loop */ |
226 | |
227 | // A 2x4 cell of Rhs is stored in 16bit in xmm1 . |
228 | // A 12x2 block of 3 4x2 cells Lhs is stored in 16bit in xmm0, replaced |
229 | // every Iteration. |
230 | // A 12x4 block of accumulators is stored in 32bit in xmm4--xmm15. |
231 | // |
232 | // +-------+-------+-------+-------+ |
233 | // |xmm1[0]|xmm1[2]|xmm1[4]|xmm1[6]| |
234 | // Rhs +-------+---------------+-------+ |
235 | // |xmm1[1]|xmm1[3]|xmm1[5]|xmm1[7]| |
236 | // +-------+-------+-------+-------+ |
237 | // |
238 | // | | | | | |
239 | // |
240 | // Lhs | | | | | |
241 | // |
242 | // +--+--+ - - - - +-------+-------+-------+-------+ |
243 | // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | |
244 | // |xmm0 | (Iter1) | xmm4 | xmm5 | xmm6 | xmm7 | |
245 | // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | |
246 | // |xmm0 | | xmm4 | xmm5 | xmm6 | xmm7 | |
247 | // +--+--+ - - - - +-------+-------+-------+-------+ |
248 | // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | |
249 | // |xmm0 | (Iter2) | xmm8 | xmm9 | xmm10 | xmm11 | |
250 | // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | |
251 | // |xmm0 | | xmm8 | xmm9 | xmm10 | xmm11 | |
252 | // +--+--+ - - - - +-------+-------+-------+-------+ |
253 | // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | |
254 | // |xmm0 | (Iter3) | xmm12 | xmm13 | xmm14 | xmm15 | |
255 | // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | |
256 | // |xmm0 | | xmm12 | xmm13 | xmm14 | xmm15 | |
257 | // +--+--+ - - - - +-------+-------+-------+-------+ |
258 | // |
259 | // Accumulator |
260 | |
261 | asm volatile( |
262 | |
263 | // Set registers for destination |
264 | "movq %[dst_col_stride_q], %%r12\n\t" |
265 | "shlq $2, %%r12\n\t" |
266 | "leaq (%%r12,%%r12,0x2), %%r13\n\t" |
267 | |
268 | // Set accumulators to zero. |
269 | "pxor %%xmm4 , %%xmm4 \n\t" |
270 | "pxor %%xmm5 , %%xmm5 \n\t" |
271 | "pxor %%xmm6 , %%xmm6 \n\t" |
272 | "pxor %%xmm7 , %%xmm7 \n\t" |
273 | "pxor %%xmm8 , %%xmm8 \n\t" |
274 | "pxor %%xmm9 , %%xmm9 \n\t" |
275 | "pxor %%xmm10 , %%xmm10\n\t" |
276 | "pxor %%xmm11 , %%xmm11\n\t" |
277 | "pxor %%xmm12 , %%xmm12\n\t" |
278 | "pxor %%xmm13 , %%xmm13\n\t" |
279 | "pxor %%xmm14 , %%xmm14\n\t" |
280 | "pxor %%xmm15 , %%xmm15\n\t" |
281 | |
282 | "movq %[run_depth_cells], %%r14\n\t" |
283 | "subq $2, %%r14\n\t" |
284 | "js outerLoop1%=\n\t" |
285 | |
286 | // Loop for K unrolled by 4 |
287 | "outerLoop2%=:\n\t" |
288 | |
289 | // K = 1,2 |
290 | // RHS cell to xmm1 |
291 | |
292 | "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" |
293 | |
294 | // LHS cell |
295 | "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" |
296 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
297 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
298 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
299 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
300 | "paddd %%xmm2, %%xmm4 \n\t" |
301 | "paddd %%xmm3, %%xmm5 \n\t" |
302 | |
303 | "prefetcht0 0x80(%[lhs_ptr]) \n\t" |
304 | |
305 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
306 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
307 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
308 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
309 | |
310 | // next LHS cell |
311 | "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" |
312 | |
313 | "paddd %%xmm2, %%xmm6 \n\t" |
314 | "paddd %%xmm3, %%xmm7 \n\t" |
315 | |
316 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
317 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
318 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
319 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
320 | "paddd %%xmm2, %%xmm8 \n\t" |
321 | "paddd %%xmm3, %%xmm9 \n\t" |
322 | |
323 | "prefetcht0 0x80(%[rhs_ptr]) \n\t" |
324 | |
325 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
326 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
327 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
328 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
329 | "paddd %%xmm2, %%xmm10 \n\t" |
330 | "paddd %%xmm3, %%xmm11 \n\t" |
331 | |
332 | // next LHS cell |
333 | "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" |
334 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
335 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
336 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
337 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
338 | "paddd %%xmm2, %%xmm12 \n\t" |
339 | "paddd %%xmm3, %%xmm13 \n\t" |
340 | |
341 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
342 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
343 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
344 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
345 | "paddd %%xmm2, %%xmm14 \n\t" |
346 | "paddd %%xmm3, %%xmm15 \n\t" |
347 | |
348 | // K = 3,4 |
349 | // RHS cell to xmm1 |
350 | "pmovzxbw 0x08(%[rhs_ptr]), %%xmm1\n\t" |
351 | |
352 | // LHS cell |
353 | "pmovzxbw 0x18(%[lhs_ptr]), %%xmm0\n\t" |
354 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
355 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
356 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
357 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
358 | "paddd %%xmm2, %%xmm4 \n\t" |
359 | "paddd %%xmm3, %%xmm5 \n\t" |
360 | |
361 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
362 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
363 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
364 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
365 | "paddd %%xmm2, %%xmm6 \n\t" |
366 | "paddd %%xmm3, %%xmm7 \n\t" |
367 | |
368 | // next LHS cell |
369 | "pmovzxbw 0x20(%[lhs_ptr]), %%xmm0\n\t" |
370 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
371 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
372 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
373 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
374 | "paddd %%xmm2, %%xmm8 \n\t" |
375 | "paddd %%xmm3, %%xmm9 \n\t" |
376 | |
377 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
378 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
379 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
380 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
381 | "paddd %%xmm2, %%xmm10 \n\t" |
382 | "paddd %%xmm3, %%xmm11 \n\t" |
383 | |
384 | // next LHS cell |
385 | "pmovzxbw 0x28(%[lhs_ptr]), %%xmm0\n\t" |
386 | |
387 | "addq $0x30, %[lhs_ptr] \n\t" |
388 | "addq $0x10, %[rhs_ptr] \n\t" |
389 | |
390 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
391 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
392 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
393 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
394 | "paddd %%xmm2, %%xmm12 \n\t" |
395 | "paddd %%xmm3, %%xmm13 \n\t" |
396 | |
397 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
398 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
399 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
400 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
401 | "paddd %%xmm2, %%xmm14 \n\t" |
402 | "paddd %%xmm3, %%xmm15 \n\t" |
403 | |
404 | "subq $2, %[run_depth_cells]\n\t" |
405 | "ja outerLoop2%=\n\t" |
406 | |
407 | "movq %[run_depth_cells], %%r14\n\t" |
408 | "decq %%r14\n\t" |
409 | "js finish%=\n\t" |
410 | |
411 | // Loop for K unrolled by 2 |
412 | "outerLoop1%=:\n\t" |
413 | |
414 | // RHS cell to xmm1 |
415 | "pmovzxbw (%[rhs_ptr]), %%xmm1\n\t" |
416 | |
417 | // LHS cell |
418 | "pmovzxbw 0x00(%[lhs_ptr]), %%xmm0\n\t" |
419 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
420 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
421 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
422 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
423 | "paddd %%xmm2, %%xmm4 \n\t" |
424 | "paddd %%xmm3, %%xmm5 \n\t" |
425 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
426 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
427 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
428 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
429 | "paddd %%xmm2, %%xmm6 \n\t" |
430 | "paddd %%xmm3, %%xmm7 \n\t" |
431 | |
432 | // next LHS cell |
433 | "pmovzxbw 0x08(%[lhs_ptr]), %%xmm0\n\t" |
434 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
435 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
436 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
437 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
438 | "paddd %%xmm2, %%xmm8 \n\t" |
439 | "paddd %%xmm3, %%xmm9 \n\t" |
440 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
441 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
442 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
443 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
444 | "paddd %%xmm2, %%xmm10 \n\t" |
445 | "paddd %%xmm3, %%xmm11 \n\t" |
446 | |
447 | // next LHS cell |
448 | "pmovzxbw 0x10(%[lhs_ptr]), %%xmm0\n\t" |
449 | |
450 | "addq $0x18, %[lhs_ptr] \n\t" |
451 | "addq $0x08, %[rhs_ptr] \n\t" |
452 | |
453 | "pshufd $0x00,%%xmm1,%%xmm2 \n\t" |
454 | "pshufd $0x55,%%xmm1,%%xmm3 \n\t" |
455 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
456 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
457 | "paddd %%xmm2, %%xmm12 \n\t" |
458 | "paddd %%xmm3, %%xmm13 \n\t" |
459 | "pshufd $0xaa,%%xmm1,%%xmm2 \n\t" |
460 | "pshufd $0xff,%%xmm1,%%xmm3 \n\t" |
461 | "pmaddwd %%xmm0, %%xmm2 \n\t" |
462 | "pmaddwd %%xmm0, %%xmm3 \n\t" |
463 | "paddd %%xmm2, %%xmm14 \n\t" |
464 | "paddd %%xmm3, %%xmm15 \n\t" |
465 | |
466 | "decq %[run_depth_cells]\n\t" |
467 | "jnz outerLoop1%=\n\t" |
468 | |
469 | "finish%=:\n\t" |
470 | |
471 | "test %[start_depth], %[start_depth]\n\t" |
472 | "jz storeDst%=\n\t" |
473 | |
474 | "paddd 0x00(%[dst_ptr]) , %%xmm4 \n\t" |
475 | "paddd 0x10(%[dst_ptr]) , %%xmm8 \n\t" |
476 | "paddd 0x20(%[dst_ptr]) , %%xmm12\n\t" |
477 | "paddd 0x00(%[dst_ptr], %%r12, 1) , %%xmm5 \n\t" |
478 | "paddd 0x10(%[dst_ptr], %%r12, 1) , %%xmm9 \n\t" |
479 | "paddd 0x20(%[dst_ptr], %%r12, 1) , %%xmm13\n\t" |
480 | "paddd 0x00(%[dst_ptr], %%r12, 2) , %%xmm6 \n\t" |
481 | "paddd 0x10(%[dst_ptr], %%r12, 2) , %%xmm10\n\t" |
482 | "paddd 0x20(%[dst_ptr], %%r12, 2) , %%xmm14\n\t" |
483 | "paddd 0x00(%[dst_ptr], %%r13, 1) , %%xmm7 \n\t" |
484 | "paddd 0x10(%[dst_ptr], %%r13, 1) , %%xmm11\n\t" |
485 | "paddd 0x20(%[dst_ptr], %%r13, 1) , %%xmm15\n\t" |
486 | |
487 | "storeDst%=:\n\t" |
488 | |
489 | "movdqu %%xmm4 , 0x00(%[dst_ptr]) \n\t" |
490 | "movdqu %%xmm8 , 0x10(%[dst_ptr]) \n\t" |
491 | "movdqu %%xmm12 , 0x20(%[dst_ptr]) \n\t" |
492 | "movdqu %%xmm5 , 0x00(%[dst_ptr], %%r12, 1)\n\t" |
493 | "movdqu %%xmm9 , 0x10(%[dst_ptr], %%r12, 1)\n\t" |
494 | "movdqu %%xmm13 , 0x20(%[dst_ptr], %%r12, 1)\n\t" |
495 | "movdqu %%xmm6 , 0x00(%[dst_ptr], %%r12, 2)\n\t" |
496 | "movdqu %%xmm10 , 0x10(%[dst_ptr], %%r12, 2)\n\t" |
497 | "movdqu %%xmm14 , 0x20(%[dst_ptr], %%r12, 2)\n\t" |
498 | "movdqu %%xmm7 , 0x00(%[dst_ptr], %%r13, 1)\n\t" |
499 | "movdqu %%xmm11 , 0x10(%[dst_ptr], %%r13, 1)\n\t" |
500 | "movdqu %%xmm15 , 0x20(%[dst_ptr], %%r13, 1)\n\t" |
501 | |
502 | : // outputs |
503 | [lhs_ptr] "+r" (lhs_ptr), [rhs_ptr] "+r" (rhs_ptr), |
504 | [dst_ptr] "+r" (dst_ptr) |
505 | : // inputs |
506 | [start_depth] "r" (start_depth), |
507 | [dst_col_stride_q] "r" (dst_col_stride_q), |
508 | [run_depth_cells] "r" (run_depth_cells) |
509 | : // clobbers |
510 | "cc" , "memory" , "%xmm0" , "%xmm1" , "%xmm3" , "%xmm2" , "%xmm4" , "%xmm5" , |
511 | "%xmm6" , "%xmm7" , "%xmm8" , "%xmm9" , "%xmm10" , "%r12" , "%r13" , "%r14" , |
512 | "%xmm11" , "%xmm12" , "%xmm13" , "%xmm14" , "%xmm15" ); |
513 | } |
514 | }; |
515 | #endif |
516 | |
517 | } // namespace gemmlowp |
518 | |
519 | #endif // GEMMLOWP_INTERNAL_KERNEL_SSE_H_ |
520 | |