1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *xe_lp_gemm_nocopy_x8x8s32_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/ocl_math_utils.h" )==""\n"
21R"==(#include "gpu/ocl/gemm/ocl_gemm_attrs.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
23R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
24R"==(#if defined(S8S8) )==""\n"
25R"==(#define A_TYPE char )==""\n"
26R"==(#define A_TYPE2 char2 )==""\n"
27R"==(#define A_TYPE4 char4 )==""\n"
28R"==(#define B_TYPE char )==""\n"
29R"==(#define B_TYPE4 char4 )==""\n"
30R"==(#define AS_A_TYPE as_char )==""\n"
31R"==(#define AS_A_TYPE2 as_char2 )==""\n"
32R"==(#define AS_A_TYPE4 as_char4 )==""\n"
33R"==(#define AS_B_TYPE as_char )==""\n"
34R"==(#define AS_B_TYPE2 as_char2 )==""\n"
35R"==(#define AS_B_TYPE4 as_char4 )==""\n"
36R"==(#endif )==""\n"
37R"==(#if defined(U8S8) )==""\n"
38R"==(#define A_TYPE uchar )==""\n"
39R"==(#define A_TYPE2 uchar2 )==""\n"
40R"==(#define A_TYPE4 uchar4 )==""\n"
41R"==(#define B_TYPE char )==""\n"
42R"==(#define B_TYPE4 char4 )==""\n"
43R"==(#define AS_A_TYPE as_uchar )==""\n"
44R"==(#define AS_A_TYPE2 as_uchar2 )==""\n"
45R"==(#define AS_A_TYPE4 as_uchar4 )==""\n"
46R"==(#define AS_B_TYPE as_char )==""\n"
47R"==(#define AS_B_TYPE2 as_char2 )==""\n"
48R"==(#define AS_B_TYPE4 as_char4 )==""\n"
49R"==(#endif )==""\n"
50R"==(#if defined(S8U8) )==""\n"
51R"==(#define A_TYPE char )==""\n"
52R"==(#define A_TYPE2 char2 )==""\n"
53R"==(#define A_TYPE4 char4 )==""\n"
54R"==(#define B_TYPE uchar )==""\n"
55R"==(#define B_TYPE4 uchar4 )==""\n"
56R"==(#define AS_A_TYPE as_char )==""\n"
57R"==(#define AS_A_TYPE2 as_char2 )==""\n"
58R"==(#define AS_A_TYPE4 as_char4 )==""\n"
59R"==(#define AS_B_TYPE as_uchar )==""\n"
60R"==(#define AS_B_TYPE2 as_uchar2 )==""\n"
61R"==(#define AS_B_TYPE4 as_uchar4 )==""\n"
62R"==(#endif )==""\n"
63R"==(#if defined(U8U8) )==""\n"
64R"==(#define A_TYPE uchar )==""\n"
65R"==(#define A_TYPE2 uchar2 )==""\n"
66R"==(#define A_TYPE4 uchar4 )==""\n"
67R"==(#define B_TYPE uchar )==""\n"
68R"==(#define B_TYPE4 uchar4 )==""\n"
69R"==(#define AS_A_TYPE as_uchar )==""\n"
70R"==(#define AS_A_TYPE2 as_uchar2 )==""\n"
71R"==(#define AS_A_TYPE4 as_uchar4 )==""\n"
72R"==(#define AS_B_TYPE as_uchar )==""\n"
73R"==(#define AS_B_TYPE2 as_uchar2 )==""\n"
74R"==(#define AS_B_TYPE4 as_uchar4 )==""\n"
75R"==(#endif )==""\n"
76R"==(#if defined(TN) )==""\n"
77R"==(#define DO_FMA DO_FMA_TN )==""\n"
78R"==(#endif )==""\n"
79R"==(#if defined(NN) )==""\n"
80R"==(#define DO_FMA DO_FMA_NN )==""\n"
81R"==(#endif )==""\n"
82R"==(#if defined(NT) )==""\n"
83R"==(#define DO_FMA DO_FMA_NT )==""\n"
84R"==(#endif )==""\n"
85R"==(#if defined(TT) )==""\n"
86R"==(#define DO_FMA DO_FMA_TT )==""\n"
87R"==(#endif )==""\n"
88R"==(#define ADD_ROW_A(z) \ )==""\n"
89R"==(do { \ )==""\n"
90R"==(sumRowA[z] = ai[z].s0 + ai[z].s1 + ai[z].s2 + ai[z].s3; \ )==""\n"
91R"==(} while (0) )==""\n"
92R"==(#define ADD_ROW_AT() \ )==""\n"
93R"==(do { \ )==""\n"
94R"==(sumRowA[0] = ai[0].s0 + ai[1].s0 + ai[2].s0 + ai[3].s0; \ )==""\n"
95R"==(sumRowA[1] = ai[0].s1 + ai[1].s1 + ai[2].s1 + ai[3].s1; \ )==""\n"
96R"==(} while (0) )==""\n"
97R"==(#define ADD_COL_B() \ )==""\n"
98R"==(do { \ )==""\n"
99R"==(sumColB = bi.s0 + bi.s1 + bi.s2 + bi.s3; \ )==""\n"
100R"==(} while (0) )==""\n"
101R"==(#define ADD_COL_BT() \ )==""\n"
102R"==(do { \ )==""\n"
103R"==(sumColB = bi[0] + bi[1] + bi[2] + bi[3]; \ )==""\n"
104R"==(} while (0) )==""\n"
105R"==(#ifdef ALIGNED )==""\n"
106R"==(#define VLOAD4_A(z, p) \ )==""\n"
107R"==(do { \ )==""\n"
108R"==(ai[z] = *((global A_TYPE4 *)p); \ )==""\n"
109R"==(} while (0) )==""\n"
110R"==(#else )==""\n"
111R"==(#define VLOAD4_A(z, p) \ )==""\n"
112R"==(do { \ )==""\n"
113R"==(ai[z].s0 = *(p + 0); \ )==""\n"
114R"==(ai[z].s1 = *(p + 1); \ )==""\n"
115R"==(ai[z].s2 = *(p + 2); \ )==""\n"
116R"==(ai[z].s3 = *(p + 3); \ )==""\n"
117R"==(} while (0) )==""\n"
118R"==(#endif )==""\n"
119R"==(#ifdef ALIGNED )==""\n"
120R"==(#define BLOCK_READ_A(h, hh) \ )==""\n"
121R"==(do { \ )==""\n"
122R"==(ai[hh] = AS_A_TYPE2(intel_sub_group_block_read_uc2( \ )==""\n"
123R"==((global uchar *)(a_ptrs[hh] + h * lda))); \ )==""\n"
124R"==(} while (0) )==""\n"
125R"==(#else )==""\n"
126R"==(#define BLOCK_READ_A(h, hh) \ )==""\n"
127R"==(do { \ )==""\n"
128R"==(ai[hh].s0 = *((a_ptrs[hh] + h * lda) + 0); \ )==""\n"
129R"==(ai[hh].s1 = *((a_ptrs[hh] + h * lda) + 16); \ )==""\n"
130R"==(} while (0) )==""\n"
131R"==(#endif )==""\n"
132R"==(#ifdef ALIGNED )==""\n"
133R"==(#define BLOCK_READ_B(h, hh) \ )==""\n"
134R"==(do { \ )==""\n"
135R"==(bi[hh] = AS_B_TYPE(intel_sub_group_block_read_uc( \ )==""\n"
136R"==((global uchar *)(b_ptrs[hh] + h * ldb))); \ )==""\n"
137R"==(} while (0) )==""\n"
138R"==(#else )==""\n"
139R"==(#define BLOCK_READ_B(h, hh) \ )==""\n"
140R"==(do { \ )==""\n"
141R"==(bi[hh] = *(b_ptrs[hh] + h * ldb); \ )==""\n"
142R"==(} while (0) )==""\n"
143R"==(#endif )==""\n"
144R"==(#ifdef ALIGNED )==""\n"
145R"==(#define VLOAD4_B(p) \ )==""\n"
146R"==(do { \ )==""\n"
147R"==(bi = *((global B_TYPE4 *)p); \ )==""\n"
148R"==(} while (0) )==""\n"
149R"==(#else )==""\n"
150R"==(#define VLOAD4_B(p) \ )==""\n"
151R"==(do { \ )==""\n"
152R"==(bi.s0 = *(p + 0); \ )==""\n"
153R"==(bi.s1 = *(p + 1); \ )==""\n"
154R"==(bi.s2 = *(p + 2); \ )==""\n"
155R"==(bi.s3 = *(p + 3); \ )==""\n"
156R"==(} while (0) )==""\n"
157R"==(#endif )==""\n"
158R"==(#define LOADA_REM(z, p) \ )==""\n"
159R"==(do { \ )==""\n"
160R"==(if (krem == 3) { \ )==""\n"
161R"==(ai[z].s0 = *(p + 0); \ )==""\n"
162R"==(ai[z].s1 = *(p + 1); \ )==""\n"
163R"==(ai[z].s2 = *(p + 2); \ )==""\n"
164R"==(} \ )==""\n"
165R"==(if (krem == 2) { \ )==""\n"
166R"==(ai[z].s0 = *(p + 0); \ )==""\n"
167R"==(ai[z].s1 = *(p + 1); \ )==""\n"
168R"==(} \ )==""\n"
169R"==(if (krem == 1) { ai[z].s0 = *(p + 0); } \ )==""\n"
170R"==(} while (0) )==""\n"
171R"==(#define LOADB_REM(p) \ )==""\n"
172R"==(do { \ )==""\n"
173R"==(if (krem == 3) { \ )==""\n"
174R"==(bi.s0 = *(p + 0); \ )==""\n"
175R"==(bi.s1 = *(p + 1); \ )==""\n"
176R"==(bi.s2 = *(p + 2); \ )==""\n"
177R"==(} \ )==""\n"
178R"==(if (krem == 2) { \ )==""\n"
179R"==(bi.s0 = *(p + 0); \ )==""\n"
180R"==(bi.s1 = *(p + 1); \ )==""\n"
181R"==(} \ )==""\n"
182R"==(if (krem == 1) { bi.s0 = *(p + 0); } \ )==""\n"
183R"==(} while (0) )==""\n"
184R"==(#define COPYA() \ )==""\n"
185R"==(do { \ )==""\n"
186R"==(ait[0].s0 = ai[0].s0; \ )==""\n"
187R"==(ait[0].s1 = ai[1].s0; \ )==""\n"
188R"==(ait[0].s2 = ai[2].s0; \ )==""\n"
189R"==(ait[0].s3 = ai[3].s0; \ )==""\n"
190R"==(ait[1].s0 = ai[0].s1; \ )==""\n"
191R"==(ait[1].s1 = ai[1].s1; \ )==""\n"
192R"==(ait[1].s2 = ai[2].s1; \ )==""\n"
193R"==(ait[1].s3 = ai[3].s1; \ )==""\n"
194R"==(} while (0) )==""\n"
195R"==(#define COPYB() \ )==""\n"
196R"==(do { \ )==""\n"
197R"==(biit.s0 = bi[0]; \ )==""\n"
198R"==(biit.s1 = bi[1]; \ )==""\n"
199R"==(biit.s2 = bi[2]; \ )==""\n"
200R"==(biit.s3 = bi[3]; \ )==""\n"
201R"==(} while (0) )==""\n"
202R"==(#define DO_FMA_TN(h, i) \ )==""\n"
203R"==(do { \ )==""\n"
204R"==(ci[0][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(bi), i)), \ )==""\n"
205R"==(AS_A_TYPE4(ai[0]), ci[0][i]); \ )==""\n"
206R"==(ci[1][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(bi), i)), \ )==""\n"
207R"==(AS_A_TYPE4(ai[1]), ci[1][i]); \ )==""\n"
208R"==(} while (0) )==""\n"
209R"==(#define DO_FMA_NN(h, i) \ )==""\n"
210R"==(do { \ )==""\n"
211R"==(ci[0][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(bi), i)), \ )==""\n"
212R"==(AS_A_TYPE4(ait[0]), ci[0][i]); \ )==""\n"
213R"==(ci[1][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(bi), i)), \ )==""\n"
214R"==(AS_A_TYPE4(ait[1]), ci[1][i]); \ )==""\n"
215R"==(} while (0) )==""\n"
216R"==(#define DO_FMA_NT(h, i) \ )==""\n"
217R"==(do { \ )==""\n"
218R"==(ci[0][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(biit), i)), \ )==""\n"
219R"==(AS_A_TYPE4(ait[0]), ci[0][i]); \ )==""\n"
220R"==(ci[1][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(biit), i)), \ )==""\n"
221R"==(AS_A_TYPE4(ait[1]), ci[1][i]); \ )==""\n"
222R"==(} while (0) )==""\n"
223R"==(#define DO_FMA_TT(h, i) \ )==""\n"
224R"==(do { \ )==""\n"
225R"==(ci[0][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(biit), i)), \ )==""\n"
226R"==(AS_A_TYPE4(ai[0]), ci[0][i]); \ )==""\n"
227R"==(ci[1][i] = idot4(AS_B_TYPE4(sub_group_broadcast(as_int(biit), i)), \ )==""\n"
228R"==(AS_A_TYPE4(ai[1]), ci[1][i]); \ )==""\n"
229R"==(} while (0) )==""\n"
230R"==(#if WITH_ELTWISE == 1 )==""\n"
231R"==(#define POST_OP(val) \ )==""\n"
232R"==(do { \ )==""\n"
233R"==(if (apply_eltwise) \ )==""\n"
234R"==(val = fwd_eltwise( \ )==""\n"
235R"==(val, eltwise_alpha, eltwise_beta, eltwise_scale); \ )==""\n"
236R"==(} while (0) )==""\n"
237R"==(#else )==""\n"
238R"==(#define POST_OP(val) )==""\n"
239R"==(#endif )==""\n"
240R"==(#define FMA_I_LOOP(h) \ )==""\n"
241R"==(do { \ )==""\n"
242R"==(DO_FMA(h, 0); \ )==""\n"
243R"==(DO_FMA(h, 1); \ )==""\n"
244R"==(DO_FMA(h, 2); \ )==""\n"
245R"==(DO_FMA(h, 3); \ )==""\n"
246R"==(DO_FMA(h, 4); \ )==""\n"
247R"==(DO_FMA(h, 5); \ )==""\n"
248R"==(DO_FMA(h, 6); \ )==""\n"
249R"==(DO_FMA(h, 7); \ )==""\n"
250R"==(DO_FMA(h, 8); \ )==""\n"
251R"==(DO_FMA(h, 9); \ )==""\n"
252R"==(DO_FMA(h, 10); \ )==""\n"
253R"==(DO_FMA(h, 11); \ )==""\n"
254R"==(DO_FMA(h, 12); \ )==""\n"
255R"==(DO_FMA(h, 13); \ )==""\n"
256R"==(DO_FMA(h, 14); \ )==""\n"
257R"==(DO_FMA(h, 15); \ )==""\n"
258R"==(} while (0) )==""\n"
259R"==(#define ADD_BOFF(i) \ )==""\n"
260R"==(do { \ )==""\n"
261R"==(ci[0][i] -= ATTR_B0 * sumRowA[0]; \ )==""\n"
262R"==(ci[1][i] -= ATTR_B0 * sumRowA[1]; \ )==""\n"
263R"==(} while (0) )==""\n"
264R"==(#define ADD_BOFF_LOOP() \ )==""\n"
265R"==(do { \ )==""\n"
266R"==(ADD_BOFF(0); \ )==""\n"
267R"==(ADD_BOFF(1); \ )==""\n"
268R"==(ADD_BOFF(2); \ )==""\n"
269R"==(ADD_BOFF(3); \ )==""\n"
270R"==(ADD_BOFF(4); \ )==""\n"
271R"==(ADD_BOFF(5); \ )==""\n"
272R"==(ADD_BOFF(6); \ )==""\n"
273R"==(ADD_BOFF(7); \ )==""\n"
274R"==(ADD_BOFF(8); \ )==""\n"
275R"==(ADD_BOFF(9); \ )==""\n"
276R"==(ADD_BOFF(10); \ )==""\n"
277R"==(ADD_BOFF(11); \ )==""\n"
278R"==(ADD_BOFF(12); \ )==""\n"
279R"==(ADD_BOFF(13); \ )==""\n"
280R"==(ADD_BOFF(14); \ )==""\n"
281R"==(ADD_BOFF(15); \ )==""\n"
282R"==(} while (0) )==""\n"
283R"==(#define ADD_AOFF(h, i) \ )==""\n"
284R"==(do { \ )==""\n"
285R"==(ci[0][i] -= (ATTR_A0 * sub_group_broadcast(as_int(sumColB), i)) \ )==""\n"
286R"==(- (h * ATTR_A0 * ATTR_B0); \ )==""\n"
287R"==(ci[1][i] -= (ATTR_A0 * sub_group_broadcast(as_int(sumColB), i)) \ )==""\n"
288R"==(- (h * ATTR_A0 * ATTR_B0); \ )==""\n"
289R"==(} while (0) )==""\n"
290R"==(#define ADD_AOFF_LOOP(h) \ )==""\n"
291R"==(do { \ )==""\n"
292R"==(ADD_AOFF(h, 0); \ )==""\n"
293R"==(ADD_AOFF(h, 1); \ )==""\n"
294R"==(ADD_AOFF(h, 2); \ )==""\n"
295R"==(ADD_AOFF(h, 3); \ )==""\n"
296R"==(ADD_AOFF(h, 4); \ )==""\n"
297R"==(ADD_AOFF(h, 5); \ )==""\n"
298R"==(ADD_AOFF(h, 6); \ )==""\n"
299R"==(ADD_AOFF(h, 7); \ )==""\n"
300R"==(ADD_AOFF(h, 8); \ )==""\n"
301R"==(ADD_AOFF(h, 9); \ )==""\n"
302R"==(ADD_AOFF(h, 10); \ )==""\n"
303R"==(ADD_AOFF(h, 11); \ )==""\n"
304R"==(ADD_AOFF(h, 12); \ )==""\n"
305R"==(ADD_AOFF(h, 13); \ )==""\n"
306R"==(ADD_AOFF(h, 14); \ )==""\n"
307R"==(ADD_AOFF(h, 15); \ )==""\n"
308R"==(} while (0) )==""\n"
309R"==(#define UPDATE_C_COL(i, betaZero) \ )==""\n"
310R"==(do { \ )==""\n"
311R"==(if (jrem > i) { \ )==""\n"
312R"==(if (irem > 0) { \ )==""\n"
313R"==(if (c_offset_type == 0) { \ )==""\n"
314R"==(float val = ((betaZero) ? 0 : *c) + ci[0][i]; \ )==""\n"
315R"==(POST_OP(val); \ )==""\n"
316R"==(*c = convert_int_sat_rte(val + ((!apply_co) ? 0 : co[0])); \ )==""\n"
317R"==(} \ )==""\n"
318R"==(if (c_offset_type == 1) { \ )==""\n"
319R"==(float val = ((betaZero) ? 0 : *c) + ci[0][i]; \ )==""\n"
320R"==(POST_OP(val); \ )==""\n"
321R"==(*c = convert_int_sat_rte(val + ((!apply_co) ? 0 : co[0])); \ )==""\n"
322R"==(} \ )==""\n"
323R"==(if (c_offset_type == 2) { \ )==""\n"
324R"==(float val = ((betaZero) ? 0 : *c) + ci[0][i]; \ )==""\n"
325R"==(POST_OP(val); \ )==""\n"
326R"==(*c = convert_int_sat_rte(val + ((!apply_co) ? 0 : co[i])); \ )==""\n"
327R"==(} \ )==""\n"
328R"==(} \ )==""\n"
329R"==(if (irem > 16) { \ )==""\n"
330R"==(if (c_offset_type == 0) { \ )==""\n"
331R"==(float val = ((betaZero) ? 0 : *c2) + ci[1][i]; \ )==""\n"
332R"==(POST_OP(val); \ )==""\n"
333R"==(*c2 = convert_int_sat_rte( \ )==""\n"
334R"==(val + ((!apply_co) ? 0 : co[0])); \ )==""\n"
335R"==(} \ )==""\n"
336R"==(if (c_offset_type == 1) { \ )==""\n"
337R"==(float val = ((betaZero) ? 0 : *c2) + ci[1][i]; \ )==""\n"
338R"==(POST_OP(val); \ )==""\n"
339R"==(*c2 = convert_int_sat_rte( \ )==""\n"
340R"==(val + ((!apply_co) ? 0 : co[16])); \ )==""\n"
341R"==(} \ )==""\n"
342R"==(if (c_offset_type == 2) { \ )==""\n"
343R"==(float val = ((betaZero) ? 0 : *c2) + ci[1][i]; \ )==""\n"
344R"==(POST_OP(val); \ )==""\n"
345R"==(*c2 = convert_int_sat_rte( \ )==""\n"
346R"==(val + ((!apply_co) ? 0 : co[i])); \ )==""\n"
347R"==(} \ )==""\n"
348R"==(} \ )==""\n"
349R"==(} \ )==""\n"
350R"==(c = c + ldc; \ )==""\n"
351R"==(c2 = c2 + ldc; \ )==""\n"
352R"==(} while (0) )==""\n"
353R"==(#define UPDATE_C(betaZero) \ )==""\n"
354R"==(do { \ )==""\n"
355R"==(UPDATE_C_COL(0, betaZero); \ )==""\n"
356R"==(UPDATE_C_COL(1, betaZero); \ )==""\n"
357R"==(UPDATE_C_COL(2, betaZero); \ )==""\n"
358R"==(UPDATE_C_COL(3, betaZero); \ )==""\n"
359R"==(UPDATE_C_COL(4, betaZero); \ )==""\n"
360R"==(UPDATE_C_COL(5, betaZero); \ )==""\n"
361R"==(UPDATE_C_COL(6, betaZero); \ )==""\n"
362R"==(UPDATE_C_COL(7, betaZero); \ )==""\n"
363R"==(UPDATE_C_COL(8, betaZero); \ )==""\n"
364R"==(UPDATE_C_COL(9, betaZero); \ )==""\n"
365R"==(UPDATE_C_COL(10, betaZero); \ )==""\n"
366R"==(UPDATE_C_COL(11, betaZero); \ )==""\n"
367R"==(UPDATE_C_COL(12, betaZero); \ )==""\n"
368R"==(UPDATE_C_COL(13, betaZero); \ )==""\n"
369R"==(UPDATE_C_COL(14, betaZero); \ )==""\n"
370R"==(UPDATE_C_COL(15, betaZero); \ )==""\n"
371R"==(} while (0) )==""\n"
372R"==(#ifdef TN )==""\n"
373R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
374R"==(xe_lp_gemm_compute_x8x8s32(global A_TYPE *a, global B_TYPE *b, global int *c, )==""\n"
375R"==(int offsetA, int offsetB, int offsetC, int lda, int ldb, int ldc, int m, )==""\n"
376R"==(int n, int k, int beta, global int *ao, global int *bo, global int *co, )==""\n"
377R"==(int offsetCO, int apply_co, local A_TYPE *sa, local B_TYPE *sb, )==""\n"
378R"==(int apply_eltwise, float eltwise_alpha, float eltwise_beta, )==""\n"
379R"==(float eltwise_scale) { )==""\n"
380R"==(A_TYPE4 ai[2]; )==""\n"
381R"==(B_TYPE4 bi; )==""\n"
382R"==(int ci[2][16]; )==""\n"
383R"==(int sumRowA[2] = {0, 0}; )==""\n"
384R"==(int sumColB = 0; )==""\n"
385R"==(int idM = get_group_id(0); )==""\n"
386R"==(int idN = get_group_id(1); )==""\n"
387R"==(int idlM = get_local_id(0); )==""\n"
388R"==(int idlN = get_local_id(1); )==""\n"
389R"==(int lid = get_sub_group_local_id(); )==""\n"
390R"==(int lsm = get_enqueued_local_size(0); )==""\n"
391R"==(int lsn = 8; )==""\n"
392R"==(int i0 = (idM * lsm / 16) * 32 + (get_local_id(0) / 16) * 32; )==""\n"
393R"==(int j0 = idlN * 16 + (idN * lsn * 16); )==""\n"
394R"==(int irem = m - i0 - lid; )==""\n"
395R"==(int jrem = n - j0; )==""\n"
396R"==(int irem2 = m - i0; )==""\n"
397R"==(int jrem2 = n - j0; )==""\n"
398R"==(if (irem < 0) irem = 0; )==""\n"
399R"==(if (jrem < 0) jrem = 0; )==""\n"
400R"==(a += offsetA + (i0 * lda) + (lid * lda); )==""\n"
401R"==(b += offsetB + (j0 * ldb) + (lid * ldb); )==""\n"
402R"==(c += offsetC + (i0) + (j0 * ldc) + lid; )==""\n"
403R"==(int c_offset_type = 0; )==""\n"
404R"==(#ifdef FF )==""\n"
405R"==(co += offsetCO; )==""\n"
406R"==(c_offset_type = 0; )==""\n"
407R"==(#endif )==""\n"
408R"==(#ifdef RR )==""\n"
409R"==(co += offsetCO + i0 + lid; )==""\n"
410R"==(c_offset_type = 1; )==""\n"
411R"==(#endif )==""\n"
412R"==(#ifdef CC )==""\n"
413R"==(co += offsetCO + (j0); )==""\n"
414R"==(c_offset_type = 2; )==""\n"
415R"==(#endif )==""\n"
416R"==(global A_TYPE *a_ptrs[2] = {a, a + 16 * lda}; )==""\n"
417R"==(global B_TYPE *b_ptrs = {b}; )==""\n"
418R"==(for (int y = 0; y < 16; y++) { )==""\n"
419R"==(for (int z = 0; z < 2; z++) { )==""\n"
420R"==(ci[z][y] = 0; )==""\n"
421R"==(} )==""\n"
422R"==(} )==""\n"
423R"==(int k_align = k & ~3; )==""\n"
424R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
425R"==(if (irem2 >= 32 && jrem2 >= 16) { )==""\n"
426R"==(#endif )==""\n"
427R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
428R"==(for (int z = 0; z < 2; z++) { )==""\n"
429R"==(VLOAD4_A(z, (a_ptrs[z] + h)); )==""\n"
430R"==(#ifdef BOFFNONZERO )==""\n"
431R"==(ADD_ROW_A(z); )==""\n"
432R"==(#endif )==""\n"
433R"==(} )==""\n"
434R"==(VLOAD4_B((b_ptrs + h)); )==""\n"
435R"==(#ifdef AOFFNONZERO )==""\n"
436R"==(ADD_COL_B(); )==""\n"
437R"==(#endif )==""\n"
438R"==(FMA_I_LOOP(0); )==""\n"
439R"==(#ifdef BOFFNONZERO )==""\n"
440R"==(ADD_BOFF_LOOP(); )==""\n"
441R"==(#endif )==""\n"
442R"==(#ifdef AOFFNONZERO )==""\n"
443R"==(ADD_AOFF_LOOP(4); )==""\n"
444R"==(#endif )==""\n"
445R"==(} )==""\n"
446R"==(int krem = k & 3; )==""\n"
447R"==(if (krem > 0) { )==""\n"
448R"==(ai[0] = 0; )==""\n"
449R"==(ai[1] = 0; )==""\n"
450R"==(bi = 0; )==""\n"
451R"==(for (int z = 0; z < 2; z++) { )==""\n"
452R"==(LOADA_REM(z, (a_ptrs[z] + k_align)); )==""\n"
453R"==(#ifdef BOFFNONZERO )==""\n"
454R"==(ADD_ROW_A(z); )==""\n"
455R"==(#endif )==""\n"
456R"==(} )==""\n"
457R"==(LOADB_REM((b_ptrs + k_align)); )==""\n"
458R"==(#ifdef AOFFNONZERO )==""\n"
459R"==(ADD_COL_B(); )==""\n"
460R"==(#endif )==""\n"
461R"==(FMA_I_LOOP(0); )==""\n"
462R"==(#ifdef BOFFNONZERO )==""\n"
463R"==(ADD_BOFF_LOOP(); )==""\n"
464R"==(#endif )==""\n"
465R"==(#ifdef AOFFNONZERO )==""\n"
466R"==(ADD_AOFF_LOOP(krem); )==""\n"
467R"==(#endif )==""\n"
468R"==(} )==""\n"
469R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
470R"==(} else { )==""\n"
471R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
472R"==(for (int z = 0; z < 2; z++) { )==""\n"
473R"==(if (irem > z * 16) { )==""\n"
474R"==(VLOAD4_A(z, (a_ptrs[z] + h)); )==""\n"
475R"==(#ifdef BOFFNONZERO )==""\n"
476R"==(ADD_ROW_A(z); )==""\n"
477R"==(#endif )==""\n"
478R"==(} )==""\n"
479R"==(} )==""\n"
480R"==(if (jrem > lid) { )==""\n"
481R"==(VLOAD4_B((b_ptrs + h)); )==""\n"
482R"==(#ifdef AOFFNONZERO )==""\n"
483R"==(ADD_COL_B(); )==""\n"
484R"==(#endif )==""\n"
485R"==(} )==""\n"
486R"==(FMA_I_LOOP(0); )==""\n"
487R"==(#ifdef BOFFNONZERO )==""\n"
488R"==(ADD_BOFF_LOOP(); )==""\n"
489R"==(#endif )==""\n"
490R"==(#ifdef AOFFNONZERO )==""\n"
491R"==(ADD_AOFF_LOOP(4); )==""\n"
492R"==(#endif )==""\n"
493R"==(} )==""\n"
494R"==(int krem = k & 3; )==""\n"
495R"==(if (krem > 0) { )==""\n"
496R"==(ai[0] = 0; )==""\n"
497R"==(ai[1] = 0; )==""\n"
498R"==(bi = 0; )==""\n"
499R"==(for (int z = 0; z < 2; z++) { )==""\n"
500R"==(if (irem > z * 16) { )==""\n"
501R"==(LOADA_REM(z, (a_ptrs[z] + k_align)); )==""\n"
502R"==(#ifdef BOFFNONZERO )==""\n"
503R"==(ADD_ROW_A(z); )==""\n"
504R"==(#endif )==""\n"
505R"==(} )==""\n"
506R"==(} )==""\n"
507R"==(if (jrem > lid) { )==""\n"
508R"==(LOADB_REM((b_ptrs + k_align)); )==""\n"
509R"==(#ifdef AOFFNONZERO )==""\n"
510R"==(ADD_COL_B(); )==""\n"
511R"==(#endif )==""\n"
512R"==(} )==""\n"
513R"==(FMA_I_LOOP(0); )==""\n"
514R"==(#ifdef BOFFNONZERO )==""\n"
515R"==(ADD_BOFF_LOOP(); )==""\n"
516R"==(#endif )==""\n"
517R"==(#ifdef AOFFNONZERO )==""\n"
518R"==(ADD_AOFF_LOOP(krem); )==""\n"
519R"==(#endif )==""\n"
520R"==(} )==""\n"
521R"==(} )==""\n"
522R"==(#endif /* ALLOW_READ_OVERHEAD */ )==""\n"
523R"==(global int *c2 = c + 16; )==""\n"
524R"==(if (beta == 0) )==""\n"
525R"==(UPDATE_C(1); )==""\n"
526R"==(else )==""\n"
527R"==(UPDATE_C(0); )==""\n"
528R"==(} )==""\n"
529R"==(#endif )==""\n"
530R"==(#ifdef NN )==""\n"
531R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
532R"==(xe_lp_gemm_compute_x8x8s32(global A_TYPE *a, global B_TYPE *b, global int *c, )==""\n"
533R"==(int offsetA, int offsetB, int offsetC, int lda, int ldb, int ldc, int m, )==""\n"
534R"==(int n, int k, int beta, global int *ao, global int *bo, global int *co, )==""\n"
535R"==(int offsetCO, int apply_co, local A_TYPE *sa, local B_TYPE *sb, )==""\n"
536R"==(int apply_eltwise, float eltwise_alpha, float eltwise_beta, )==""\n"
537R"==(float eltwise_scale) { )==""\n"
538R"==(A_TYPE2 ai[4]; )==""\n"
539R"==(B_TYPE4 bi; )==""\n"
540R"==(int ci[2][16]; )==""\n"
541R"==(int sumRowA[2] = {0, 0}; )==""\n"
542R"==(int sumColB = 0; )==""\n"
543R"==(A_TYPE4 ait[2]; )==""\n"
544R"==(int idM = get_group_id(0); )==""\n"
545R"==(int idN = get_group_id(1); )==""\n"
546R"==(int idlM = get_local_id(0); )==""\n"
547R"==(int idlN = get_local_id(1); )==""\n"
548R"==(int lid = get_sub_group_local_id(); )==""\n"
549R"==(int lsm = get_enqueued_local_size(0); )==""\n"
550R"==(int lsn = 8; )==""\n"
551R"==(int i0 = (idM * lsm / 16) * 32 + (get_local_id(0) / 16) * 32; )==""\n"
552R"==(int j0 = idlN * 16 + (idN * lsn * 16); )==""\n"
553R"==(int irem = m - i0 - lid; )==""\n"
554R"==(int jrem = n - j0; )==""\n"
555R"==(int irem2 = m - i0; )==""\n"
556R"==(int jrem2 = n - j0; )==""\n"
557R"==(if (irem < 0) irem = 0; )==""\n"
558R"==(if (jrem < 0) jrem = 0; )==""\n"
559R"==(#ifdef ALIGNED )==""\n"
560R"==(a += offsetA + i0; )==""\n"
561R"==(#else )==""\n"
562R"==(a += offsetA + i0 + lid; )==""\n"
563R"==(#endif )==""\n"
564R"==(b += offsetB + (j0 * ldb) + (lid * ldb); )==""\n"
565R"==(c += offsetC + (i0) + (j0 * ldc) + lid; )==""\n"
566R"==(int c_offset_type = 0; )==""\n"
567R"==(#ifdef FF )==""\n"
568R"==(co += offsetCO; )==""\n"
569R"==(c_offset_type = 0; )==""\n"
570R"==(#endif )==""\n"
571R"==(#ifdef RR )==""\n"
572R"==(co += offsetCO + i0 + lid; )==""\n"
573R"==(c_offset_type = 1; )==""\n"
574R"==(#endif )==""\n"
575R"==(#ifdef CC )==""\n"
576R"==(co += offsetCO + (j0); )==""\n"
577R"==(c_offset_type = 2; )==""\n"
578R"==(#endif )==""\n"
579R"==(global A_TYPE *a_ptrs[4] = {a, a + 1 * lda, a + 2 * lda, a + 3 * lda}; )==""\n"
580R"==(global B_TYPE *b_ptrs = {b}; )==""\n"
581R"==(for (int y = 0; y < 16; y++) { )==""\n"
582R"==(for (int z = 0; z < 2; z++) { )==""\n"
583R"==(ci[z][y] = 0; )==""\n"
584R"==(} )==""\n"
585R"==(} )==""\n"
586R"==(int k_align = k & ~3; )==""\n"
587R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
588R"==(if (irem2 >= 32 && jrem2 >= 16) { )==""\n"
589R"==(#endif )==""\n"
590R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
591R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
592R"==(BLOCK_READ_A(h, hh); )==""\n"
593R"==(#ifdef BOFFNONZERO )==""\n"
594R"==(ADD_ROW_AT(); )==""\n"
595R"==(#endif )==""\n"
596R"==(} )==""\n"
597R"==(COPYA(); )==""\n"
598R"==(VLOAD4_B((b_ptrs + h)); )==""\n"
599R"==(#ifdef AOFFNONZERO )==""\n"
600R"==(ADD_COL_B(); )==""\n"
601R"==(#endif )==""\n"
602R"==(FMA_I_LOOP(0); )==""\n"
603R"==(#ifdef BOFFNONZERO )==""\n"
604R"==(ADD_BOFF_LOOP(); )==""\n"
605R"==(#endif )==""\n"
606R"==(#ifdef AOFFNONZERO )==""\n"
607R"==(ADD_AOFF_LOOP(4); )==""\n"
608R"==(#endif )==""\n"
609R"==(} )==""\n"
610R"==(int krem = k & 3; )==""\n"
611R"==(if (krem > 0) { )==""\n"
612R"==(ai[0] = 0; )==""\n"
613R"==(ai[1] = 0; )==""\n"
614R"==(ai[2] = 0; )==""\n"
615R"==(ai[3] = 0; )==""\n"
616R"==(bi = 0; )==""\n"
617R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
618R"==(BLOCK_READ_A(k_align, hh); )==""\n"
619R"==(#ifdef BOFFNONZERO )==""\n"
620R"==(ADD_ROW_AT(); )==""\n"
621R"==(#endif )==""\n"
622R"==(} )==""\n"
623R"==(COPYA(); )==""\n"
624R"==(LOADB_REM((b_ptrs + k_align)); )==""\n"
625R"==(#ifdef AOFFNONZERO )==""\n"
626R"==(ADD_COL_B(); )==""\n"
627R"==(#endif )==""\n"
628R"==(FMA_I_LOOP(0); )==""\n"
629R"==(#ifdef BOFFNONZERO )==""\n"
630R"==(ADD_BOFF_LOOP(); )==""\n"
631R"==(#endif )==""\n"
632R"==(#ifdef AOFFNONZERO )==""\n"
633R"==(ADD_AOFF_LOOP(krem); )==""\n"
634R"==(#endif )==""\n"
635R"==(} )==""\n"
636R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
637R"==(} else { )==""\n"
638R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
639R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
640R"==(if (irem2 > lid) { )==""\n"
641R"==(#ifdef ALIGNED )==""\n"
642R"==(ai[hh].s0 = *((a_ptrs[hh] + h * lda) + 0 + lid); )==""\n"
643R"==(ai[hh].s1 = *((a_ptrs[hh] + h * lda) + 16 + lid); )==""\n"
644R"==(#else )==""\n"
645R"==(ai[hh].s0 = *((a_ptrs[hh] + h * lda) + 0); )==""\n"
646R"==(ai[hh].s1 = *((a_ptrs[hh] + h * lda) + 16); )==""\n"
647R"==(#endif )==""\n"
648R"==(#ifdef BOFFNONZERO )==""\n"
649R"==(ADD_ROW_AT(); )==""\n"
650R"==(#endif )==""\n"
651R"==(} )==""\n"
652R"==(} )==""\n"
653R"==(COPYA(); )==""\n"
654R"==(if (jrem > lid) { )==""\n"
655R"==(VLOAD4_B((b_ptrs + h)); )==""\n"
656R"==(#ifdef AOFFNONZERO )==""\n"
657R"==(ADD_COL_B(); )==""\n"
658R"==(#endif )==""\n"
659R"==(} )==""\n"
660R"==(FMA_I_LOOP(0); )==""\n"
661R"==(#ifdef BOFFNONZERO )==""\n"
662R"==(ADD_BOFF_LOOP(); )==""\n"
663R"==(#endif )==""\n"
664R"==(#ifdef AOFFNONZERO )==""\n"
665R"==(ADD_AOFF_LOOP(4); )==""\n"
666R"==(#endif )==""\n"
667R"==(} )==""\n"
668R"==(int krem = k & 3; )==""\n"
669R"==(if (krem > 0) { )==""\n"
670R"==(ai[0] = 0; )==""\n"
671R"==(ai[1] = 0; )==""\n"
672R"==(ai[2] = 0; )==""\n"
673R"==(ai[3] = 0; )==""\n"
674R"==(bi = 0; )==""\n"
675R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
676R"==(if (irem2 > lid) { )==""\n"
677R"==(#ifdef ALIGNED )==""\n"
678R"==(ai[hh].s0 = *((a_ptrs[hh] + k_align * lda) + 0 + lid); )==""\n"
679R"==(ai[hh].s1 = *((a_ptrs[hh] + k_align * lda) + 16 + lid); )==""\n"
680R"==(#else )==""\n"
681R"==(ai[hh].s0 = *((a_ptrs[hh] + k_align * lda) + 0); )==""\n"
682R"==(ai[hh].s1 = *((a_ptrs[hh] + k_align * lda) + 16); )==""\n"
683R"==(#endif )==""\n"
684R"==(#ifdef BOFFNONZERO )==""\n"
685R"==(ADD_ROW_AT(); )==""\n"
686R"==(#endif )==""\n"
687R"==(} )==""\n"
688R"==(} )==""\n"
689R"==(COPYA(); )==""\n"
690R"==(if (jrem > lid) { )==""\n"
691R"==(LOADB_REM((b_ptrs + k_align)); )==""\n"
692R"==(#ifdef AOFFNONZERO )==""\n"
693R"==(ADD_COL_B(); )==""\n"
694R"==(#endif )==""\n"
695R"==(} )==""\n"
696R"==(FMA_I_LOOP(0); )==""\n"
697R"==(#ifdef BOFFNONZERO )==""\n"
698R"==(ADD_BOFF_LOOP(); )==""\n"
699R"==(#endif )==""\n"
700R"==(#ifdef AOFFNONZERO )==""\n"
701R"==(ADD_AOFF_LOOP(krem); )==""\n"
702R"==(#endif )==""\n"
703R"==(} )==""\n"
704R"==(} )==""\n"
705R"==(#endif /* ALLOW_READ_OVERHEAD */ )==""\n"
706R"==(global int *c2 = c + 16; )==""\n"
707R"==(if (beta == 0) )==""\n"
708R"==(UPDATE_C(1); )==""\n"
709R"==(else )==""\n"
710R"==(UPDATE_C(0); )==""\n"
711R"==(} )==""\n"
712R"==(#endif )==""\n"
713R"==(#ifdef NT )==""\n"
714R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
715R"==(xe_lp_gemm_compute_x8x8s32(global A_TYPE *a, global B_TYPE *b, global int *c, )==""\n"
716R"==(int offsetA, int offsetB, int offsetC, int lda, int ldb, int ldc, int m, )==""\n"
717R"==(int n, int k, int beta, global int *ao, global int *bo, global int *co, )==""\n"
718R"==(int offsetCO, int apply_co, local A_TYPE *sa, local B_TYPE *sb, )==""\n"
719R"==(int apply_eltwise, float eltwise_alpha, float eltwise_beta, )==""\n"
720R"==(float eltwise_scale) { )==""\n"
721R"==(A_TYPE2 ai[4]; )==""\n"
722R"==(B_TYPE bi[4]; )==""\n"
723R"==(int ci[2][16]; )==""\n"
724R"==(int sumRowA[2] = {0, 0}; )==""\n"
725R"==(int sumColB = 0; )==""\n"
726R"==(A_TYPE4 ait[2]; )==""\n"
727R"==(A_TYPE4 biit; )==""\n"
728R"==(int idM = get_group_id(0); )==""\n"
729R"==(int idN = get_group_id(1); )==""\n"
730R"==(int idlM = get_local_id(0); )==""\n"
731R"==(int idlN = get_local_id(1); )==""\n"
732R"==(int lid = get_sub_group_local_id(); )==""\n"
733R"==(int lsm = get_enqueued_local_size(0); )==""\n"
734R"==(int lsn = 8; )==""\n"
735R"==(int i0 = (idM * lsm / 16) * 32 + (get_local_id(0) / 16) * 32; )==""\n"
736R"==(int j0 = idlN * 16 + (idN * lsn * 16); )==""\n"
737R"==(int irem = m - i0 - lid; )==""\n"
738R"==(int jrem = n - j0; )==""\n"
739R"==(int irem2 = m - i0; )==""\n"
740R"==(int jrem2 = n - j0; )==""\n"
741R"==(if (irem < 0) irem = 0; )==""\n"
742R"==(if (jrem < 0) jrem = 0; )==""\n"
743R"==(#ifdef ALIGNED )==""\n"
744R"==(a += offsetA + i0; )==""\n"
745R"==(#else )==""\n"
746R"==(a += offsetA + i0 + lid; )==""\n"
747R"==(#endif )==""\n"
748R"==(#ifdef ALIGNED )==""\n"
749R"==(b += offsetB + j0; )==""\n"
750R"==(#else )==""\n"
751R"==(b += offsetB + j0 + lid; )==""\n"
752R"==(#endif )==""\n"
753R"==(c += offsetC + (i0) + (j0 * ldc) + lid; )==""\n"
754R"==(int c_offset_type = 0; )==""\n"
755R"==(#ifdef FF )==""\n"
756R"==(co += offsetCO; )==""\n"
757R"==(c_offset_type = 0; )==""\n"
758R"==(#endif )==""\n"
759R"==(#ifdef RR )==""\n"
760R"==(co += offsetCO + i0 + lid; )==""\n"
761R"==(c_offset_type = 1; )==""\n"
762R"==(#endif )==""\n"
763R"==(#ifdef CC )==""\n"
764R"==(co += offsetCO + (j0); )==""\n"
765R"==(c_offset_type = 2; )==""\n"
766R"==(#endif )==""\n"
767R"==(global A_TYPE *a_ptrs[4] = {a, a + 1 * lda, a + 2 * lda, a + 3 * lda}; )==""\n"
768R"==(global B_TYPE *b_ptrs[4] = {b, b + 1 * ldb, b + 2 * ldb, b + 3 * ldb}; )==""\n"
769R"==(for (int y = 0; y < 16; y++) { )==""\n"
770R"==(for (int z = 0; z < 2; z++) { )==""\n"
771R"==(ci[z][y] = 0; )==""\n"
772R"==(} )==""\n"
773R"==(} )==""\n"
774R"==(int insidea1 = 5; )==""\n"
775R"==(int insidea2 = 5; )==""\n"
776R"==(int insideb = 5; )==""\n"
777R"==(int k_align = k & ~3; )==""\n"
778R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
779R"==(if (irem2 >= 32 && jrem2 >= 16) { )==""\n"
780R"==(#endif )==""\n"
781R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
782R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
783R"==(BLOCK_READ_A(h, hh); )==""\n"
784R"==(#ifdef BOFFNONZERO )==""\n"
785R"==(ADD_ROW_AT(); )==""\n"
786R"==(#endif )==""\n"
787R"==(} )==""\n"
788R"==(COPYA(); )==""\n"
789R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
790R"==(BLOCK_READ_B(h, hh); )==""\n"
791R"==(#ifdef AOFFNONZERO )==""\n"
792R"==(ADD_COL_BT(); )==""\n"
793R"==(#endif )==""\n"
794R"==(} )==""\n"
795R"==(COPYB(); )==""\n"
796R"==(FMA_I_LOOP(0); )==""\n"
797R"==(#ifdef BOFFNONZERO )==""\n"
798R"==(ADD_BOFF_LOOP(); )==""\n"
799R"==(#endif )==""\n"
800R"==(#ifdef AOFFNONZERO )==""\n"
801R"==(ADD_AOFF_LOOP(4); )==""\n"
802R"==(#endif )==""\n"
803R"==(} )==""\n"
804R"==(int krem = k & 3; )==""\n"
805R"==(if (krem > 0) { )==""\n"
806R"==(ai[0] = 0; )==""\n"
807R"==(ai[1] = 0; )==""\n"
808R"==(ai[2] = 0; )==""\n"
809R"==(ai[3] = 0; )==""\n"
810R"==(bi[0] = 0; )==""\n"
811R"==(bi[1] = 0; )==""\n"
812R"==(bi[2] = 0; )==""\n"
813R"==(bi[3] = 0; )==""\n"
814R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
815R"==(BLOCK_READ_A(k_align, hh); )==""\n"
816R"==(#ifdef BOFFNONZERO )==""\n"
817R"==(ADD_ROW_AT(); )==""\n"
818R"==(#endif )==""\n"
819R"==(} )==""\n"
820R"==(COPYA(); )==""\n"
821R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
822R"==(BLOCK_READ_B(k_align, hh); )==""\n"
823R"==(#ifdef AOFFNONZERO )==""\n"
824R"==(ADD_COL_BT(); )==""\n"
825R"==(#endif )==""\n"
826R"==(} )==""\n"
827R"==(COPYB(); )==""\n"
828R"==(FMA_I_LOOP(0); )==""\n"
829R"==(#ifdef BOFFNONZERO )==""\n"
830R"==(ADD_BOFF_LOOP(); )==""\n"
831R"==(#endif )==""\n"
832R"==(#ifdef AOFFNONZERO )==""\n"
833R"==(ADD_AOFF_LOOP(krem); )==""\n"
834R"==(#endif )==""\n"
835R"==(} )==""\n"
836R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
837R"==(} else { )==""\n"
838R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
839R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
840R"==(if (irem2 > lid) { )==""\n"
841R"==(#ifdef ALIGNED )==""\n"
842R"==(ai[hh].s0 = *((a_ptrs[hh] + h * lda) + 0 + lid); )==""\n"
843R"==(ai[hh].s1 = *((a_ptrs[hh] + h * lda) + 16 + lid); )==""\n"
844R"==(#else )==""\n"
845R"==(ai[hh].s0 = *((a_ptrs[hh] + h * lda) + 0); )==""\n"
846R"==(ai[hh].s1 = *((a_ptrs[hh] + h * lda) + 16); )==""\n"
847R"==(#endif )==""\n"
848R"==(#ifdef BOFFNONZERO )==""\n"
849R"==(ADD_ROW_AT(); )==""\n"
850R"==(#endif )==""\n"
851R"==(} )==""\n"
852R"==(} )==""\n"
853R"==(COPYA(); )==""\n"
854R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
855R"==(if (jrem > lid) { )==""\n"
856R"==(#ifdef ALIGNED )==""\n"
857R"==(bi[hh] = *((b_ptrs[hh] + h * ldb) + lid); )==""\n"
858R"==(#else )==""\n"
859R"==(bi[hh] = *(b_ptrs[hh] + h * ldb); )==""\n"
860R"==(#endif )==""\n"
861R"==(#ifdef AOFFNONZERO )==""\n"
862R"==(ADD_COL_BT(); )==""\n"
863R"==(#endif )==""\n"
864R"==(} )==""\n"
865R"==(} )==""\n"
866R"==(COPYB(); )==""\n"
867R"==(FMA_I_LOOP(0); )==""\n"
868R"==(#ifdef BOFFNONZERO )==""\n"
869R"==(ADD_BOFF_LOOP(); )==""\n"
870R"==(#endif )==""\n"
871R"==(#ifdef AOFFNONZERO )==""\n"
872R"==(ADD_AOFF_LOOP(4); )==""\n"
873R"==(#endif )==""\n"
874R"==(} )==""\n"
875R"==(int krem = k & 3; )==""\n"
876R"==(if (krem > 0) { )==""\n"
877R"==(ai[0] = 0; )==""\n"
878R"==(ai[1] = 0; )==""\n"
879R"==(ai[2] = 0; )==""\n"
880R"==(ai[3] = 0; )==""\n"
881R"==(bi[0] = 0; )==""\n"
882R"==(bi[1] = 0; )==""\n"
883R"==(bi[2] = 0; )==""\n"
884R"==(bi[3] = 0; )==""\n"
885R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
886R"==(if (irem2 > lid) { )==""\n"
887R"==(#ifdef ALIGNED )==""\n"
888R"==(ai[hh].s0 = *((a_ptrs[hh] + k_align * lda) + 0 + lid); )==""\n"
889R"==(ai[hh].s1 = *((a_ptrs[hh] + k_align * lda) + 16 + lid); )==""\n"
890R"==(#else )==""\n"
891R"==(ai[hh].s0 = *((a_ptrs[hh] + k_align * lda) + 0); )==""\n"
892R"==(ai[hh].s1 = *((a_ptrs[hh] + k_align * lda) + 16); )==""\n"
893R"==(#endif )==""\n"
894R"==(#ifdef BOFFNONZERO )==""\n"
895R"==(ADD_ROW_AT(); )==""\n"
896R"==(#endif )==""\n"
897R"==(} )==""\n"
898R"==(} )==""\n"
899R"==(COPYA(); )==""\n"
900R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
901R"==(if (jrem > lid) { )==""\n"
902R"==(#ifdef ALIGNED )==""\n"
903R"==(bi[hh] = *((b_ptrs[hh] + k_align * ldb) + lid); )==""\n"
904R"==(#else )==""\n"
905R"==(bi[hh] = *(b_ptrs[hh] + k_align * ldb); )==""\n"
906R"==(#endif )==""\n"
907R"==(#ifdef AOFFNONZERO )==""\n"
908R"==(ADD_COL_BT(); )==""\n"
909R"==(#endif )==""\n"
910R"==(} )==""\n"
911R"==(} )==""\n"
912R"==(COPYB(); )==""\n"
913R"==(FMA_I_LOOP(0); )==""\n"
914R"==(#ifdef BOFFNONZERO )==""\n"
915R"==(ADD_BOFF_LOOP(); )==""\n"
916R"==(#endif )==""\n"
917R"==(#ifdef AOFFNONZERO )==""\n"
918R"==(ADD_AOFF_LOOP(krem); )==""\n"
919R"==(#endif )==""\n"
920R"==(} )==""\n"
921R"==(} )==""\n"
922R"==(#endif /* ALLOW_READ_OVERHEAD */ )==""\n"
923R"==(global int *c2 = c + 16; )==""\n"
924R"==(if (beta == 0) )==""\n"
925R"==(UPDATE_C(1); )==""\n"
926R"==(else )==""\n"
927R"==(UPDATE_C(0); )==""\n"
928R"==(} )==""\n"
929R"==(#endif )==""\n"
930R"==(#ifdef TT )==""\n"
931R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
932R"==(xe_lp_gemm_compute_x8x8s32(global A_TYPE *a, global B_TYPE *b, global int *c, )==""\n"
933R"==(int offsetA, int offsetB, int offsetC, int lda, int ldb, int ldc, int m, )==""\n"
934R"==(int n, int k, int beta, global int *ao, global int *bo, global int *co, )==""\n"
935R"==(int offsetCO, int apply_co, local A_TYPE *sa, local B_TYPE *sb, )==""\n"
936R"==(int apply_eltwise, float eltwise_alpha, float eltwise_beta, )==""\n"
937R"==(float eltwise_scale) { )==""\n"
938R"==(A_TYPE4 ai[2]; )==""\n"
939R"==(B_TYPE bi[4]; )==""\n"
940R"==(int ci[2][16]; )==""\n"
941R"==(int sumRowA[2] = {0, 0}; )==""\n"
942R"==(int sumColB = 0; )==""\n"
943R"==(A_TYPE4 biit; )==""\n"
944R"==(int idM = get_group_id(0); )==""\n"
945R"==(int idN = get_group_id(1); )==""\n"
946R"==(int idlM = get_local_id(0); )==""\n"
947R"==(int idlN = get_local_id(1); )==""\n"
948R"==(int lid = get_sub_group_local_id(); )==""\n"
949R"==(int lsm = get_enqueued_local_size(0); )==""\n"
950R"==(int lsn = 8; )==""\n"
951R"==(int i0 = (idM * lsm / 16) * 32 + (get_local_id(0) / 16) * 32; )==""\n"
952R"==(int j0 = idlN * 16 + (idN * lsn * 16); )==""\n"
953R"==(int irem = m - i0 - lid; )==""\n"
954R"==(int jrem = n - j0; )==""\n"
955R"==(if (irem < 0) irem = 0; )==""\n"
956R"==(if (jrem < 0) jrem = 0; )==""\n"
957R"==(int irem2 = m - i0; )==""\n"
958R"==(int jrem2 = n - j0; )==""\n"
959R"==(a += offsetA + (i0 * lda) + (lid * lda); )==""\n"
960R"==(#ifdef ALIGNED )==""\n"
961R"==(b += offsetB + j0; )==""\n"
962R"==(#else )==""\n"
963R"==(b += offsetB + j0 + lid; )==""\n"
964R"==(#endif )==""\n"
965R"==(c += offsetC + (i0) + (j0 * ldc) + lid; )==""\n"
966R"==(int c_offset_type = 0; )==""\n"
967R"==(#ifdef FF )==""\n"
968R"==(co += offsetCO; )==""\n"
969R"==(c_offset_type = 0; )==""\n"
970R"==(#endif )==""\n"
971R"==(#ifdef RR )==""\n"
972R"==(co += offsetCO + i0 + lid; )==""\n"
973R"==(c_offset_type = 1; )==""\n"
974R"==(#endif )==""\n"
975R"==(#ifdef CC )==""\n"
976R"==(co += offsetCO + (j0); )==""\n"
977R"==(c_offset_type = 2; )==""\n"
978R"==(#endif )==""\n"
979R"==(global A_TYPE *a_ptrs[2] = {a, a + 16 * lda}; )==""\n"
980R"==(global B_TYPE *b_ptrs[4] = {b, b + 1 * ldb, b + 2 * ldb, b + 3 * ldb}; )==""\n"
981R"==(for (int y = 0; y < 16; y++) { )==""\n"
982R"==(for (int z = 0; z < 2; z++) { )==""\n"
983R"==(ci[z][y] = 0; )==""\n"
984R"==(} )==""\n"
985R"==(} )==""\n"
986R"==(int k_align = k & ~3; )==""\n"
987R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
988R"==(if (irem2 >= 32 && jrem2 >= 16) { )==""\n"
989R"==(#endif )==""\n"
990R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
991R"==(for (int z = 0; z < 2; z++) { )==""\n"
992R"==(VLOAD4_A(z, ((a_ptrs[z]) + h)); )==""\n"
993R"==(#ifdef BOFFNONZERO )==""\n"
994R"==(ADD_ROW_A(z); )==""\n"
995R"==(#endif )==""\n"
996R"==(} )==""\n"
997R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
998R"==(BLOCK_READ_B(h, hh); )==""\n"
999R"==(#ifdef AOFFNONZERO )==""\n"
1000R"==(ADD_COL_BT(); )==""\n"
1001R"==(#endif )==""\n"
1002R"==(} )==""\n"
1003R"==(COPYB(); )==""\n"
1004R"==(FMA_I_LOOP(0); )==""\n"
1005R"==(#ifdef BOFFNONZERO )==""\n"
1006R"==(ADD_BOFF_LOOP(); )==""\n"
1007R"==(#endif )==""\n"
1008R"==(#ifdef AOFFNONZERO )==""\n"
1009R"==(ADD_AOFF_LOOP(4); )==""\n"
1010R"==(#endif )==""\n"
1011R"==(} )==""\n"
1012R"==(int krem = k & 3; )==""\n"
1013R"==(if (krem > 0) { )==""\n"
1014R"==(ai[0] = 0; )==""\n"
1015R"==(ai[1] = 0; )==""\n"
1016R"==(bi[0] = 0; )==""\n"
1017R"==(bi[1] = 0; )==""\n"
1018R"==(bi[2] = 0; )==""\n"
1019R"==(bi[3] = 0; )==""\n"
1020R"==(for (int z = 0; z < 2; z++) { )==""\n"
1021R"==(LOADA_REM(z, ((a_ptrs[z]) + k_align)); )==""\n"
1022R"==(#ifdef BOFFNONZERO )==""\n"
1023R"==(ADD_ROW_A(z); )==""\n"
1024R"==(#endif )==""\n"
1025R"==(} )==""\n"
1026R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
1027R"==(BLOCK_READ_B(k_align, hh); )==""\n"
1028R"==(#ifdef AOFFNONZERO )==""\n"
1029R"==(ADD_COL_BT(); )==""\n"
1030R"==(#endif )==""\n"
1031R"==(} )==""\n"
1032R"==(COPYB(); )==""\n"
1033R"==(FMA_I_LOOP(0); )==""\n"
1034R"==(#ifdef BOFFNONZERO )==""\n"
1035R"==(ADD_BOFF_LOOP(); )==""\n"
1036R"==(#endif )==""\n"
1037R"==(#ifdef AOFFNONZERO )==""\n"
1038R"==(ADD_AOFF_LOOP(krem); )==""\n"
1039R"==(#endif )==""\n"
1040R"==(} )==""\n"
1041R"==(#ifndef ALLOW_READ_OVERRUNS )==""\n"
1042R"==(} else { )==""\n"
1043R"==(for (int h = 0; h < k_align; h += 4) { )==""\n"
1044R"==(for (int z = 0; z < 2; z++) { )==""\n"
1045R"==(if (irem > z * 16) { )==""\n"
1046R"==(VLOAD4_A(z, (a_ptrs[z] + h)); )==""\n"
1047R"==(#ifdef BOFFNONZERO )==""\n"
1048R"==(ADD_ROW_A(z); )==""\n"
1049R"==(#endif )==""\n"
1050R"==(} )==""\n"
1051R"==(} )==""\n"
1052R"==(for (int hh = 0; hh < 4; hh++) { )==""\n"
1053R"==(if (jrem > lid) { )==""\n"
1054R"==(#ifdef ALIGNED )==""\n"
1055R"==(bi[hh] = *((b_ptrs[hh] + h * ldb) + lid); )==""\n"
1056R"==(#else )==""\n"
1057R"==(bi[hh] = *(b_ptrs[hh] + h * ldb); )==""\n"
1058R"==(#endif )==""\n"
1059R"==(#ifdef AOFFNONZERO )==""\n"
1060R"==(ADD_COL_BT(); )==""\n"
1061R"==(#endif )==""\n"
1062R"==(} )==""\n"
1063R"==(} )==""\n"
1064R"==(COPYB(); )==""\n"
1065R"==(FMA_I_LOOP(0); )==""\n"
1066R"==(#ifdef BOFFNONZERO )==""\n"
1067R"==(ADD_BOFF_LOOP(); )==""\n"
1068R"==(#endif )==""\n"
1069R"==(#ifdef AOFFNONZERO )==""\n"
1070R"==(ADD_AOFF_LOOP(4); )==""\n"
1071R"==(#endif )==""\n"
1072R"==(} )==""\n"
1073R"==(int krem = k & 3; )==""\n"
1074R"==(if (krem > 0) { )==""\n"
1075R"==(ai[0] = 0; )==""\n"
1076R"==(ai[1] = 0; )==""\n"
1077R"==(bi[0] = 0; )==""\n"
1078R"==(bi[1] = 0; )==""\n"
1079R"==(bi[2] = 0; )==""\n"
1080R"==(bi[3] = 0; )==""\n"
1081R"==(for (int z = 0; z < 2; z++) { )==""\n"
1082R"==(if (irem > z * 16) { )==""\n"
1083R"==(LOADA_REM(z, (a_ptrs[z] + k_align)); )==""\n"
1084R"==(#ifdef BOFFNONZERO )==""\n"
1085R"==(ADD_ROW_A(z); )==""\n"
1086R"==(#endif )==""\n"
1087R"==(} )==""\n"
1088R"==(} )==""\n"
1089R"==(for (int hh = 0; hh < krem; hh++) { )==""\n"
1090R"==(if (jrem > lid) { )==""\n"
1091R"==(#ifdef ALIGNED )==""\n"
1092R"==(bi[hh] = *((b_ptrs[hh] + k_align * ldb) + lid); )==""\n"
1093R"==(#else )==""\n"
1094R"==(bi[hh] = *(b_ptrs[hh] + k_align * ldb); )==""\n"
1095R"==(#endif )==""\n"
1096R"==(#ifdef AOFFNONZERO )==""\n"
1097R"==(ADD_COL_BT(); )==""\n"
1098R"==(#endif )==""\n"
1099R"==(} )==""\n"
1100R"==(} )==""\n"
1101R"==(COPYB(); )==""\n"
1102R"==(FMA_I_LOOP(0); )==""\n"
1103R"==(#ifdef BOFFNONZERO )==""\n"
1104R"==(ADD_BOFF_LOOP(); )==""\n"
1105R"==(#endif )==""\n"
1106R"==(#ifdef AOFFNONZERO )==""\n"
1107R"==(ADD_AOFF_LOOP(krem); )==""\n"
1108R"==(#endif )==""\n"
1109R"==(} )==""\n"
1110R"==(} )==""\n"
1111R"==(#endif /* ALLOW_READ_OVERHEAD */ )==""\n"
1112R"==(global int *c2 = c + 16; )==""\n"
1113R"==(if (beta == 0) )==""\n"
1114R"==(UPDATE_C(1); )==""\n"
1115R"==(else )==""\n"
1116R"==(UPDATE_C(0); )==""\n"
1117R"==(} )==""\n"
1118R"==(#endif )==""\n"
1119R"==()==";
1120}
1121}
1122}
1123}