xe_hpc_systolic_gemm_copy_kernel.cpp source code [oneDNN/build/src/gpu/ocl/xe_hpc_systolic_gemm_copy_kernel.cpp]

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
// OpenCL C source text for the `xe_hpc_systolic_gemm_copy` kernel, stored as
// one NUL-terminated string. Every kernel source line is its own raw string
// literal followed by an explicit "\n"; adjacent literals are concatenated by
// the C++ compiler, so comments placed between them (like the section
// comments below) are not part of the string.
//
// The kernel text is configured purely through preprocessor symbols that it
// reads but does not define:
//   ELEMENT_SIZE      1 or 2 (anything else hits #error).
//   COPY_A / COPY_B   which variant to build; at least one is required.
//   COPY_TRANS        selects the transposed-source variant.
//   COPY_SUM          additionally accumulate sums via atomic_add.
//   COPY_CLEAR_SUM    build the variant that only zeroes the sum area.
//   COPY_SIGNED       treat 1-byte elements as signed when summing.
//   UNROLL_N          used by the COPY_B variants; it is never defined in this
//                     text, so it presumably comes from the build options
//                     (TODO confirm against the code that compiles this string).
const char *xe_hpc_systolic_gemm_copy_kernel = R"==(/****************************************************************************** )==""\n"
R"==(* Copyright 2021-2022 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
R"==(* http: )==""\n"
R"==(* )==""\n"
R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
R"==(* See the License for the specific language governing permissions and )==""\n"
R"==(* limitations under the License. )==""\n"
R"==(*******************************************************************************/ )==""\n"
// ---- Element type aliases and block read/write helpers, keyed on ELEMENT_SIZE.
R"==(#if ELEMENT_SIZE == 2 )==""\n"
R"==(#pragma OPENCL EXTENSION cl_intel_subgroups_short : enable )==""\n"
R"==(#define ELEMENT ushort )==""\n"
R"==(#define ELEMENT2 ushort2 )==""\n"
R"==(#define ELEMENT4 ushort4 )==""\n"
R"==(#define ELEMENT_WORD ushort )==""\n"
R"==(#define ELEMENT_WORD4 ushort4 )==""\n"
R"==(#define ELEMENT_INT ushort2 )==""\n"
R"==(#define ELEMENT_INT4 ushort8 )==""\n"
R"==(#define VLOAD_ELEMENT_INT vload2 )==""\n"
R"==(#define ELEMENTS_PER_INT 2 )==""\n"
R"==(#define BLOCK_READ_ELEMENT2 intel_sub_group_block_read_us2 )==""\n"
R"==(#define BLOCK_READ_ELEMENT4 intel_sub_group_block_read_us4 )==""\n"
R"==(#define BLOCK_READ_ELEMENT_WORD intel_sub_group_block_read_us )==""\n"
R"==(#define MASKED_BLOCK_READ_ELEMENT_WORD masked_block_read_element )==""\n"
R"==(#define BLOCK_WRITE_ELEMENT_WORD4 intel_sub_group_block_write_us4 )==""\n"
R"==(#define BLOCK_WRITE_ELEMENT_INT4 intel_sub_group_block_write_us8 )==""\n"
R"==(#elif ELEMENT_SIZE == 1 )==""\n"
R"==(#define ELEMENT uchar )==""\n"
R"==(#define ELEMENT2 uchar2 )==""\n"
R"==(#define ELEMENT4 uchar4 )==""\n"
R"==(#define ELEMENT_WORD uchar2 )==""\n"
R"==(#define ELEMENT_WORD4 uchar8 )==""\n"
R"==(#define ELEMENT_INT uchar4 )==""\n"
R"==(#define ELEMENT_INT4 uchar16 )==""\n"
R"==(#define VLOAD_ELEMENT_INT vload4 )==""\n"
R"==(#define BLOCK_READ_ELEMENT2 intel_sub_group_block_read_uc2 )==""\n"
R"==(#define BLOCK_READ_ELEMENT4 intel_sub_group_block_read_uc4 )==""\n"
R"==(#define BLOCK_READ_ELEMENT_WORD intel_sub_group_block_read_uc2 )==""\n"
R"==(#define MASKED_BLOCK_READ_ELEMENT_WORD masked_block_read_element2 )==""\n"
R"==(#define BLOCK_WRITE_ELEMENT_WORD4 intel_sub_group_block_write_uc8 )==""\n"
R"==(#define BLOCK_WRITE_ELEMENT_INT2 intel_sub_group_block_write_uc8 )==""\n"
R"==(#define BLOCK_WRITE_ELEMENT_INT4 intel_sub_group_block_write_uc16 )==""\n"
R"==(#define ELEMENTS_PER_INT 4 )==""\n"
R"==(#define SUM_T int )==""\n"
R"==(#define SUM_T2 int2 )==""\n"
R"==(#define SUM_T4 int4 )==""\n"
R"==(#define CONVERT_SUM_T convert_int )==""\n"
R"==(#define CONVERT_SUM_T2 convert_int2 )==""\n"
R"==(#define CONVERT_SUM_T4 convert_int4 )==""\n"
R"==(#if COPY_SIGNED )==""\n"
R"==(#define AS_SIGNED_ELEMENT as_char )==""\n"
R"==(#define AS_SIGNED_ELEMENT4 as_char4 )==""\n"
R"==(#define AS_SIGNED_ELEMENT_WORD as_char2 )==""\n"
R"==(#define AS_SIGNED_ELEMENT_INT as_char4 )==""\n"
R"==(#define SIGNED_ELEMENT_WORD char2 )==""\n"
R"==(#define SIGNED_ELEMENT_INT char4 )==""\n"
R"==(#else )==""\n"
R"==(#define AS_SIGNED_ELEMENT as_uchar )==""\n"
R"==(#define AS_SIGNED_ELEMENT4 as_uchar4 )==""\n"
R"==(#define AS_SIGNED_ELEMENT_WORD as_uchar2 )==""\n"
R"==(#define AS_SIGNED_ELEMENT_INT as_uchar4 )==""\n"
R"==(#define SIGNED_ELEMENT_WORD uchar2 )==""\n"
R"==(#define SIGNED_ELEMENT_INT uchar4 )==""\n"
R"==(#endif )==""\n"
R"==(#else )==""\n"
R"==(#error Unsupported element size. )==""\n"
R"==(#endif )==""\n"
R"==(#if !COPY_A && !COPY_B )==""\n"
R"==(#error Source matrix not defined. )==""\n"
R"==(#endif )==""\n"
// ---- Masked reads: each lane reads p[lid + i * sg] only while that index is
// ---- below `rem`; out-of-range positions yield 0.
R"==(inline ELEMENT masked_block_read_element(global ELEMENT *p, int rem) { )==""\n"
R"==(ELEMENT v; )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(int sg = get_sub_group_size(); )==""\n"
R"==(v = (lid < rem) ? p[lid] : 0; )==""\n"
R"==(return v; )==""\n"
R"==(} )==""\n"
R"==(inline ELEMENT2 masked_block_read_element2(global ELEMENT *p, int rem) { )==""\n"
R"==(ELEMENT2 v; )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(int sg = get_sub_group_size(); )==""\n"
R"==(v.s0 = (lid < rem) ? p[lid] : 0; )==""\n"
R"==(v.s1 = (lid + sg < rem) ? p[lid + sg] : 0; )==""\n"
R"==(return v; )==""\n"
R"==(} )==""\n"
R"==(inline ELEMENT4 masked_block_read_element4(global ELEMENT *p, int rem) { )==""\n"
R"==(ELEMENT4 v; )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(int sg = get_sub_group_size(); )==""\n"
R"==(v.s0 = (lid < rem) ? p[lid] : 0; )==""\n"
R"==(v.s1 = (lid + sg < rem) ? p[lid + sg] : 0; )==""\n"
R"==(v.s2 = (lid + 2 * sg < rem) ? p[lid + 2 * sg] : 0; )==""\n"
R"==(v.s3 = (lid + 3 * sg < rem) ? p[lid + 3 * sg] : 0; )==""\n"
R"==(return v; )==""\n"
R"==(} )==""\n"
// ---- sum(): sub-group add-reduction over every component of an int/int2/int4.
R"==(__attribute__((overloadable)) inline int sum(int v) { )==""\n"
R"==(return sub_group_reduce_add(v); )==""\n"
R"==(} )==""\n"
R"==(__attribute__((overloadable)) inline int sum(int2 v) { )==""\n"
R"==(return sub_group_reduce_add(v.s0) + sub_group_reduce_add(v.s1); )==""\n"
R"==(} )==""\n"
R"==(__attribute__((overloadable)) inline int sum(int4 v) { )==""\n"
R"==(return sub_group_reduce_add(v.s0) + sub_group_reduce_add(v.s1) )==""\n"
R"==(+ sub_group_reduce_add(v.s2) + sub_group_reduce_add(v.s3); )==""\n"
R"==(} )==""\n"
// ---- dummy_dpas() is defined, but DUMMY_DPAS expands to a comment only, so
// ---- the `DUMMY_DPAS;` statements in the kernels below are empty statements.
R"==(void dummy_dpas() { )==""\n"
R"==(if (get_sub_group_local_id() >= 16) { )==""\n"
R"==(int __builtin_IB_sub_group_idpas_s8_s8_8_1(int, int, int8) )==""\n"
R"==(__attribute__((const)); )==""\n"
R"==(global volatile int *_; )==""\n"
R"==(int z = __builtin_IB_sub_group_idpas_s8_s8_8_1(0, _[0], 1); )==""\n"
R"==(for (int i = 0; i < z; i++) )==""\n"
R"==((void)_[0]; )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(#define DUMMY_DPAS /*dummy_dpas()*/ )==""\n"
// ---- PARTIAL_LOAD: load as many elements of regs[cc] as `crem` still allows,
// ---- and only in lanes with lid < rrem. Uses `lid` from the expansion site.
R"==(#if ELEMENT_SIZE == 2 )==""\n"
R"==(#define PARTIAL_LOAD(regs, rrem, crem, cc, p) \ )==""\n"
R"==(if ((2 * cc + 1) < crem) { \ )==""\n"
R"==(if (lid < rrem) regs[cc] = vload2(0, p); \ )==""\n"
R"==(} else if ((2 * cc) < crem) { \ )==""\n"
R"==(if (lid < rrem) regs[cc].s0 = *(p); \ )==""\n"
R"==(} )==""\n"
R"==(#elif ELEMENT_SIZE == 1 )==""\n"
R"==(#define PARTIAL_LOAD(regs, rrem, crem, cc, p) \ )==""\n"
R"==(if ((4 * cc + 3) < crem) { \ )==""\n"
R"==(if (lid < rrem) regs[cc] = vload4(0, p); \ )==""\n"
R"==(} else if ((4 * cc + 2) < crem) { \ )==""\n"
R"==(if (lid < rrem) regs[cc].s012 = vload3(0, p); \ )==""\n"
R"==(} else if ((4 * cc + 1) < crem) { \ )==""\n"
R"==(if (lid < rrem) regs[cc].s01 = vload2(0, p); \ )==""\n"
R"==(} else if (4 * cc < crem) { \ )==""\n"
R"==(if (lid < rrem) regs[cc].s0 = *(p); \ )==""\n"
R"==(} )==""\n"
R"==(#endif )==""\n"
// ======================= COPY_A variants =======================
// GET_A_SUM_ADDRESS declares `a_sum` as an int pointer into a_packed when
// COPY_SUM is set; otherwise it expands to nothing.
R"==(#if COPY_A )==""\n"
R"==(#define UNROLL_M 64 )==""\n"
R"==(#define UNROLL_K (32 / ELEMENT_SIZE) )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(#define GET_A_SUM_ADDRESS \ )==""\n"
R"==(global int *a_sum = (global int *)(a_packed + offseta_packed \ )==""\n"
R"==(+ (m0 + UNROLL_M) * lda_packed - UNROLL_M * sizeof(int)); )==""\n"
R"==(#else )==""\n"
R"==(#define GET_A_SUM_ADDRESS )==""\n"
R"==(#endif )==""\n"
// ---- COPY_A + COPY_CLEAR_SUM: block-write zeros to a_sum.
R"==(#if COPY_CLEAR_SUM )==""\n"
R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
R"==(xe_hpc_systolic_gemm_copy(long m, long k, global ELEMENT *a_packed, )==""\n"
R"==(int offseta_packed, int lda_packed) { )==""\n"
R"==(uint m0 = (sub_group_broadcast(get_global_id(0), 0) / 16) * UNROLL_M; )==""\n"
R"==(GET_A_SUM_ADDRESS; )==""\n"
R"==(uint4 zero = 0; )==""\n"
R"==(intel_sub_group_block_write4(a_sum, zero); )==""\n"
R"==(} )==""\n"
// ---- COPY_A, !COPY_TRANS: REPACK packs 2 (or 4) consecutive c[] entries into
// ---- each 32-bit component of blk_r[rr] before the block write.
R"==(#elif !COPY_TRANS )==""\n"
R"==(#if ELEMENT_SIZE == 2 )==""\n"
R"==(#define REPACK_REG(rr, cc) \ )==""\n"
R"==(blk_r[rr].s##cc = (((uint)c[2 * cc + 1].s##rr) << 16) | c[2 * cc].s##rr )==""\n"
R"==(#elif ELEMENT_SIZE == 1 )==""\n"
R"==(#define REPACK_REG(rr, cc) \ )==""\n"
R"==(blk_r[rr].s##cc = (((uint)c[4 * cc + 3].s##rr) << 24) \ )==""\n"
R"==(| (((uint)c[4 * cc + 2].s##rr) << 16) \ )==""\n"
R"==(| (((uint)c[4 * cc + 1].s##rr) << 8) | c[4 * cc].s##rr )==""\n"
R"==(#endif )==""\n"
R"==(#define REPACK_CC(cc) \ )==""\n"
R"==(REPACK_REG(0, cc); \ )==""\n"
R"==(REPACK_REG(1, cc); \ )==""\n"
R"==(REPACK_REG(2, cc); \ )==""\n"
R"==(REPACK_REG(3, cc) )==""\n"
R"==(#define REPACK \ )==""\n"
R"==(REPACK_CC(0); \ )==""\n"
R"==(REPACK_CC(1); \ )==""\n"
R"==(REPACK_CC(2); \ )==""\n"
R"==(REPACK_CC(3); \ )==""\n"
R"==(REPACK_CC(4); \ )==""\n"
R"==(REPACK_CC(5); \ )==""\n"
R"==(REPACK_CC(6); \ )==""\n"
R"==(REPACK_CC(7) )==""\n"
R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
R"==(xe_hpc_systolic_gemm_copy(long m, long k, global ELEMENT *a, long offseta, )==""\n"
R"==(long lda, global ELEMENT *a_packed, int offseta_packed, )==""\n"
R"==(int lda_packed) { )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(uint m0 = (sub_group_broadcast(get_global_id(0), 0) / 16) * UNROLL_M; )==""\n"
R"==(uint k0 = get_global_id(1) * UNROLL_K; )==""\n"
R"==(int mrem = m - m0; )==""\n"
R"==(int krem = k - k0; )==""\n"
R"==(bool aligned = ((as_long(a) | lda | offseta) & (ELEMENTS_PER_INT - 1)) == 0; )==""\n"
R"==(if (mrem <= 0 || krem <= 0) return; )==""\n"
R"==(GET_A_SUM_ADDRESS; )==""\n"
R"==(a += offseta + m0 + k0 * lda; )==""\n"
R"==(a_packed += offseta_packed + m0 * lda_packed + k0 * UNROLL_M; )==""\n"
R"==(ELEMENT4 c[UNROLL_K]; )==""\n"
R"==(if (mrem >= UNROLL_M && krem >= UNROLL_K && aligned) { )==""\n"
R"==(for (int h = 0; h < UNROLL_K; h++) )==""\n"
R"==(c[h] = BLOCK_READ_ELEMENT4(a + h * lda); )==""\n"
R"==(} else { )==""\n"
R"==(for (int h = 0; h < UNROLL_K; h++) )==""\n"
R"==(if (h < krem) )==""\n"
R"==(c[h] = masked_block_read_element4(a + h * lda, mrem); )==""\n"
R"==(else )==""\n"
R"==(c[h] = 0; )==""\n"
R"==(} )==""\n"
R"==(uint8 blk_r[UNROLL_M / 16]; )==""\n"
R"==(REPACK; )==""\n"
R"==(for (int rr = 0; rr < UNROLL_M / 16; rr++) )==""\n"
R"==(intel_sub_group_block_write8( )==""\n"
R"==((global uint *)(a_packed + rr * UNROLL_K * 16), blk_r[rr]); )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(SUM_T4 sum = 0; )==""\n"
R"==(for (int h = 0; h < UNROLL_K; h++) )==""\n"
R"==(sum += CONVERT_SUM_T4(AS_SIGNED_ELEMENT4(c[h])); )==""\n"
R"==(atomic_add(a_sum + lid, sum.s0); )==""\n"
R"==(atomic_add(a_sum + lid + 16, sum.s1); )==""\n"
R"==(atomic_add(a_sum + lid + 32, sum.s2); )==""\n"
R"==(atomic_add(a_sum + lid + 48, sum.s3); )==""\n"
R"==(#endif )==""\n"
R"==(DUMMY_DPAS; )==""\n"
R"==(} )==""\n"
// ---- COPY_A, COPY_TRANS: each lane vector-loads its own row of `a`, 16 rows
// ---- per loop iteration, and block-writes the rows as packed uints.
R"==(#else /* COPY_TRANS */ )==""\n"
R"==(__attribute__((intel_reqd_workgroup_walk_order(1, 0))) )==""\n"
R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
R"==(xe_hpc_systolic_gemm_copy(long m, long k, global ELEMENT *a, long offseta, )==""\n"
R"==(long lda, global ELEMENT *a_packed, int offseta_packed, )==""\n"
R"==(int lda_packed) { )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(uint m0 = (sub_group_broadcast(get_global_id(0), 0) / 16) * UNROLL_M; )==""\n"
R"==(uint k0 = get_global_id(1) * UNROLL_K; )==""\n"
R"==(int mrem = m - m0; )==""\n"
R"==(int krem = k - k0; )==""\n"
R"==(if (mrem <= 0 || krem <= 0) return; )==""\n"
R"==(GET_A_SUM_ADDRESS; )==""\n"
R"==(a += offseta + m0 * lda + k0; )==""\n"
R"==(a_packed += offseta_packed + m0 * lda_packed + k0 * UNROLL_M; )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(SUM_T sum[UNROLL_M / 16] = {0}; )==""\n"
R"==(#endif )==""\n"
R"==(for (int rr = 0; rr < UNROLL_M / 16; rr++, mrem -= 16) { )==""\n"
R"==(ELEMENT_INT regs[8]; )==""\n"
R"==(if (mrem >= UNROLL_M && krem >= UNROLL_K) { )==""\n"
R"==(for (int cc = 0; cc < UNROLL_K / ELEMENTS_PER_INT; cc++) )==""\n"
R"==(regs[cc] = VLOAD_ELEMENT_INT(0, )==""\n"
R"==(a + ((rr * 16) + lid) * lda + (cc * ELEMENTS_PER_INT)); )==""\n"
R"==(} else { )==""\n"
R"==(for (int cc = 0; cc < UNROLL_K / ELEMENTS_PER_INT; cc++) { )==""\n"
R"==(regs[cc] = 0; )==""\n"
R"==(PARTIAL_LOAD(regs, mrem, krem, cc, )==""\n"
R"==(a + ((rr * 16) + lid) * lda + (cc * ELEMENTS_PER_INT)); )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(uint8 blk_r; )==""\n"
R"==(blk_r.s0 = as_uint(regs[0]); )==""\n"
R"==(blk_r.s1 = as_uint(regs[1]); )==""\n"
R"==(blk_r.s2 = as_uint(regs[2]); )==""\n"
R"==(blk_r.s3 = as_uint(regs[3]); )==""\n"
R"==(blk_r.s4 = as_uint(regs[4]); )==""\n"
R"==(blk_r.s5 = as_uint(regs[5]); )==""\n"
R"==(blk_r.s6 = as_uint(regs[6]); )==""\n"
R"==(blk_r.s7 = as_uint(regs[7]); )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(for (int cc = 0; cc < UNROLL_K / ELEMENTS_PER_INT; cc++) { )==""\n"
R"==(sum[rr] += CONVERT_SUM_T(AS_SIGNED_ELEMENT(regs[cc].s0)); )==""\n"
R"==(sum[rr] += CONVERT_SUM_T(AS_SIGNED_ELEMENT(regs[cc].s1)); )==""\n"
R"==(sum[rr] += CONVERT_SUM_T(AS_SIGNED_ELEMENT(regs[cc].s2)); )==""\n"
R"==(sum[rr] += CONVERT_SUM_T(AS_SIGNED_ELEMENT(regs[cc].s3)); )==""\n"
R"==(} )==""\n"
R"==(#endif )==""\n"
R"==(intel_sub_group_block_write8( )==""\n"
R"==((global uint *)(a_packed + rr * UNROLL_K * 16), blk_r); )==""\n"
R"==(} )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(atomic_add(a_sum + lid, sum[0]); )==""\n"
R"==(atomic_add(a_sum + lid + 16, sum[1]); )==""\n"
R"==(atomic_add(a_sum + lid + 32, sum[2]); )==""\n"
R"==(atomic_add(a_sum + lid + 48, sum[3]); )==""\n"
R"==(#endif )==""\n"
R"==(DUMMY_DPAS; )==""\n"
R"==(} )==""\n"
R"==(#endif /* !COPY_TRANS */ )==""\n"
R"==(#endif /* COPY_A */ )==""\n"
// ======================= COPY_B variants =======================
// The REPACK_CC* macros gather four entries of cols[] (and cols2[] for
// REPACK_CC2) into one wider colgroups[cc] vector.
R"==(#if COPY_B )==""\n"
R"==(#define UNROLL_K (32 / ELEMENT_SIZE) )==""\n"
R"==(#if ELEMENT_SIZE == 2 )==""\n"
R"==(#define REPACK_CC_WORD(cc) \ )==""\n"
R"==(do { \ )==""\n"
R"==(colgroups[cc].s0 = cols[cc * 4]; \ )==""\n"
R"==(colgroups[cc].s1 = cols[cc * 4 + 1]; \ )==""\n"
R"==(colgroups[cc].s2 = cols[cc * 4 + 2]; \ )==""\n"
R"==(colgroups[cc].s3 = cols[cc * 4 + 3]; \ )==""\n"
R"==(} while (false) )==""\n"
R"==(#define REPACK_CC(cc) \ )==""\n"
R"==(do { \ )==""\n"
R"==(colgroups[cc].s01 = cols[cc * 4]; \ )==""\n"
R"==(colgroups[cc].s23 = cols[cc * 4 + 1]; \ )==""\n"
R"==(colgroups[cc].s45 = cols[cc * 4 + 2]; \ )==""\n"
R"==(colgroups[cc].s67 = cols[cc * 4 + 3]; \ )==""\n"
R"==(} while (false) )==""\n"
R"==(#elif ELEMENT_SIZE == 1 )==""\n"
R"==(#define REPACK_CC_WORD(cc) \ )==""\n"
R"==(do { \ )==""\n"
R"==(colgroups[cc].s01 = cols[cc * 4]; \ )==""\n"
R"==(colgroups[cc].s23 = cols[cc * 4 + 1]; \ )==""\n"
R"==(colgroups[cc].s45 = cols[cc * 4 + 2]; \ )==""\n"
R"==(colgroups[cc].s67 = cols[cc * 4 + 3]; \ )==""\n"
R"==(} while (false) )==""\n"
R"==(#define REPACK_CC(cc) \ )==""\n"
R"==(do { \ )==""\n"
R"==(colgroups[cc].s0123 = cols[cc * 4]; \ )==""\n"
R"==(colgroups[cc].s4567 = cols[cc * 4 + 1]; \ )==""\n"
R"==(colgroups[cc].s89ab = cols[cc * 4 + 2]; \ )==""\n"
R"==(colgroups[cc].scdef = cols[cc * 4 + 3]; \ )==""\n"
R"==(} while (false) )==""\n"
R"==(#define REPACK_CC2(cc) \ )==""\n"
R"==(do { \ )==""\n"
R"==(colgroups[cc].s0246 = cols[cc * 2]; \ )==""\n"
R"==(colgroups[cc].s1357 = cols2[cc * 2]; \ )==""\n"
R"==(colgroups[cc].s8ace = cols[cc * 2 + 1]; \ )==""\n"
R"==(colgroups[cc].s9bdf = cols2[cc * 2 + 1]; \ )==""\n"
R"==(} while (false) )==""\n"
R"==(#endif )==""\n"
// GET_B_SUM_ADDRESS mirrors GET_A_SUM_ADDRESS: it declares `b_sum` when
// COPY_SUM is set and is empty otherwise.
R"==(#if COPY_SUM )==""\n"
R"==(#define GET_B_SUM_ADDRESS \ )==""\n"
R"==(global int *b_sum = (global int *)(b_packed + offsetb_packed \ )==""\n"
R"==(+ (n0 + UNROLL_N) * ldb_packed - UNROLL_N * sizeof(int)); )==""\n"
R"==(#else )==""\n"
R"==(#define GET_B_SUM_ADDRESS )==""\n"
R"==(#endif )==""\n"
// ---- COPY_B + COPY_CLEAR_SUM: block-write zeros to b_sum (one extra write
// ---- at b_sum + 32 when UNROLL_N > 32).
R"==(#if COPY_CLEAR_SUM )==""\n"
R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
R"==(xe_hpc_systolic_gemm_copy(long k, long n, global ELEMENT *b_packed, )==""\n"
R"==(int offsetb_packed, int ldb_packed) { )==""\n"
R"==(uint n0 = (sub_group_broadcast(get_global_id(0), 0) / 16) * UNROLL_N; )==""\n"
R"==(GET_B_SUM_ADDRESS; )==""\n"
R"==(uint2 zero = 0; )==""\n"
R"==(intel_sub_group_block_write2(b_sum, zero); )==""\n"
R"==(#if UNROLL_N > 32 )==""\n"
R"==(intel_sub_group_block_write(b_sum + 32, zero.s0); )==""\n"
R"==(#endif )==""\n"
R"==(} )==""\n"
// ---- COPY_B, !COPY_TRANS: processes UNROLL_N columns in two chunks of
// ---- UNROLL_N / 2, using block reads when full and aligned, masked reads
// ---- otherwise.
R"==(#elif !COPY_TRANS )==""\n"
R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
R"==(xe_hpc_systolic_gemm_copy(long k, long n, global ELEMENT *b, long offsetb, )==""\n"
R"==(long ldb, global ELEMENT *b_packed, int offsetb_packed, )==""\n"
R"==(int ldb_packed) { )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(uint k0 = (sub_group_broadcast(get_global_id(0), 0) / 16) * UNROLL_K; )==""\n"
R"==(uint n0 = get_global_id(1) * UNROLL_N; )==""\n"
R"==(int krem = k - k0; )==""\n"
R"==(int nrem = n - n0; )==""\n"
R"==(bool aligned = ((as_long(b) | ldb | offsetb) & (ELEMENTS_PER_INT - 1)) == 0; )==""\n"
R"==(if (nrem <= 0 || krem <= 0) return; )==""\n"
R"==(GET_B_SUM_ADDRESS; )==""\n"
R"==(b += offsetb + k0 + n0 * ldb; )==""\n"
R"==(b_packed += offsetb_packed + n0 * ldb_packed + k0 * UNROLL_N; )==""\n"
R"==(#define UNROLL_N_CHUNK (UNROLL_N / 2) )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(SUM_T sums[UNROLL_N]; )==""\n"
R"==(#endif )==""\n"
R"==(ELEMENT_WORD cols[UNROLL_N / 2]; )==""\n"
R"==(for (int c0 = 0; c0 < UNROLL_N; )==""\n"
R"==(c0 += UNROLL_N_CHUNK, nrem -= UNROLL_N_CHUNK) { )==""\n"
R"==(if (krem >= UNROLL_K && nrem >= UNROLL_N_CHUNK && aligned) { )==""\n"
R"==(for (int c = 0; c < UNROLL_N_CHUNK; c++) )==""\n"
R"==(cols[c] = BLOCK_READ_ELEMENT_WORD(b + (c + c0) * ldb); )==""\n"
R"==(} else { )==""\n"
R"==(for (int c = 0; c < UNROLL_N_CHUNK; c++) )==""\n"
R"==(if (c < nrem) )==""\n"
R"==(cols[c] = MASKED_BLOCK_READ_ELEMENT_WORD( )==""\n"
R"==(b + (c + c0) * ldb, krem); )==""\n"
R"==(else )==""\n"
R"==(cols[c] = 0; )==""\n"
R"==(} )==""\n"
R"==(ELEMENT_WORD4 colgroups[UNROLL_N_CHUNK / 4]; )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N_CHUNK / 4; cc++) )==""\n"
R"==(REPACK_CC_WORD(cc); )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N_CHUNK / 4; cc++) )==""\n"
R"==(BLOCK_WRITE_ELEMENT_WORD4( )==""\n"
R"==(b_packed + (cc * 4 + c0) * UNROLL_K, colgroups[cc]); )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(for (int c = 0; c < UNROLL_N_CHUNK; c++) )==""\n"
R"==(sums[c + c0] = sum(CONVERT_SUM_T2(AS_SIGNED_ELEMENT_WORD(cols[c]))); )==""\n"
R"==(#endif )==""\n"
R"==(} )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(for (int c0 = 0; c0 < UNROLL_N; c0 += get_sub_group_size()) )==""\n"
R"==(atomic_add(b_sum + c0 + lid, sums[c0 + lid]); )==""\n"
R"==(#endif )==""\n"
R"==(DUMMY_DPAS; )==""\n"
R"==(} )==""\n"
// ---- COPY_B, COPY_TRANS: each lane vector-loads one row of `b`; for 1-byte
// ---- elements a second set of rows (lid + sg) is loaded into cols2 and
// ---- interleaved with cols by REPACK_CC2.
R"==(#else /* COPY_TRANS */ )==""\n"
R"==(#define ADD_SUM(coln) \ )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / 4; cc++) { \ )==""\n"
R"==(sums[4 * cc + 0] \ )==""\n"
R"==(+= sum(CONVERT_SUM_T(AS_SIGNED_ELEMENT(coln[cc].s0))); \ )==""\n"
R"==(sums[4 * cc + 1] \ )==""\n"
R"==(+= sum(CONVERT_SUM_T(AS_SIGNED_ELEMENT(coln[cc].s1))); \ )==""\n"
R"==(sums[4 * cc + 2] \ )==""\n"
R"==(+= sum(CONVERT_SUM_T(AS_SIGNED_ELEMENT(coln[cc].s2))); \ )==""\n"
R"==(sums[4 * cc + 3] \ )==""\n"
R"==(+= sum(CONVERT_SUM_T(AS_SIGNED_ELEMENT(coln[cc].s3))); \ )==""\n"
R"==(} )==""\n"
R"==(__attribute__((intel_reqd_workgroup_walk_order(1, 0))) )==""\n"
R"==(__attribute__((intel_reqd_sub_group_size(16))) kernel void )==""\n"
R"==(xe_hpc_systolic_gemm_copy(long k, long n, global ELEMENT *b, long offsetb, )==""\n"
R"==(long ldb, global ELEMENT *b_packed, int offsetb_packed, )==""\n"
R"==(int ldb_packed) { )==""\n"
R"==(int lid = get_sub_group_local_id(); )==""\n"
R"==(uint k0 = (sub_group_broadcast(get_global_id(0), 0) / 16) * UNROLL_K; )==""\n"
R"==(uint n0 = get_global_id(1) * UNROLL_N; )==""\n"
R"==(int krem = k - k0; )==""\n"
R"==(int nrem = n - n0; )==""\n"
R"==(int sg = get_sub_group_size(); )==""\n"
R"==(if (nrem <= 0 || krem <= 0) return; )==""\n"
R"==(GET_B_SUM_ADDRESS; )==""\n"
R"==(b += offsetb + n0 + k0 * ldb; )==""\n"
R"==(b_packed += offsetb_packed + n0 * ldb_packed + k0 * UNROLL_N; )==""\n"
R"==(ELEMENT_INT cols[UNROLL_N / ELEMENTS_PER_INT]; )==""\n"
R"==(ELEMENT_INT4 colgroups[UNROLL_N / 8]; )==""\n"
R"==(if (krem >= sg && nrem >= UNROLL_N) { )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / ELEMENTS_PER_INT; cc++) )==""\n"
R"==(cols[cc] = VLOAD_ELEMENT_INT( )==""\n"
R"==(0, b + cc * ELEMENTS_PER_INT + lid * ldb); )==""\n"
R"==(} else { )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / ELEMENTS_PER_INT; cc++) { )==""\n"
R"==(cols[cc] = 0; )==""\n"
R"==(PARTIAL_LOAD(cols, krem, nrem, cc, )==""\n"
R"==(b + cc * ELEMENTS_PER_INT + lid * ldb); )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(#if ELEMENT_SIZE == 2 )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / 8; cc++) )==""\n"
R"==(REPACK_CC(cc); )==""\n"
R"==(#else )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(SUM_T sums[UNROLL_N] = {0}; )==""\n"
R"==(ADD_SUM(cols); )==""\n"
R"==(#endif )==""\n"
R"==(ELEMENT_INT cols2[UNROLL_N / ELEMENTS_PER_INT]; )==""\n"
R"==(krem -= sg; )==""\n"
R"==(if (krem >= sg && nrem >= UNROLL_N) { )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / ELEMENTS_PER_INT; cc++) )==""\n"
R"==(cols2[cc] = VLOAD_ELEMENT_INT( )==""\n"
R"==(0, b + cc * ELEMENTS_PER_INT + (lid + sg) * ldb); )==""\n"
R"==(} else { )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / ELEMENTS_PER_INT; cc++) { )==""\n"
R"==(cols2[cc] = 0; )==""\n"
R"==(PARTIAL_LOAD(cols2, krem, nrem, cc, )==""\n"
R"==(b + cc * ELEMENTS_PER_INT + (lid + sg) * ldb); )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / 8; cc++) )==""\n"
R"==(REPACK_CC2(cc); )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(ADD_SUM(cols2); )==""\n"
R"==(#endif )==""\n"
R"==(#endif )==""\n"
R"==(for (int cc = 0; cc < UNROLL_N / 8; cc++) )==""\n"
R"==(BLOCK_WRITE_ELEMENT_INT4(b_packed + cc * 8 * UNROLL_K, colgroups[cc]); )==""\n"
R"==(#if COPY_SUM )==""\n"
R"==(for (int c0 = 0; c0 < UNROLL_N; c0 += get_sub_group_size()) )==""\n"
R"==(atomic_add(b_sum + c0 + lid, sums[c0 + lid]); )==""\n"
R"==(#endif )==""\n"
R"==(DUMMY_DPAS; )==""\n"
R"==(} )==""\n"
R"==(#endif /* !COPY_TRANS */ )==""\n"
R"==(#endif /* COPY_B */ )==""\n"
R"==()==";
} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl

Browse the source code of oneDNN/build/src/gpu/ocl/xe_hpc_systolic_gemm_copy_kernel.cpp