gen9_gemm_nocopy_x8x8s32_kernel.cpp source code [oneDNN/build/src/gpu/ocl/gen9_gemm_nocopy_x8x8s32_kernel.cpp]

namespace dnnl {
namespace impl {
namespace gpu {
namespace ocl {
// OpenCL C source text of the `gen9_gemm_compute_x8x8s32` kernel, stored as a
// sequence of adjacent raw string literals (one per kernel source line, each
// followed by an explicit "\n") that the compiler concatenates into a single
// NUL-terminated string.
//
// What the embedded kernel text does, as far as the text itself shows:
//  - S8S8 / U8S8 / S8U8 / U8U8 select char or uchar for the A (FLOATA*) and
//    B (FLOATB*) element types and the matching SHUFFLE cast; the C type
//    (FLOATC) is always int.
//  - POST_OP applies fwd_eltwise() only when WITH_ELTWISE == 1 and the
//    `apply_eltwise` argument is non-zero; otherwise it expands to nothing.
//  - COMPUTE_C combines the previous C value (dropped when `beta` is zero)
//    with the new sum, runs POST_OP, optionally adds co[CO_IDX] when
//    `apply_co` is set, and stores the result via convert_int_sat_rte().
//  - ADD_EACH has three variants: with FF every row reads co[0]; with CC every
//    row reads co[0] and `co` advances once per column; otherwise rows read
//    co[0..3].
//  - The kernel first copies tiles of A and B into the local buffers `sa` /
//    `sb` (zero-filling past m, n and k) while building the per-row / per-column
//    correction terms `xa` / `xb` from ATTR_A0 / ATTR_B0, synchronises with a
//    local-memory barrier, accumulates products into `sc`, and finally writes
//    the results out through ADD_SCALE.
//
// NOTE(review): presumably this text is handed to the OpenCL runtime compiler
// together with the UNROLL_M / UNROLL_N / UNROLL_K / ATTR_A0 / ATTR_B0 and
// layout (NN, NT, TN, FF, CC, RR, ...) definitions, none of which are defined
// here - confirm against the host-side code.
const char *gen9_gemm_nocopy_x8x8s32_kernel = R"==(/****************************************************************************** )==""\n"
R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
R"==(* )==""\n"
R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
R"==(* you may not use this file except in compliance with the License. )==""\n"
R"==(* You may obtain a copy of the License at )==""\n"
R"==(* )==""\n"
R"==(* http: )==""\n"
R"==(* )==""\n"
R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
R"==(* See the License for the specific language governing permissions and )==""\n"
R"==(* limitations under the License. )==""\n"
R"==(*******************************************************************************/ )==""\n"
R"==(#include "gpu/ocl/gemm/ocl_gemm_attrs.h" )==""\n"
R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
R"==(#undef GRX )==""\n"
R"==(#define GRX 8 )==""\n"
// Element types of A and B for each signed/unsigned 8-bit combination.
R"==(#if defined(S8S8) )==""\n"
R"==(#define FLOATA char )==""\n"
R"==(#define FLOATA2 char2 )==""\n"
R"==(#define FLOATA4 char4 )==""\n"
R"==(#define FLOATB char )==""\n"
R"==(#define FLOATB4 char4 )==""\n"
R"==(#define SHUFFLE(X, Y) as_char4(intel_sub_group_shuffle(as_int(X), Y)) )==""\n"
R"==(#endif )==""\n"
R"==(#if defined(U8S8) )==""\n"
R"==(#define FLOATA uchar )==""\n"
R"==(#define FLOATA2 uchar2 )==""\n"
R"==(#define FLOATA4 uchar4 )==""\n"
R"==(#define FLOATB char )==""\n"
R"==(#define FLOATB4 char4 )==""\n"
R"==(#define SHUFFLE(X, Y) as_char4(intel_sub_group_shuffle(as_int(X), Y)) )==""\n"
R"==(#endif )==""\n"
R"==(#if defined(S8U8) )==""\n"
R"==(#define FLOATA char )==""\n"
R"==(#define FLOATA2 char2 )==""\n"
R"==(#define FLOATA4 char4 )==""\n"
R"==(#define FLOATB uchar )==""\n"
R"==(#define FLOATB4 uchar4 )==""\n"
R"==(#define SHUFFLE(X, Y) as_uchar4(intel_sub_group_shuffle(as_int(X), Y)) )==""\n"
R"==(#endif )==""\n"
R"==(#if defined(U8U8) )==""\n"
R"==(#define FLOATA uchar )==""\n"
R"==(#define FLOATA2 uchar2 )==""\n"
R"==(#define FLOATA4 uchar4 )==""\n"
R"==(#define FLOATB uchar )==""\n"
R"==(#define FLOATB4 uchar4 )==""\n"
R"==(#define SHUFFLE(X, Y) as_uchar4(intel_sub_group_shuffle(as_int(X), Y)) )==""\n"
R"==(#endif )==""\n"
R"==(#define FLOATC int )==""\n"
R"==(#define FLOATC4 int4 )==""\n"
// Optional element-wise post-op and the C update helper.
R"==(#if WITH_ELTWISE == 1 )==""\n"
R"==(#define POST_OP(val) \ )==""\n"
R"==(do { \ )==""\n"
R"==(if (apply_eltwise) \ )==""\n"
R"==(val = fwd_eltwise( \ )==""\n"
R"==(val, eltwise_alpha, eltwise_beta, eltwise_scale); \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#else )==""\n"
R"==(#define POST_OP(val) )==""\n"
R"==(#endif )==""\n"
R"==(#define COMPUTE_C(X, Y, CO_IDX) \ )==""\n"
R"==(do { \ )==""\n"
R"==(float val = (!beta ? 0 : (X)) + (Y); \ )==""\n"
R"==(POST_OP(val); \ )==""\n"
R"==((X) = convert_int_sat_rte(val + (apply_co ? co[CO_IDX] : 0)); \ )==""\n"
R"==(} while (0) )==""\n"
// ADD_EACH: write one column of up to four rows; the variant decides how the
// `co` offset is indexed and advanced.
R"==(#ifdef FF )==""\n"
R"==(#define ADD_EACH(X, OFF) \ )==""\n"
R"==(do { \ )==""\n"
R"==(if (n > X + OFF) { \ )==""\n"
R"==(if (m > 0) \ )==""\n"
R"==(COMPUTE_C(c[0], sc[X / 4 + 0].s##OFF + xa[0] + xb[0], 0); \ )==""\n"
R"==(if (m > 1) \ )==""\n"
R"==(COMPUTE_C(c[1], sc[X / 4 + 4].s##OFF + xa[1] + xb[0], 0); \ )==""\n"
R"==(if (m > 2) \ )==""\n"
R"==(COMPUTE_C(c[2], sc[X / 4 + 8].s##OFF + xa[2] + xb[0], 0); \ )==""\n"
R"==(if (m > 3) \ )==""\n"
R"==(COMPUTE_C(c[3], sc[X / 4 + 12].s##OFF + xa[3] + xb[0], 0); \ )==""\n"
R"==(xb++; \ )==""\n"
R"==(c += ldc; \ )==""\n"
R"==(} \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#elif defined CC )==""\n"
R"==(#define ADD_EACH(X, OFF) \ )==""\n"
R"==(do { \ )==""\n"
R"==(if (n > X + OFF) { \ )==""\n"
R"==(if (m > 0) \ )==""\n"
R"==(COMPUTE_C(c[0], sc[X / 4 + 0].s##OFF + xa[0] + xb[0], 0); \ )==""\n"
R"==(if (m > 1) \ )==""\n"
R"==(COMPUTE_C(c[1], sc[X / 4 + 4].s##OFF + xa[1] + xb[0], 0); \ )==""\n"
R"==(if (m > 2) \ )==""\n"
R"==(COMPUTE_C(c[2], sc[X / 4 + 8].s##OFF + xa[2] + xb[0], 0); \ )==""\n"
R"==(if (m > 3) \ )==""\n"
R"==(COMPUTE_C(c[3], sc[X / 4 + 12].s##OFF + xa[3] + xb[0], 0); \ )==""\n"
R"==(xb++; \ )==""\n"
R"==(c += ldc; \ )==""\n"
R"==(co++; \ )==""\n"
R"==(} \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#else )==""\n"
R"==(#define ADD_EACH(X, OFF) \ )==""\n"
R"==(do { \ )==""\n"
R"==(if (n > X + OFF) { \ )==""\n"
R"==(if (m > 0) \ )==""\n"
R"==(COMPUTE_C(c[0], sc[X / 4 + 0].s##OFF + xa[0] + xb[0], 0); \ )==""\n"
R"==(if (m > 1) \ )==""\n"
R"==(COMPUTE_C(c[1], sc[X / 4 + 4].s##OFF + xa[1] + xb[0], 1); \ )==""\n"
R"==(if (m > 2) \ )==""\n"
R"==(COMPUTE_C(c[2], sc[X / 4 + 8].s##OFF + xa[2] + xb[0], 2); \ )==""\n"
R"==(if (m > 3) \ )==""\n"
R"==(COMPUTE_C(c[3], sc[X / 4 + 12].s##OFF + xa[3] + xb[0], 3); \ )==""\n"
R"==(xb++; \ )==""\n"
R"==(c += ldc; \ )==""\n"
R"==(} \ )==""\n"
R"==(} while (0) )==""\n"
R"==(#endif )==""\n"
R"==(#define ADD_SCALE(X) \ )==""\n"
R"==(do { \ )==""\n"
R"==(ADD_EACH(X, 0); \ )==""\n"
R"==(ADD_EACH(X, 1); \ )==""\n"
R"==(ADD_EACH(X, 2); \ )==""\n"
R"==(ADD_EACH(X, 3); \ )==""\n"
R"==(} while (0) )==""\n"
// 4-element dot product of an A vector with one / four B vectors.
R"==(#define ACCUMULATE_1(a, b) \ )==""\n"
R"==(((FLOATC)a.s0 * (FLOATC)b.s0) + ((FLOATC)a.s1 * (FLOATC)b.s1) \ )==""\n"
R"==(+ ((FLOATC)a.s2 * (FLOATC)b.s2) + ((FLOATC)a.s3 * (FLOATC)b.s3) )==""\n"
R"==(#define ACCUMULATE(a, b0, b1, b2, b3) \ )==""\n"
R"==((FLOATC4)(ACCUMULATE_1(a, b0), ACCUMULATE_1(a, b1), ACCUMULATE_1(a, b2), \ )==""\n"
R"==(ACCUMULATE_1(a, b3)) )==""\n"
R"==(#define GROUPSIZE_M (6 * UNROLL_M) )==""\n"
R"==(#define GROUPSIZE_N (4 * UNROLL_N) )==""\n"
// Kernel entry point. Buffer arguments are pointers: the body offsets and
// indexes a, b, c, co, sa and sb.
R"==(__attribute__((intel_reqd_sub_group_size(GRX))) kernel void )==""\n"
R"==(gen9_gemm_compute_x8x8s32(global FLOATA *a, global FLOATB *b, global FLOATC *c, )==""\n"
R"==(long offsetA, long offsetB, long offsetC, long lda, long ldb, long ldc, )==""\n"
R"==(long m, long n, long k, int beta, global int *ao, global int *bo, )==""\n"
R"==(global int *co, long offsetCO, int apply_co, local FLOATA *sa, )==""\n"
R"==(local FLOATB *sb, int apply_eltwise, float eltwise_alpha, )==""\n"
R"==(float eltwise_beta, float eltwise_scale) { )==""\n"
R"==(long kk = (k + UNROLL_K - 1) & ~(UNROLL_K - 1); )==""\n"
R"==(long i, j, l, ll; )==""\n"
R"==(global FLOATC *c_ori; )==""\n"
R"==(int lid = get_local_id(0); )==""\n"
R"==(int idx = get_local_id(1); )==""\n"
R"==(int idy = get_local_id(2); )==""\n"
R"==(long gdx = get_group_id(1); )==""\n"
R"==(long gdy = get_group_id(2); )==""\n"
R"==(long szx = get_local_size(1); )==""\n"
R"==(long szy = get_local_size(2); )==""\n"
R"==(a += offsetA; )==""\n"
R"==(b += offsetB; )==""\n"
R"==(c += offsetC + UNROLL_M * idx + GROUPSIZE_M * gdx + UNROLL_M * lid / GRX )==""\n"
R"==(+ (UNROLL_N * idy + GROUPSIZE_N * gdy) * ldc; )==""\n"
R"==(c_ori = c; )==""\n"
R"==(if (apply_co) { )==""\n"
R"==(co += offsetCO; )==""\n"
R"==(#ifdef RR )==""\n"
R"==(co += GROUPSIZE_M * gdx + UNROLL_M * idx + UNROLL_M * lid / GRX; )==""\n"
R"==(#endif )==""\n"
R"==(#ifdef CC )==""\n"
R"==(co += GROUPSIZE_N * gdy + UNROLL_N * idy; )==""\n"
R"==(#endif )==""\n"
R"==(} )==""\n"
// The front of each local buffer holds the FLOATC correction terms (xa / xb);
// the 8-bit tile data starts right after them.
R"==(__local FLOATC *xa = (__local FLOATC *)sa; )==""\n"
R"==(sa += UNROLL_M * szx * sizeof(FLOATC); )==""\n"
R"==(__local FLOATC *xb = (__local FLOATC *)sb; )==""\n"
R"==(sb += UNROLL_N * szy * sizeof(FLOATC); )==""\n"
R"==(int cid0 = (idy * szx + idx) * get_local_size(0) + lid; )==""\n"
R"==(int ctotal = get_local_size(0) * szx * szy; )==""\n"
// Stage A into local memory and build xa.
R"==(for (int cid = cid0; cid < szx * UNROLL_M; cid += ctotal) { )==""\n"
R"==(long sa_moffset = (cid & ~(UNROLL_M - 1)) * kk )==""\n"
R"==(+ (cid & (UNROLL_M - 1)) * UNROLL_K; )==""\n"
R"==(long i = cid + GROUPSIZE_M * gdx; )==""\n"
R"==(FLOATC sumA = 0; )==""\n"
R"==(#if defined(NN) || defined(NT) )==""\n"
R"==(long a_offset = i; )==""\n"
R"==(#else )==""\n"
R"==(long a_offset = i * lda; )==""\n"
R"==(#endif )==""\n"
R"==(for (l = 0; l < kk; l += UNROLL_K) { )==""\n"
R"==(for (ll = 0; ll < UNROLL_K; ll++) { )==""\n"
R"==(FLOATA a_val = (((i < m) && (l + ll < k)) ? a[a_offset] : 0); )==""\n"
R"==(sa[sa_moffset + l * UNROLL_M + ll] = a_val; )==""\n"
R"==(sumA -= a_val; )==""\n"
R"==(#if defined(NN) || defined(NT) )==""\n"
R"==(a_offset += lda; )==""\n"
R"==(#else )==""\n"
R"==(a_offset++; )==""\n"
R"==(#endif )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(xa[cid] = (FLOATC)ATTR_B0 * sumA; )==""\n"
R"==(} )==""\n"
// Stage B into local memory and build xb.
R"==(for (int cid = cid0; cid < szy * UNROLL_N; cid += ctotal) { )==""\n"
R"==(long sb_noffset = (cid & ~(UNROLL_N - 1)) * kk )==""\n"
R"==(+ (cid & (UNROLL_N - 1)) * UNROLL_K; )==""\n"
R"==(long j = cid + GROUPSIZE_N * gdy; )==""\n"
R"==(FLOATC sumB = (FLOATC)ATTR_B0 * k; )==""\n"
R"==(#if defined(NN) || defined(TN) )==""\n"
R"==(long b_offset = j * ldb; )==""\n"
R"==(#else )==""\n"
R"==(long b_offset = j; )==""\n"
R"==(#endif )==""\n"
R"==(for (l = 0; l < kk; l += UNROLL_K) { )==""\n"
R"==(for (ll = 0; ll < UNROLL_K; ll++) { )==""\n"
R"==(FLOATB b_val = (((j < n) && (l + ll < k)) ? b[b_offset] : 0); )==""\n"
R"==(sb[sb_noffset + l * UNROLL_N + ll] = b_val; )==""\n"
R"==(sumB -= b_val; )==""\n"
R"==(#if defined(NN) || defined(TN) )==""\n"
R"==(b_offset++; )==""\n"
R"==(#else )==""\n"
R"==(b_offset += ldb; )==""\n"
R"==(#endif )==""\n"
R"==(} )==""\n"
R"==(} )==""\n"
R"==(xb[cid] = (FLOATC)ATTR_A0 * sumB; )==""\n"
R"==(} )==""\n"
R"==(barrier(CLK_LOCAL_MEM_FENCE); )==""\n"
// Clamp m / n to this work-item's tile and leave early when it is empty.
R"==(m -= GROUPSIZE_M * gdx + UNROLL_M * idx; )==""\n"
R"==(if (m > UNROLL_M) m = UNROLL_M; )==""\n"
R"==(n -= GROUPSIZE_N * gdy + UNROLL_N * idy; )==""\n"
R"==(if (n > UNROLL_N) n = UNROLL_N; )==""\n"
R"==(if ((m <= 0) || (n <= 0)) return; )==""\n"
R"==(m -= UNROLL_M * lid / GRX; )==""\n"
R"==(sa += UNROLL_M * kk * idx + UNROLL_M * UNROLL_K * lid / GRX; )==""\n"
R"==(sb += UNROLL_N * kk * idy + UNROLL_K * lid; )==""\n"
R"==(xa += UNROLL_M * idx + UNROLL_M * lid / GRX; )==""\n"
R"==(xb += UNROLL_N * idy; )==""\n"
// Main accumulation loop over k in steps of UNROLL_K.
R"==(FLOATC4 sc[UNROLL_M * UNROLL_N / GRX / 4] = {0}; )==""\n"
R"==(for (l = 0; l < kk; l += UNROLL_K) { )==""\n"
R"==(FLOATA4 a0, a1, a2, a3; )==""\n"
R"==(FLOATB4 bb, b0, b1, b2, b3; )==""\n"
R"==(a0 = ((__local FLOATA4 *)sa)[0]; )==""\n"
R"==(a1 = ((__local FLOATA4 *)sa)[1]; )==""\n"
R"==(a2 = ((__local FLOATA4 *)sa)[2]; )==""\n"
R"==(a3 = ((__local FLOATA4 *)sa)[3]; )==""\n"
R"==(for (ll = 0; ll < GRX / 4; ll++) { )==""\n"
R"==(bb = ((__local FLOATB4 *)sb)[0]; )==""\n"
R"==(b0 = SHUFFLE(bb, 0); )==""\n"
R"==(b1 = SHUFFLE(bb, 1); )==""\n"
R"==(b2 = SHUFFLE(bb, 2); )==""\n"
R"==(b3 = SHUFFLE(bb, 3); )==""\n"
R"==(sc[ll * 2 + 0] += ACCUMULATE(a0, b0, b1, b2, b3); )==""\n"
R"==(sc[ll * 2 + 4] += ACCUMULATE(a1, b0, b1, b2, b3); )==""\n"
R"==(sc[ll * 2 + 8] += ACCUMULATE(a2, b0, b1, b2, b3); )==""\n"
R"==(sc[ll * 2 + 12] += ACCUMULATE(a3, b0, b1, b2, b3); )==""\n"
R"==(b0 = SHUFFLE(bb, 4); )==""\n"
R"==(b1 = SHUFFLE(bb, 5); )==""\n"
R"==(b2 = SHUFFLE(bb, 6); )==""\n"
R"==(b3 = SHUFFLE(bb, 7); )==""\n"
R"==(sc[ll * 2 + 1] += ACCUMULATE(a0, b0, b1, b2, b3); )==""\n"
R"==(sc[ll * 2 + 5] += ACCUMULATE(a1, b0, b1, b2, b3); )==""\n"
R"==(sc[ll * 2 + 9] += ACCUMULATE(a2, b0, b1, b2, b3); )==""\n"
R"==(sc[ll * 2 + 13] += ACCUMULATE(a3, b0, b1, b2, b3); )==""\n"
R"==(sb += UNROLL_N * GRX / 4; )==""\n"
R"==(} )==""\n"
R"==(sa += UNROLL_M * UNROLL_K; )==""\n"
R"==(} )==""\n"
// Write the accumulated tile back to C, four columns per ADD_SCALE call.
R"==(ADD_SCALE(0); )==""\n"
R"==(ADD_SCALE(4); )==""\n"
R"==(ADD_SCALE(8); )==""\n"
R"==(ADD_SCALE(12); )==""\n"
R"==(} )==""\n"
R"==()==";
} // namespace ocl
} // namespace gpu
} // namespace impl
} // namespace dnnl

Browse the source code of oneDNN/build/src/gpu/ocl/gen9_gemm_nocopy_x8x8s32_kernel.cpp