1namespace dnnl {
2namespace impl {
3namespace gpu {
4namespace ocl {
5const char *gen9_gemm_nocopy_scale_x8x8s32_kernel = R"==(/******************************************************************************* )==""\n"
6R"==(* Copyright 2019-2022 Intel Corporation )==""\n"
7R"==(* )==""\n"
8R"==(* Licensed under the Apache License, Version 2.0 (the "License"); )==""\n"
9R"==(* you may not use this file except in compliance with the License. )==""\n"
10R"==(* You may obtain a copy of the License at )==""\n"
11R"==(* )==""\n"
12R"==(* http: )==""\n"
13R"==(* )==""\n"
14R"==(* Unless required by applicable law or agreed to in writing, software )==""\n"
15R"==(* distributed under the License is distributed on an "AS IS" BASIS, )==""\n"
16R"==(* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. )==""\n"
17R"==(* See the License for the specific language governing permissions and )==""\n"
18R"==(* limitations under the License. )==""\n"
19R"==(*******************************************************************************/ )==""\n"
20R"==(#include "gpu/ocl/gemm/ocl_gemm_attrs.h" )==""\n"
21R"==(#include "gpu/ocl/ocl_post_ops.h" )==""\n"
22R"==(#include "gpu/ocl/ocl_types.h" )==""\n"
23R"==(#if WITH_ELTWISE == 1 )==""\n"
24R"==(#define POST_OP(val) \ )==""\n"
25R"==(do { \ )==""\n"
26R"==(if (apply_eltwise) \ )==""\n"
27R"==(val = fwd_eltwise( \ )==""\n"
28R"==(val, eltwise_alpha, eltwise_beta, eltwise_scale); \ )==""\n"
29R"==(} while (0) )==""\n"
30R"==(#else )==""\n"
31R"==(#define POST_OP(val) )==""\n"
32R"==(#endif )==""\n"
33R"==(kernel void gen9_gemm_scale_x8x8s32(global int *cc, global int *c, char trc, )==""\n"
34R"==(long offset_c, long m, long n, long ldc, global float *alpha, )==""\n"
35R"==(float beta, global int *co, long offset_co, int alpha_is_zero, )==""\n"
36R"==(int apply_eltwise, float eltwise_alpha, float eltwise_beta, )==""\n"
37R"==(float eltwise_scale) { )==""\n"
38R"==(int idx = get_group_id(0); )==""\n"
39R"==(int idy = get_group_id(1); )==""\n"
40R"==(int lid = get_local_id(0); )==""\n"
41R"==(long j; )==""\n"
42R"==(long offset_cc = 0; )==""\n"
43R"==(long offset_x = 0; )==""\n"
44R"==(long ldcc = m; )==""\n"
45R"==(m -= 32 * idx; )==""\n"
46R"==(if (m > 32) m = 32; )==""\n"
47R"==(n -= 16 * idy; )==""\n"
48R"==(if (n > 16) n = 16; )==""\n"
49R"==(m -= 32 * lid / 16; )==""\n"
50R"==(if ((m <= 0) || (n <= 0)) return; )==""\n"
51R"==(offset_cc = 32 * idx + 32 * lid / 16 + 16 * idy * ldcc; )==""\n"
52R"==(offset_c += 32 * idx + 32 * lid / 16 + 16 * idy * ldc; )==""\n"
53R"==(if (trc == 'C') offset_co += 32 * idx + 32 * lid / 16; )==""\n"
54R"==(if (trc == 'R') offset_co += 16 * idy; )==""\n"
55R"==(for (j = 0; j < n; j++) { )==""\n"
56R"==(if (m > 0) { )==""\n"
57R"==(float val = (alpha_is_zero ? 0 : ATTR_ALPHA) * cc[offset_cc + 0] )==""\n"
58R"==(+ beta * c[offset_c + 0]; )==""\n"
59R"==(POST_OP(val); )==""\n"
60R"==(c[offset_c] = convert_int_sat_rte( )==""\n"
61R"==(val + (co ? co[offset_co + offset_x] : 0)); )==""\n"
62R"==(if (trc == 'C') { offset_x++; } )==""\n"
63R"==(} )==""\n"
64R"==(if (m > 1) { )==""\n"
65R"==(float val = (alpha_is_zero ? 0 : ATTR_ALPHA) * cc[offset_cc + 1] )==""\n"
66R"==(+ beta * c[offset_c + 1]; )==""\n"
67R"==(POST_OP(val); )==""\n"
68R"==(c[offset_c + 1] = convert_int_sat_rte( )==""\n"
69R"==(val + (co ? co[offset_co + offset_x] : 0)); )==""\n"
70R"==(} )==""\n"
71R"==(offset_cc += ldcc; )==""\n"
72R"==(offset_c += ldc; )==""\n"
73R"==(if (trc == 'C') offset_x = 0; )==""\n"
74R"==(if (trc == 'R') offset_x++; )==""\n"
75R"==(} )==""\n"
76R"==(} )==""\n"
77R"==()==";
78}
79}
80}
81}